In [1]:
try:
    import numpy as np
    print('NumPy already installed, only imported')
except:
    !pip install numpy
    import numpy as np
    print('NumPy was not installed, installed and imported')
    
try:
    import pandas as pd
    print('pandas already installed, only imported')
except:
    !pip install pandas
    import pandas as pd
    print('pandas was not installed, installed and imported')

try:
    import sklearn
    print('sklearn already installed, only imported')
except:
    !pip install scikit-learn
    import sklearn
    print('sklearn was not installed, installed and imported')

try: 
    import time    
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import mutual_info_regression
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_absolute_error
    import random
    print('All of the remaning libraries have been imported')
except: 
    print("Not all libraries have been imported correctly, please check again")

NumPy already installed, only imported
pandas already installed, only imported
sklearn already installed, only imported
All of the remaning libraries have been imported


In [2]:
def execute_pipeline(cars_DF, new_data=None):
    
    if new_data is not None:
        cars_DF = pd.concat([cars_DF, new_data], ignore_index=True)
        
    #remove unnecessary columns
    cars_DF.drop(columns = ['Unnamed: 0', 'New_Price'],inplace = True)
    
    #check for duplicates and NaN
    print(f"There are {cars_DF.duplicated().sum()} duplicate values in the dataset.")
    print(f"There are {cars_DF.isnull().sum().sum()} null values in the dataset.")
    
    #remove duplicates
    cars_DF = cars_DF.dropna()
    
    # conver object/string types into floats by extracting the numerical values
    cars_DF.loc[:, 'Mileage'] = cars_DF['Mileage'].str.split(expand=True)[0].astype(float)
    cars_DF.loc[:, 'Engine'] = cars_DF['Engine'].str.split(expand=True)[0].astype(float)
    cars_DF.loc[:, 'Power'] = cars_DF['Power'].replace('null bhp', None)
    cars_DF.loc[:, 'Power'] = cars_DF['Power'].str.split(expand=True)[0].astype(float)
    
    # for every numerical column, remove possible outliers
    num_cols = ['Engine','Power','Kilometers_Driven','Mileage','Price']
    
    for col in num_cols:
        Q1 = cars_DF[col].quantile(0.25)
        Q3 = cars_DF[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 2.5*IQR
        upper_bound = Q3 + 2.5*IQR
        cars_DF = cars_DF[(cars_DF[col] >= lower_bound) & (cars_DF[col] <= upper_bound)]
    
    #create new feature based on the Name column by extract the name of the Company
    cars_DF['Company'] = cars_DF['Name'].str.split(expand=True)[0].str.lower()
    cars_DF['Model'] = cars_DF['Name'].str.split().str[0:2].str.join(' ').str.lower()
    cars_DF.drop(columns = ['Name'],inplace = True)
    
    # perform feature extraction by finding out what the top 3 features are
    cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Company', 'Model']
    encoded_DF = pd.get_dummies(cars_DF, columns=cat_cols)
    
    print("\nTop 3 Features using mutual information scores")
    X, y = encoded_DF.drop('Price', axis=1), cars_DF['Price']
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    print(mi_scores.head(3))
    
    print("\nTop 3 Features using feature importance")
    X, y = encoded_DF.drop('Price', axis=1), cars_DF['Price']
    model = RandomForestRegressor()
    model.fit(X, y)
    importances = model.feature_importances_
    sorted_indices = np.argsort(importances)[::-1]
    sorted_feature_names = X.columns[sorted_indices]
    print(sorted_feature_names[:3])
    
    print("\nTop 3 Features using correlation matrix")
    correlation_matrix = encoded_DF.corr(numeric_only=True)
    correlation_values = correlation_matrix['Price'].abs()
    sorted_correlation_values = correlation_values.sort_values(ascending=False)
    print(sorted_correlation_values.index[1:4])
    
    #create a numeric transformer to pre-process numerical features using an imputer and a scaler
    numeric_features = ['Power', 'Engine', 'Year', 'Kilometers_Driven', 'Mileage', 'Seats']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    #create a categorical transformer to pre-process categorical features using an imputer and an encoder
    categorical_features = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Company', 'Model']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))])
    
    # add the transformer to the pre-processor variable
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])

    # create a pipeline and append the Support Vector Regressor classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestRegressor())])
    
    X = cars_DF.drop('Price', axis=1)
    y = cars_DF['Price']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
    
    pipeline.fit(X_train, y_train)
    
    #evaluate the model's performance
    print("\nEvalution of the score/accuracy:")
    score = pipeline.score(X_test, y_test)
    print(f'The R-Squared score is: {score}')
    print("Accuracy: %.2f%%" % (score * 100))
    
    y_pred = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error: ", mse)

    mae = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error:", mae)

In [167]:
start_time = time.time()

cars_DF = pd.read_csv('cars.csv')

execute_pipeline(cars_DF)

end_time = time.time()
execution_time = end_time - start_time
print("\nExecution time of the pipeline is:", execution_time, "seconds")

There are 0 duplicate values in the dataset.
There are 116 null values in the dataset.

Top 3 Features using mutual information scores
Power      0.666164
Engine     0.508778
Mileage    0.382315
dtype: float64

Top 3 Features using feature importance
Index(['Power', 'Year', 'Engine'], dtype='object')

Top 3 Features using correlation matrix
Index(['Transmission_Automatic', 'Transmission_Manual', 'Fuel_Type_Diesel'], dtype='object')

Evalution of the score/accuracy:
The R-Squared score is: 0.914806740595482
Accuracy: 91.48%
Mean Squared Error:  2.2060390921422806
Mean Absolute Error: 0.9096092146679257

Execution time of the pipeline is: 25.099812030792236 seconds


In [3]:
#create new data to fit into the pipeline

def generate_car_data(num_entries, year_range, mileage_range, engine_range, power_range, price_range):
    data = []
    for i in range(num_entries):
        entry = {
            "Unnamed: 0": 6019 + i,
            "Name": f"Car {6019 + i}",
            "Location": random.choice(["India", "United States", "Germany", "France", "Italy", "Spain", "Australia", "Canada", "Netherlands", "Poland"]),
            "Year": random.randint(year_range[0], year_range[1]),
            "Kilometers_Driven": random.randint(1000, 100000),
            "Fuel_Type": random.choice(["Petrol", "Diesel", "CNG", "LPG"]),
            "Transmission": random.choice(["Manual", "Automatic"]),
            "Owner_Type": random.choice(["First", "Second", "Third", "Fourth & Above"]),
            "Mileage": f"{random.uniform(mileage_range[0], mileage_range[1]):.1f} kmpl",
            "Engine": f"{random.randint(engine_range[0], engine_range[1])} CC",
            "Power": f"{random.randint(power_range[0], power_range[1])} bhp",
            "Seats": random.choice([4.0, 5.0, 6.0, 7.0]),
            "New_Price": np.nan,
            "Price": round(random.uniform(price_range[0], price_range[1]), 2)
        }
        data.append(entry)
    
    return pd.DataFrame(data)

new_data = generate_car_data(num_entries=100, year_range=[2010, 2022], mileage_range=[10, 25], engine_range=[1000, 3000], power_range=[50, 300], price_range=[2.0, 7.0])

In [169]:
new_data

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,6019,Car 6019,Netherlands,2011,11256,CNG,Automatic,Second,13.8 kmpl,1407 CC,154 bhp,5.0,,6.29
1,6020,Car 6020,Poland,2010,43491,LPG,Automatic,Second,19.1 kmpl,1579 CC,246 bhp,6.0,,5.93
2,6021,Car 6021,Spain,2021,7867,LPG,Automatic,Fourth & Above,14.4 kmpl,1100 CC,291 bhp,7.0,,3.14
3,6022,Car 6022,Spain,2019,18663,Petrol,Automatic,Fourth & Above,10.7 kmpl,2569 CC,93 bhp,7.0,,2.66
4,6023,Car 6023,Spain,2017,67470,Petrol,Manual,Second,23.3 kmpl,2266 CC,276 bhp,4.0,,4.04
5,6024,Car 6024,France,2011,18281,CNG,Manual,Third,14.5 kmpl,2549 CC,133 bhp,5.0,,6.12
6,6025,Car 6025,Germany,2022,93058,LPG,Manual,Second,23.2 kmpl,2959 CC,262 bhp,5.0,,5.74
7,6026,Car 6026,Poland,2019,11689,Diesel,Manual,Second,12.0 kmpl,1136 CC,212 bhp,7.0,,4.8
8,6027,Car 6027,India,2020,20042,LPG,Automatic,First,13.8 kmpl,2709 CC,292 bhp,6.0,,4.32
9,6028,Car 6028,Canada,2022,48612,Petrol,Manual,Third,23.7 kmpl,1360 CC,194 bhp,4.0,,4.41


In [4]:
start_time = time.time()

cars_DF = pd.read_csv('cars.csv')

#execute the pipeness with the addition of new data
execute_pipeline(cars_DF, new_data)

end_time = time.time()
execution_time = end_time - start_time
print("\nExecution time of the pipeline is:", execution_time, "seconds")

#set new data variable back to none so that it does not get re-used when method is executed
new_data = None 

There are 0 duplicate values in the dataset.
There are 116 null values in the dataset.

Top 3 Features using mutual information scores
Power      0.628486
Engine     0.479738
Mileage    0.367610
dtype: float64

Top 3 Features using feature importance
Index(['Power', 'Year', 'Company_car'], dtype='object')

Top 3 Features using correlation matrix
Index(['Transmission_Automatic', 'Transmission_Manual', 'Fuel_Type_Diesel'], dtype='object')

Evalution of the score/accuracy:
The R-Squared score is: 0.9225577069240186
Accuracy: 92.26%
Mean Squared Error:  1.9850930147305488
Mean Absolute Error: 0.8906798167817311

Execution time of the pipeline is: 33.243406534194946 seconds
