In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("data/cardekho_dataset.csv", index_col=[0])

In [4]:
df.head(2)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000


In [5]:
df.shape

(15411, 13)

In [6]:
df.dtypes

car_name              object
brand                 object
model                 object
vehicle_age            int64
km_driven              int64
seller_type           object
fuel_type             object
transmission_type     object
mileage              float64
engine                 int64
max_power            float64
seats                  int64
selling_price          int64
dtype: object

In [7]:
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [8]:
## Remove Unnecessary Columns
df.drop('model', axis=1, inplace=True)
df.drop('brand', axis=1, inplace=True)

In [9]:
df.head(2)

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000


In [10]:
df['car_name'].unique()

array(['Maruti Alto', 'Hyundai Grand', 'Hyundai i20', 'Ford Ecosport',
       'Maruti Wagon R', 'Hyundai i10', 'Hyundai Venue', 'Maruti Swift',
       'Hyundai Verna', 'Renault Duster', 'Mini Cooper', 'Maruti Ciaz',
       'Mercedes-Benz C-Class', 'Toyota Innova', 'Maruti Baleno',
       'Maruti Swift Dzire', 'Volkswagen Vento', 'Hyundai Creta',
       'Honda City', 'Mahindra Bolero', 'Toyota Fortuner', 'Renault KWID',
       'Honda Amaze', 'Hyundai Santro', 'Mahindra XUV500',
       'Mahindra KUV100', 'Maruti Ignis', 'Datsun RediGO',
       'Mahindra Scorpio', 'Mahindra Marazzo', 'Ford Aspire', 'Ford Figo',
       'Maruti Vitara', 'Tata Tiago', 'Volkswagen Polo', 'Kia Seltos',
       'Maruti Celerio', 'Datsun GO', 'BMW 5', 'Honda CR-V',
       'Ford Endeavour', 'Mahindra KUV', 'Honda Jazz', 'BMW 3', 'Audi A4',
       'Tata Tigor', 'Maruti Ertiga', 'Tata Safari', 'Mahindra Thar',
       'Tata Hexa', 'Land Rover Rover', 'Maruti Eeco', 'Audi A6',
       'Mercedes-Benz E-Class', 'Audi Q7'

In [11]:
len(df['car_name'].unique())

121

In [12]:
df['car_name'].value_counts()

car_name
Hyundai i20              906
Maruti Swift Dzire       890
Maruti Swift             781
Maruti Alto              778
Honda City               757
                        ... 
Mercedes-AMG C             1
Rolls-Royce Ghost          1
Maserati Quattroporte      1
Isuzu MUX                  1
Force Gurkha               1
Name: count, Length: 121, dtype: int64

In [13]:
# Create a copy of the series
car_counts = df['car_name'].value_counts()

In [17]:
pd.set_option('display.max_rows', None)
print(car_counts)

car_name
Hyundai i20               906
Maruti Swift Dzire        890
Maruti Swift              781
Maruti Alto               778
Honda City                757
Maruti Wagon R            717
Hyundai Grand             580
Toyota Innova             545
Hyundai Verna             492
Hyundai i10               410
Ford Ecosport             374
Volkswagen Polo           373
Maruti Baleno             364
Honda Amaze               362
Maruti Ciaz               346
Maruti Ertiga             343
Hyundai Creta             336
Mahindra XUV500           330
Renault KWID              306
Maruti Vitara             295
Mahindra Scorpio          273
Ford Figo                 271
Volkswagen Vento          247
Maruti Celerio            237
Renault Duster            218
Mahindra Bolero           211
Toyota Fortuner           187
Skoda Rapid               182
Honda Jazz                175
BMW 3                     152
Tata Tiago                145
Hyundai Santro            139
Mercedes-Benz E-Class     125
M

In [18]:
df['car_name'] = np.where(df['car_name'].map(car_counts) < 10, 'Others', df['car_name'])
print(df['car_name'].value_counts())

car_name
Hyundai i20               906
Maruti Swift Dzire        890
Maruti Swift              781
Maruti Alto               778
Honda City                757
Maruti Wagon R            717
Hyundai Grand             580
Toyota Innova             545
Hyundai Verna             492
Hyundai i10               410
Ford Ecosport             374
Volkswagen Polo           373
Maruti Baleno             364
Honda Amaze               362
Maruti Ciaz               346
Maruti Ertiga             343
Hyundai Creta             336
Mahindra XUV500           330
Renault KWID              306
Maruti Vitara             295
Mahindra Scorpio          273
Ford Figo                 271
Volkswagen Vento          247
Maruti Celerio            237
Renault Duster            218
Mahindra Bolero           211
Toyota Fortuner           187
Skoda Rapid               182
Honda Jazz                175
BMW 3                     152
Tata Tiago                145
Hyundai Santro            139
Others                    134
M

In [22]:
from sklearn.model_selection import train_test_split
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

In [23]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((12328, 10), (3083, 10))

In [24]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X_train['car_name']=le.fit_transform(X_train['car_name'])

In [25]:
X_test['car_name']=le.transform(X_test['car_name']) 

In [26]:
len(df['seller_type'].unique()),len(df['fuel_type'].unique()),len(df['transmission_type'].unique())

(3, 5, 2)

In [27]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type','fuel_type','transmission_type']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
        
    ],remainder='passthrough'
    
)

In [28]:
## applying Trnsformation in training(fit_transform)
X_train=preprocessor.fit_transform(X_train)

In [29]:
## apply tansformation on test(transform)
X_test=preprocessor.transform(X_test)

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [31]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [32]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 553632.7410
- Mean Absolute Error: 266075.0012
- R2 Score: 0.6221
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 501531.1900
- Mean Absolute Error: 276793.0690
- R2 Score: 0.6659


Lasso
Model performance for Training set
- Root Mean Squared Error: 553632.7454
- Mean Absolute Error: 266072.9124
- R2 Score: 0.6221
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 501530.3830
- Mean Absolute Error: 276789.4489
- R2 Score: 0.6659


Ridge
Model performance for Training set
- Root Mean Squared Error: 553633.4922
- Mean Absolute Error: 266036.4605
- R2 Score: 0.6221
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 501522.5896
- Mean Absolute Error: 276732.2987
- R2 Score: 0.6659


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 323779.7906
- Mean 

In [33]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

In [34]:
# Models list for Hyperparameter tuning
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ("RF", RandomForestRegressor(), rf_params)
                   
                   ]

In [36]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = [random.best_params_, random.best_score_]

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])



Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


69 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nage\develop\random forest\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nage\develop\random forest\env\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\nage\develop\random forest\env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\nage\develop\random forest\env\Lib\site-packages\sklearn\utils\_param_validation.py", line 95,

---------------- Best Params for KNN -------------------
[{'n_neighbors': 3}, np.float64(0.7932659368272382)]
---------------- Best Params for RF -------------------
[{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 8, 'max_depth': 10}, np.float64(0.8625184074715472)]


In [39]:
model = RandomForestRegressor(n_estimators= 100, min_samples_split= 2, max_features= 8, max_depth= 10)

model.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

print(list(models.keys())[i])

print('Model performance for Training set')
print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
print("- R2 Score: {:.4f}".format(model_train_r2))

print('----------------------------------')

print('Model performance for Test set')
print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
print("- R2 Score: {:.4f}".format(model_test_r2))

print('='*35)
print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 167505.6539
- Mean Absolute Error: 84528.4216
- R2 Score: 0.9654
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 221413.1393
- Mean Absolute Error: 106554.4595
- R2 Score: 0.9349




In [41]:
import pickle

In [42]:
# Save the ColumnTransformer to a pickle file
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

In [43]:
# Save the ColumnTransformer to a pickle file
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(le, file)

In [44]:
# Save the ColumnTransformer to a pickle file
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)