In [79]:
# Data manupulation
import numpy as np
import pandas as pd

#importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# train test split
from sklearn.model_selection import train_test_split

# models 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


#performance metrics
from sklearn.metrics import r2_score,mean_squared_error

# Cross validation
from sklearn.model_selection import cross_val_score

#hyper parammeter tuening libs
from  sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# feature selection 
from sklearn.feature_selection import RFE

In [80]:
df=pd.read_excel("train.xlsx")

In [81]:
# 4% of data
df.shape[0],df.shape[0]*0.04

(6019, 240.76)

In [82]:
df.head(5)

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [83]:
df.dtypes

Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
Price                float64
dtype: object

In [84]:
print("4% of data is :",df.shape[0]*0.04)
df.isnull().sum()

4% of data is : 240.76


Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [85]:
df.dropna(inplace=True)

In [86]:
df.isnull().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [87]:
df.shape

(5975, 11)

In [88]:
df.dtypes

Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
Price                float64
dtype: object

# Handling object type feature 

## Encoding feature  based on frequency

In [89]:
def encoding(df,data):
    a=df[data].value_counts().to_dict()
    fq = df.groupby(data).size()/len(df) 
    df[data] = df[data].map(fq) 
    b=df[data].value_counts().to_dict()
    encode=dict(zip(a,b))    
    return encode

def cat_flot(df,data):
    df[data]=df[data].astype(str).str.extract('(\d+)').astype(float)
    return df

In [90]:
encoding(df,'Location')

{'Mumbai': 0.13121338912133892,
 'Hyderabad': 0.12401673640167364,
 'Kochi': 0.10845188284518828,
 'Coimbatore': 0.10610878661087866,
 'Pune': 0.10259414225941423,
 'Delhi': 0.09188284518828452,
 'Kolkata': 0.0887029288702929,
 'Chennai': 0.08200836820083682,
 'Jaipur': 0.06861924686192468,
 'Bangalore': 0.05907949790794979,
 'Ahmedabad': 0.03732217573221757}

In [91]:
encoding(df,'Fuel_Type')

{'Diesel': 0.5347280334728034,
 'Petrol': 0.45422594142259415,
 'CNG': 0.009372384937238494,
 'LPG': 0.0016736401673640166}

In [92]:
encoding(df,'Transmission')

{'Manual': 0.7139748953974896, 'Automatic': 0.28602510460251046}

In [93]:
encoding(df,'Owner_Type')

{'First': 0.8205857740585774,
 'Second': 0.1594979079497908,
 'Third': 0.018577405857740585,
 'Fourth & Above': 0.0013389121338912135}

In [94]:
cat_flot(df,'Mileage')
cat_flot(df,'Engine')
cat_flot(df,'Power')
df.dropna(inplace=True,axis=0)

In [95]:
df.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [96]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0.131213,2010,72000,0.009372,0.713975,0.820586,26.0,998.0,58.0,5.0,1.75
1,0.102594,2015,41000,0.534728,0.713975,0.820586,19.0,1582.0,126.0,5.0,12.5
2,0.082008,2011,46000,0.454226,0.713975,0.820586,18.0,1199.0,88.0,5.0,4.5
3,0.082008,2012,87000,0.534728,0.713975,0.820586,20.0,1248.0,88.0,7.0,6.0
4,0.106109,2013,40670,0.534728,0.286025,0.159498,15.0,1968.0,140.0,5.0,17.74


In [97]:
print(df.shape)

(5872, 11)


In [98]:
x=df.drop(['Price'],axis=1)
y=df['Price']

In [99]:
x.shape , y.shape

((5872, 10), (5872,))

In [100]:
dt=DecisionTreeRegressor()
rf=RandomForestRegressor()
adb=AdaBoostRegressor()
gb=GradientBoostingRegressor()
xgb=XGBRegressor()

In [101]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=0)

In [102]:
def built_model(model,model_name,X=xtrain,Y=ytrain):
    model.fit(xtrain,ytrain)
    ypred=model.predict(xtest)
    print(model_name)
    print("Model Accuracy : ",r2_score(ytest,ypred))
    print("RMSE :",np.sqrt(mean_squared_error(ytest,ypred)),'\n')

In [103]:
built_model(rf,"Random Forest")
built_model(gb,"GBoost")
built_model(xgb,"XGBoost")

Random Forest
Model Accuracy :  0.8726099114951902
RMSE : 4.482271780565534 

GBoost
Model Accuracy :  0.8749627760802214
RMSE : 4.440685618043572 

XGBoost
Model Accuracy :  0.8805360087558061
RMSE : 4.340591111167341 



# feature selection 

In [104]:
# model
def ranf(xtrain,xtest,ytrain,ytest):
    model=XGBRegressor(n_estimators=100)
    model.fit(xtrain,ytrain)
    pred=model.predict(xtest)
    result=r2_score(pred,ytest)
    print(result)

In [105]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=10)

In [106]:
for index in range(1,11):
    sel=RFE(XGBRegressor(),n_features_to_select=index)
    sel.fit(xtrain,ytrain)
    xtrain_rfe=sel.transform(xtrain)
    xtest_rfe=sel.transform(xtest)
    print("no of features selected ",index)
    ranf(xtrain_rfe,xtest_rfe,ytrain,ytest)
    print(xtrain.columns[sel.get_support()])
    print()

no of features selected  1
0.7051505742031287
Index(['Power'], dtype='object')

no of features selected  2
0.8610943246428402
Index(['Year', 'Power'], dtype='object')

no of features selected  3
0.8813185681465885
Index(['Year', 'Transmission', 'Power'], dtype='object')

no of features selected  4
0.9009547353708202
Index(['Year', 'Transmission', 'Engine', 'Power'], dtype='object')

no of features selected  5
0.8858752672108708
Index(['Year', 'Transmission', 'Engine', 'Power', 'Seats'], dtype='object')

no of features selected  6
0.8896998410894794
Index(['Year', 'Fuel_Type', 'Transmission', 'Engine', 'Power', 'Seats'], dtype='object')

no of features selected  7
0.8884404523388278
Index(['Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Engine',
       'Power', 'Seats'],
      dtype='object')

no of features selected  8
0.9053165558676679
Index(['Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Mileage',
       'Engine', 'Power', 'Seats'],
      dtype='object')

no o

In [107]:
x.drop(['Owner_Type'],axis=1,inplace=True)

# cross validation 

In [108]:
def k_fold(x,y,folds=20):
    rd=RandomForestRegressor()
    gb=GradientBoostingRegressor()
    xgb=XGBRegressor()
    score_rd=cross_val_score(rd,x,y,cv=folds)
    score_gb=cross_val_score(gb,x,y,cv=folds)
    score_xgb=cross_val_score(xgb,x,y,cv=folds)
    models=[rd,gb,xgb]
    model_names=['RandomForestRegressor','Gboost','XGboost']
    scores=[score_rd,score_gb,score_xgb]
    result=[]
    for x in range (0,len(models)):
        sc_mean=np.mean(scores[x])
        sc_std=np.std(scores[x])
        m_name=model_names[x]
        min=np.min(scores[x])
        max=np.max(scores[x])
        var=[m_name,sc_mean,sc_std,min,max]
        result.append(var)
    result_df=pd.DataFrame(result,columns=['ModelName','Averege_Accuracy','Standard_Deviation',"Min","Max"])
    return result_df

In [109]:
k_fold(x,y)

Unnamed: 0,ModelName,Averege_Accuracy,Standard_Deviation,Min,Max
0,RandomForestRegressor,0.888975,0.057715,0.72623,0.95518
1,Gboost,0.880653,0.054086,0.716247,0.941555
2,XGboost,0.894908,0.06504,0.731999,0.952124


In [110]:
def hpt(grid_or_random,estimator,param,X=x,Y=y):
    if grid_or_random=='grid':
        perform_obj=GridSearchCV(estimator,param,scoring='r2',n_jobs=-1)
    elif grid_or_random=='random':
        perform_obj=RandomizedSearchCV(estimator,param,scoring='r2',n_jobs=-1)
    
    obj_predict=perform_obj.fit(x,y)
    result=obj_predict.best_estimator_
    return result

In [111]:
#XGBoost 
import xgboost
xgb=XGBRegressor()
xgb_param={'n_estimators':[250,350,450,550]}
xgb_grid=hpt('grid',xgb,xgb_param)
score_ada_grid=cross_val_score(xgb_grid,x,y,cv=10)
print("XGB GridSearchCV : ",score_ada_grid.mean())

XGB GridSearchCV :  0.8893343761954654


In [112]:
xgb_grid

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=250, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [113]:
xgb_param={'n_estimators':[250,350,550,606,720],
           'learning_rate':[0.09,0.1,0.11,0.15,0.50,1],
           "random_state":[0,10,20,30,40]
          }
xgb_random=hpt('random',xgb,xgb_param)
score_xgb_random=cross_val_score(xgb_random,x,y,cv=10)
print("XGB RandomSearchCV :",score_xgb_random.mean() )

XGB RandomSearchCV : 0.8910265834146293


In [114]:
xgb_random

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.09, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=720, n_jobs=0, num_parallel_tree=1, random_state=30,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [115]:
model=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=606, n_jobs=-1, num_parallel_tree=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [116]:
model.fit(xtrain,ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=606, n_jobs=-1, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [117]:
pre=model.predict(xtest)
r2_score(ytest,pre)

0.9272787735057759

In [118]:
import joblib
filename = 'car.pkl'
joblib.dump(model, filename)
#model = joblib.load(filename)

['car.pkl']

# Using Catboost

In [45]:
import catboost

In [46]:
from catboost import CatBoostRegressor

In [64]:
df=pd.read_excel("train.xlsx")

In [65]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [66]:
cat_flot(df,'Mileage')
cat_flot(df,'Engine')
cat_flot(df,'Power')
df.dropna(inplace=True,axis=0)

In [67]:
catdata=df.drop(['Owner_Type'], axis=1)

In [68]:
catdata.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Mileage,Engine,Power,Seats,Price
0,Mumbai,2010,72000,CNG,Manual,26.0,998.0,58.0,5.0,1.75
1,Pune,2015,41000,Diesel,Manual,19.0,1582.0,126.0,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,18.0,1199.0,88.0,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,20.0,1248.0,88.0,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,15.0,1968.0,140.0,5.0,17.74


In [69]:
x=catdata.drop(['Price'],axis=1)

In [70]:
y=catdata['Price']

In [74]:
catxtrain,catxtest,catytrain,catytest=train_test_split(x,y,test_size=0.3,random_state=0)

In [75]:
cf=['Location',"Fuel_Type",'Transmission']

In [76]:
model = CatBoostRegressor(
    n_estimators = 200,
    loss_function = 'MAE',
    eval_metric = 'RMSE',
    cat_features = cf )

In [77]:
model.fit(catxtrain,catytrain)
a=model.predict(catxtest)
r2_score(ytest,a)

0:	learn: 11.0141876	total: 248ms	remaining: 49.4s
1:	learn: 10.8194699	total: 292ms	remaining: 28.9s
2:	learn: 10.6300377	total: 326ms	remaining: 21.4s
3:	learn: 10.4577289	total: 354ms	remaining: 17.3s
4:	learn: 10.2798931	total: 379ms	remaining: 14.8s
5:	learn: 10.1096645	total: 413ms	remaining: 13.3s
6:	learn: 9.9629506	total: 439ms	remaining: 12.1s
7:	learn: 9.8020910	total: 468ms	remaining: 11.2s
8:	learn: 9.6505232	total: 505ms	remaining: 10.7s
9:	learn: 9.4944562	total: 530ms	remaining: 10.1s
10:	learn: 9.3556858	total: 566ms	remaining: 9.72s
11:	learn: 9.2309494	total: 595ms	remaining: 9.32s
12:	learn: 9.1031066	total: 624ms	remaining: 8.97s
13:	learn: 8.9785045	total: 652ms	remaining: 8.66s
14:	learn: 8.8592720	total: 678ms	remaining: 8.37s
15:	learn: 8.7461386	total: 703ms	remaining: 8.08s
16:	learn: 8.6349268	total: 730ms	remaining: 7.86s
17:	learn: 8.5331869	total: 755ms	remaining: 7.63s
18:	learn: 8.4230626	total: 774ms	remaining: 7.38s
19:	learn: 8.3088685	total: 805ms	r

163:	learn: 4.2240254	total: 5.86s	remaining: 1.28s
164:	learn: 4.2239079	total: 5.87s	remaining: 1.25s
165:	learn: 4.2141420	total: 5.91s	remaining: 1.21s
166:	learn: 4.2076120	total: 5.96s	remaining: 1.18s
167:	learn: 4.1958736	total: 6s	remaining: 1.14s
168:	learn: 4.1943995	total: 6.04s	remaining: 1.11s
169:	learn: 4.1908942	total: 6.08s	remaining: 1.07s
170:	learn: 4.1871992	total: 6.12s	remaining: 1.04s
171:	learn: 4.1737379	total: 6.15s	remaining: 1s
172:	learn: 4.1715359	total: 6.18s	remaining: 964ms
173:	learn: 4.1682680	total: 6.2s	remaining: 926ms
174:	learn: 4.1620548	total: 6.23s	remaining: 890ms
175:	learn: 4.1556146	total: 6.25s	remaining: 852ms
176:	learn: 4.1333669	total: 6.28s	remaining: 816ms
177:	learn: 4.1329682	total: 6.3s	remaining: 779ms
178:	learn: 4.1315117	total: 6.32s	remaining: 741ms
179:	learn: 4.1281863	total: 6.34s	remaining: 704ms
180:	learn: 4.1163940	total: 6.36s	remaining: 667ms
181:	learn: 4.1090114	total: 6.39s	remaining: 632ms
182:	learn: 4.106704

0.8291161810275657