In [1]:
import pandas as pd 
import numpy as np

In [197]:
# Read the raw car listings from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/cars.csv')
df.head(n=5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


# Feature Engineering

In [4]:
# Keep only the columns the model is trained on; everything listed here is
# discarded before encoding.
df = df.drop(
    columns=[
        'Model',
        'Origin',
        'Invoice',
        'EngineSize',
        'Cylinders',
        'Weight',
        'Wheelbase',
        'Length',
    ]
)

In [10]:
# MSRP is read as currency text such as "$36,945". The comma is a thousands
# separator, not a decimal point: swapping it for '.' only parses by accident
# when a price has exactly one comma ("$950" would land on a different scale,
# and "$1,200,000" would become the unparseable "1.200.000").
# Strip the '$' and every ',' instead, parse the dollar amount, and express it
# in thousands of dollars so the target keeps the scale seen in the
# df.head() output further down (36945 -> 36.945).
# The dtype guard makes the cell safe to re-run once MSRP is already numeric.
if not pd.api.types.is_numeric_dtype(df['MSRP']):
    df['MSRP'] = pd.to_numeric(
        df['MSRP'].str.replace(r'[$,]', '', regex=True).str.strip()
    ) / 1000

In [11]:
# Convert the MSRP column to a numeric dtype.
df['MSRP'] = df['MSRP'].pipe(pd.to_numeric)

# One-Hot Encoding

In [13]:
# One-hot encode the categorical columns into Make_*, Type_* and DriveTrain_*
# indicator columns.
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# emits bool indicators by default; mixed with the integer feature columns
# that would turn the later `X.to_numpy()` into an object array and would no
# longer match the 0/1 values shown in the output below.
df = pd.get_dummies(
    df,
    columns=['Make', 'Type', 'DriveTrain'],
    dtype='uint8',
)

In [14]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,MSRP,Horsepower,MPG_City,MPG_Highway,Make_Acura,Make_Audi,Make_BMW,Make_Buick,Make_Cadillac,Make_Chevrolet,...,Make_Volvo,Type_Hybrid,Type_SUV,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,36.945,265,17,23,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,23.82,200,24,31,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,26.99,200,22,29,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,33.195,270,20,28,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,43.755,225,18,24,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [195]:
# Show the column labels of the encoded frame: the MSRP target followed by
# the numeric features and the one-hot indicator columns.
df.columns

Index(['MSRP', 'Horsepower', 'MPG_City', 'MPG_Highway', 'Make_Acura',
       'Make_Audi', 'Make_BMW', 'Make_Buick', 'Make_Cadillac',
       'Make_Chevrolet', 'Make_Chrysler', 'Make_Dodge', 'Make_Ford',
       'Make_GMC', 'Make_Honda', 'Make_Hummer', 'Make_Hyundai',
       'Make_Infiniti', 'Make_Isuzu', 'Make_Jaguar', 'Make_Jeep', 'Make_Kia',
       'Make_Land Rover', 'Make_Lexus', 'Make_Lincoln', 'Make_MINI',
       'Make_Mazda', 'Make_Mercedes-Benz', 'Make_Mercury', 'Make_Mitsubishi',
       'Make_Nissan', 'Make_Oldsmobile', 'Make_Pontiac', 'Make_Porsche',
       'Make_Saab', 'Make_Saturn', 'Make_Scion', 'Make_Subaru', 'Make_Suzuki',
       'Make_Toyota', 'Make_Volkswagen', 'Make_Volvo', 'Type_Hybrid',
       'Type_SUV', 'Type_Sedan', 'Type_Sports', 'Type_Truck', 'Type_Wagon',
       'DriveTrain_All', 'DriveTrain_Front', 'DriveTrain_Rear'],
      dtype='object')

# Creating the model

In [23]:
from sklearn import ensemble
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [18]:
# Separate the target (MSRP) from the features (every other column) and hand
# both to scikit-learn as plain NumPy arrays.
y = df['MSRP'].to_numpy()
X = df.drop(columns='MSRP').to_numpy()

In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [188]:
import re

import sklearn

# The least-absolute-deviation loss was called 'lad' up to scikit-learn 0.24;
# 1.0 renamed it to 'absolute_error' and 1.2 removed the old spelling, so a
# hard-coded 'lad' fails on current releases. Pick the spelling that matches
# the installed version so the cell runs on both old and new environments.
_sk_major, _sk_minor = (
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
_absolute_error_loss = 'absolute_error' if (_sk_major, _sk_minor) >= (1, 0) else 'lad'

# Gradient-boosted regression trees predicting MSRP from the encoded features.
# random_state is fixed so that training is reproducible.
model = ensemble.GradientBoostingRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=10,
    min_samples_leaf=3,
    max_features=0.1,
    loss=_absolute_error_loss,
    random_state=0
)

In [189]:
# Train the regressor on the training split; the fitted estimator is the
# cell's displayed value.
model.fit(X=X_train, y=y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='lad', max_depth=10,
                          max_features=0.1, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=5000,
                          n_iter_no_change=None, presort='auto', random_state=0,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

In [190]:
# Predict MSRP for the held-out rows.
prediction = model.predict(X=X_test)

In [191]:
# Report the coefficient of determination on the held-out rows, scaled to a
# percentage.
print(
    "The R2 square value of Gradient Boosting is :",
    100 * r2_score(y_true=y_test, y_pred=prediction),
)

The R2 square value of Gradient Boosting is : 85.81690556508484


In [192]:
import joblib

In [193]:
# Persist the fitted estimator to model.pkl in the working directory.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']