In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/used_car_canada_clean.csv')
df.head()

Unnamed: 0,price,miles,year,make,model,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block,state
0,179999.0,9966.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,hyrid,3.5,V,NB
1,179995.0,5988.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,hyrid,3.5,V,QC
2,168528.0,24242.0,2017.0,acura,NSX,coupe,Car,4WD,Automatic,hyrid,3.5,V,BC
3,220000.0,6637.0,2020.0,acura,NSX,coupe,Car,4WD,Automatic,hyrid,3.5,V,ON
4,220000.0,6637.0,2020.0,acura,NSX,coupe,Car,4WD,Automatic,hyrid,3.5,V,ON


In [3]:
cols_to_drop = ['body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_block']
df = df.drop(cols_to_drop, axis=1)

In [4]:
df.head()

Unnamed: 0,price,miles,year,make,model,engine_size,state
0,179999.0,9966.0,2017.0,acura,NSX,3.5,NB
1,179995.0,5988.0,2017.0,acura,NSX,3.5,QC
2,168528.0,24242.0,2017.0,acura,NSX,3.5,BC
3,220000.0,6637.0,2020.0,acura,NSX,3.5,ON
4,220000.0,6637.0,2020.0,acura,NSX,3.5,ON


In [6]:
df_toyota_honda = df.loc[(df['make'] == 'honda') | (df['make'] == 'toyota')]

In [8]:
df_toyota_honda.to_csv('data/honda_toyota_ca.csv', index=False, header=True)

## Model 

In [9]:
df = pd.read_csv('data/honda_toyota_ca.csv')
df.head()

Unnamed: 0,price,miles,year,make,model,engine_size,state
0,4980.0,86132.0,2001.0,toyota,Prius,1.5,BC
1,18926.0,80516.0,2017.0,toyota,Prius,1.8,ON
2,23900.0,29295.0,2018.0,toyota,Prius,1.8,ON
3,27980.0,57894.0,2018.0,toyota,Prius,1.8,BC
4,22887.0,95106.0,2016.0,toyota,Prius,1.8,AB


In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(['price'], axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df[['make', 'model']], test_size=0.2, shuffle=True, random_state=42)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

cat_index = [2,3,5]

cat_features_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_features_transformer, cat_index)
    ]
)


model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", GradientBoostingRegressor(random_state=42))
    ]
)

In [14]:
model.fit(X_train, y_train)

In [15]:
model.score(X_test, y_test)

0.4865534075230413

In [16]:
from joblib import dump

dump(model, 'model/model.joblib')

['model/model.joblib']