In [26]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
# Read the raw car-sales table from disk and preview its first three rows
csv_path = '../data/car-sales-extended-missing-data.csv'
car_sale = pd.read_csv(csv_path)
car_sale.head(n=3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0


#### Fill in missing data

In [28]:
# Number of missing values in each column, before any cleaning
car_sale.isna().sum(axis=0)

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [29]:
# Fill in missing feature values, then drop the rows that still have a gap.
# Text columns get an explicit 'missing' category, the odometer gets the
# column mean, and Doors gets 4 as a hard-coded default.
# NOTE(review): the odometer mean is computed over every row, i.e. before the
# train/test split further down - confirm that is acceptable.
fill_values = {
    'Make': 'missing',
    'Colour': 'missing',
    'Odometer (KM)': car_sale['Odometer (KM)'].mean(),
    'Doors': 4,
}

# Rebind the name instead of mutating the frame in place (no inplace=True).
# Going by the null counts shown above, Price is the only column left with
# gaps after the fills, so dropna() removes exactly the rows without a label.
car_sale = car_sale.fillna(value=fill_values).dropna()

In [30]:
# Guard the cleaning step: fail loudly if any missing value survived it,
# then show the per-column counts as before
assert not car_sale.isna().any().any(), 'car_sale still contains missing values'
car_sale.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

#### Features and Label

In [31]:
# Separate the target (Price) from the remaining columns, which become the features
y = car_sale['Price']
X = car_sale.drop(columns=['Price'])

#### Feature scaling: standardise the odometer column

In [32]:
from sklearn.preprocessing import StandardScaler

# Standardise the odometer readings. The scaler expects a 2-D
# (n_samples, 1) array, hence the reshape of the single column.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split further down - confirm that is intended.
scaler = StandardScaler()
odometer = X['Odometer (KM)'].to_numpy(copy=True).reshape(-1, 1)
X['Odometer (KM)'] = scaler.fit_transform(odometer)

In [33]:
# Preview the first five feature rows to check the scaled odometer column
X.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,-1.428286,4.0
1,BMW,Blue,0.922299,5.0
2,Honda,White,-0.691754,4.0
3,Toyota,White,0.349175,4.0
4,Nissan,Blue,0.755857,3.0


#### Convert categorical features to numbers so an ML model can be fitted

In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns. Every other column (here the scaled
# odometer) is passed through unchanged and lands after the encoded columns.
categorical_features = ['Make', 'Colour', 'Doors']
encoder = OneHotEncoder()
transformer = ColumnTransformer(
    [('categories', encoder, categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,  # 0 => always return a dense array
)

# Carry X's row labels over to the encoded frame: rows were dropped during
# cleaning, so a fresh 0..n-1 index would no longer line up with the labels on y
X_n = pd.DataFrame(transformer.fit_transform(X), index=X.index)
X_n.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.428286
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.922299
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.691754


#### Train/test split

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_n, y, test_size=0.2, random_state=42
)

#### Fit the model and score it

In [37]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so its internal randomness is repeatable and the scores
# below can be reproduced on a fresh run
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [38]:
# Score the fitted model on the data it was trained on
model.score(X=X_train, y=y_train)

0.8709496949131966

In [39]:
# Score the fitted model on the held-out test rows
model.score(X=X_test, y=y_test)

0.2573443698624456