In [1]:
# Import the required defaults
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the car-sales CSV used for both model training and testing,
# then preview its first three rows.
car_sales = pd.read_csv(filepath_or_buffer='../data/car-sales-extended-missing-data.csv')
car_sales.head(n=3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0


In [3]:
# Count the missing values in every column (features as well as the Price label)
car_sales.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
# Drop the rows whose label (Price) is missing.
#
# The frame is rebound instead of mutated with inplace=True, and the index is
# rebuilt as 0..n-1. The later cells build new DataFrames from plain arrays
# (which get a fresh 0..n-1 index), so without the reset the labels would keep
# a gappy index that no longer lines up with the processed features.
car_sales = car_sales.dropna(subset=['Price']).reset_index(drop=True)

In [5]:
# Re-count missing values per column: Price should now report 0,
# while the feature columns may still contain gaps.
car_sales.isna().sum(axis=0)

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [6]:
# Separate the label column (y) from the feature columns (X)
y = car_sales['Price']
X = car_sales.drop(columns=['Price'])

In [7]:
# Impute features
#
# Each feature column is filled with a strategy suited to its content:
#   * Make / Colour  -> the literal category 'missing'
#   * Odometer (KM)  -> the column mean
#   * Doors          -> the constant 4
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# The column groups are named once so that the transformer specification and
# the column labels of the rebuilt frame cannot drift apart.
text_features = ['Make', 'Colour']
odometer_features = ['Odometer (KM)']
door_features = ['Doors']

text_imputer = SimpleImputer(strategy='constant', fill_value='missing')
numerical_imputer = SimpleImputer(strategy='mean')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)

transformer = ColumnTransformer(
    [('text_imputer', text_imputer, text_features),
     ('numerical_imputer', numerical_imputer, odometer_features),
     ('door_imputer', door_imputer, door_features)],
    remainder='passthrough',
    sparse_threshold=0
)

# NOTE(review): the imputers are fitted on every row before the train/test
# split, so the Odometer mean is computed with rows that later end up in the
# test set. Consider fitting on the training rows only.
X_imputed = pd.DataFrame(
    transformer.fit_transform(X),
    # Output columns follow the order of the transformer list above.
    columns=text_features + odometer_features + door_features,
    # Keep X's row labels so the imputed features stay index-aligned with y.
    index=X.index,
)

# Stacking text and numeric columns into one array yields object dtype for
# every column; restore a numeric dtype on the numeric features.
X_imputed = X_imputed.astype({'Odometer (KM)': float, 'Doors': float})
X_imputed.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [8]:
# Verify that imputation left no missing values in any feature column
X_imputed.isnull().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [9]:
# Standardise the odometer readings with StandardScaler
from sklearn.preprocessing import StandardScaler

# Selecting with double brackets gives a one-column frame, i.e. the
# (n_samples, 1) layout the scaler expects, without an explicit reshape.
odometer = np.array(X_imputed[['Odometer (KM)']])

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
# The column is overwritten in place, so X_imputed holds scaled values from here on.
X_imputed['Odometer (KM)'] = scaler.fit(odometer).transform(odometer)
X_imputed.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,-1.428085,4.0
1,BMW,Blue,0.9225,5.0
2,Honda,White,-0.691554,4.0
3,Toyota,White,0.349376,4.0
4,Nissan,Blue,0.756058,3.0


In [11]:
# One-hot encode the categorical columns so that every feature is numeric
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# This rebinds `transformer`, replacing the imputing transformer built earlier.
transformer = ColumnTransformer(
    transformers=[('category', encoder, ['Make', 'Colour', 'Doors'])],
    remainder='passthrough',  # columns not listed above are carried through unchanged
    sparse_threshold=0,       # always return a dense array
)
X_clean = pd.DataFrame(transformer.fit_transform(X_imputed))
X_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.428085
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9225
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.691554
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.349376
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.756058


In [12]:
# Train / test split
from sklearn.model_selection import train_test_split

# A fixed seed makes the shuffle, and therefore every score computed from this
# split, repeatable across kernel restarts. test_size=0.25 only spells out the
# library default, so the split proportions are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.25, random_state=42
)

In [13]:
# Fit a random forest regressor on the training split
# (scoring happens in the following cells).
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: its bootstrap sampling and feature selection are random, so
# without a fixed random_state the fitted model differs on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [14]:
# Score on the rows the model was fitted on (R^2 for a regressor).
# This is an optimistic figure; compare it with the test score.
model.score(X=X_train, y=y_train)

0.8764573101552555

In [15]:
# Score on the held-out rows (R^2 for a regressor).
# A value far below the training score indicates that the model is overfitting.
model.score(X=X_test, y=y_test)

0.15106140399834556