In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load Dataset
car_sales = pd.read_csv('../data/car-sales-extended-missing-data.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
len(car_sales)

1000

In [4]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [5]:
# Add data to features
car_sales['Make'] = car_sales['Make'].fillna('missing')
car_sales['Colour'] = car_sales['Colour'].fillna('missing')
car_sales['Odometer (KM)'] = car_sales['Odometer (KM)'].fillna(car_sales['Odometer (KM)'].mean())
car_sales['Doors'] = car_sales['Doors'].fillna(4)

car_sales['Make'] = car_sales['Make'].astype('str')
car_sales['Colour'] = car_sales['Colour'].astype('str')
car_sales['Odometer (KM)'] = car_sales['Odometer (KM)'].astype('int')
car_sales['Doors'] = car_sales['Doors'].astype('int')

# Drop rows with Missing Price - Label
car_sales = car_sales.dropna()

In [6]:
car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [7]:
len(car_sales)

950

In [10]:
# Features & Labels
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']
X.head(3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4


In [17]:
# Transform to numericals
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
car_sales_category = ['Make', 'Colour', 'Doors']
transformer = ColumnTransformer([('categories', encoder, car_sales_category)], remainder='passthrough')
transformer.fit(X)
pd.DataFrame(transformer.transform(X)).head(3)

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."


In [19]:
# Transform to numericals
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
car_sales_category = ['Make', 'Colour', 'Doors']
transformer = ColumnTransformer([('categories', encoder, car_sales_category)], remainder='passthrough', sparse_threshold=0)
transformer.fit(X)

X_n = transformer.transform(X)
pd.DataFrame(X_n).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0


In [20]:
# Train and Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_n, y)

# Fit to Model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [22]:
model.score(X_train, y_train)

0.8693873879188956

In [23]:
model.score(X_test, y_test)

0.33692762494424255