In [12]:
# Imports
import pandas as pd

In [14]:
# Import the data
car_sales = pd.read_csv('../data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [15]:
# Split the Data set
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [16]:
# Lets treat doors as categorical
car_sales['Doors'].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [19]:
# Boom! We have our data.
# Seems like we would like to predict Price (which is a Number) - so look like it will be a REGRESSION problem
# Also, sklearn only takes numbers as inputs so lets convert all these strings into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# define our categorical features in the dataset 
categorical_features = ["Make", "Colour", "Doors"]

# initialize one_hot
# What is one_hot encoding anyway? -- DO RESEARCH!
one_hot = OneHotEncoder()

# initialize the transformer
# basically tells the transformer to take the one_hot encoder, and apply it to the categorical_features; Also just 
# ignore the remaining values (not in the categorical_features) with the passthrough flag
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# transform the data
transformed_x = transformer.fit_transform(X)
transformed_x_df = pd.DataFrame(transformed_x)
transformed_x_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [20]:
# Side note: We can do one hot encoding with Pandas as well
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [24]:
# Lets fit a model now that we have all the data in numerics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2) 
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.20325875336696786

## What if there were missing values?

1. Fill them with some value (avg, mean, median) -- this is called IMPUTATION
2. Remove the samples with the missing data (not that ideal, but depends on situation)

In [None]:
# Practice fillig missing data with Pandas

In [None]:
# Practice filling data with scikit learn 