In [1]:
# Imports
import pandas as pd

In [2]:
# Import the data
car_sales = pd.read_csv('../data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
# Split the Data set
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [4]:
# Lets treat doors as categorical
car_sales['Doors'].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [5]:
# Boom! We have our data.
# Seems like we would like to predict Price (which is a Number) - so look like it will be a REGRESSION problem
# Also, sklearn only takes numbers as inputs so lets convert all these strings into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# define our categorical features in the dataset 
categorical_features = ["Make", "Colour", "Doors"]

# initialize one_hot
# What is one_hot encoding anyway? -- DO RESEARCH!
one_hot = OneHotEncoder()

# initialize the transformer
# basically tells the transformer to take the one_hot encoder, and apply it to the categorical_features; Also just 
# ignore the remaining values (not in the categorical_features) with the passthrough flag
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# transform the data
transformed_x = transformer.fit_transform(X)
transformed_x_df = pd.DataFrame(transformed_x)
transformed_x_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [6]:
# Side note: We can do one hot encoding with Pandas as well
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [7]:
# Lets fit a model now that we have all the data in numerics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2) 
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.26047736562656

## What if there were missing values?

1. Fill them with some value (avg, mean, median) -- this is called IMPUTATION
2. Remove the samples with the missing data (not that ideal, but depends on situation)

## Practice filling missing data with Pandas

In [46]:
# Import the data
car_sales_missing = pd.read_csv('../data/car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [47]:
# Analyze the df
car_sales_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [48]:
# Analyze the df
car_sales_missing.describe()

Unnamed: 0,Odometer (KM),Doors,Price
count,950.0,950.0,950.0
mean,131253.237895,4.011579,16042.814737
std,69094.857187,0.382539,8581.695036
min,10148.0,3.0,2796.0
25%,70391.25,4.0,9529.25
50%,131821.0,4.0,14297.0
75%,192668.5,4.0,20806.25
max,249860.0,5.0,52458.0


In [49]:
# See how many missing values are there
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [50]:
# Fill the missing values of the predictor variables

# Make Column
# First flatten your ndarray to obtain a single dimensional array, then apply set() on it:
print(set(car_sales_missing['Make'].values.flatten()))
# Fill the nan values with "missing"
car_sales_missing['Make'].fillna('missing', inplace=True)

# Colour
print(set(car_sales_missing['Colour'].values.flatten()))
car_sales_missing['Colour'].fillna('missing', inplace=True)

# Colour
print(set(car_sales_missing['Colour'].values.flatten()))
car_sales_missing['Colour'].fillna('missing', inplace=True)

# Odometer (KM) -- Numerical value
odometer_series = car_sales_missing['Odometer (KM)']
mean_odometer_series = odometer_series.mean()
print('Mean Odometer (KM)' + str(mean_odometer_series))
car_sales_missing['Odometer (KM)'].fillna(mean_odometer_series, inplace=True)


# Doors
#  print(car_sales_missing['Doors'].unique())
#  print(car_sales_missing['Doors'].describe())

# On avg the mean car is gonna have 4 doors (lets see the most occurances of doors)
print(car_sales_missing['Doors'].value_counts())
car_sales_missing['Doors'].fillna(4, inplace=True)

car_sales_missing.isna().sum()

{nan, 'Nissan', 'Honda', 'BMW', 'Toyota'}
{nan, 'Red', 'Green', 'Blue', 'Black', 'White'}
{'Red', 'Green', 'Blue', 'missing', 'Black', 'White'}
Mean Odometer (KM)131253.23789473684
4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64


Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [52]:
# Now we can split our dataset into the train, test and run our model on it!

# drop all the values that have missing values -- SHOULD ONLY BE THE PRICE COLUMN 
# It's okay to drop all of these as we do not wanna fill values of our y varibale column

car_sales_missing.dropna(inplace=True)

car_sales_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 950 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           950 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 44.5+ KB


In [53]:
# Separate into X and y
X_mis = car_sales_missing.drop('Price', axis=1)
y_mis = car_sales_missing['Price']

print(X_mis)
print(y_mis)

# Now run split and fit a model!

        Make Colour  Odometer (KM)  Doors
0      Honda  White        35431.0    4.0
1        BMW   Blue       192714.0    5.0
2      Honda  White        84714.0    4.0
3     Toyota  White       154365.0    4.0
4     Nissan   Blue       181577.0    3.0
..       ...    ...            ...    ...
995   Toyota  Black        35820.0    4.0
996  missing  White       155144.0    3.0
997   Nissan   Blue        66604.0    4.0
998    Honda  White       215883.0    4.0
999   Toyota   Blue       248360.0    4.0

[950 rows x 4 columns]
0      15323.0
1      19943.0
2      28343.0
3      13434.0
4      14043.0
        ...   
995    32042.0
996     5716.0
997    31570.0
998     4001.0
999    12732.0
Name: Price, Length: 950, dtype: float64


# Practice filling data with scikit learn 

In [55]:
car_sales_missing_2 = pd.read_csv('../data/car-sales-extended-missing-data.csv')
car_sales_missing_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [56]:
# Fill the missing values with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' and numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")