In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
cars_df_cleaned = pd.read_pickle('./dataset/cars_df_cleaned.pkl')

In [4]:
cars_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 359
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Full_Name          350 non-null    object 
 1   Brand              350 non-null    object 
 2   Price (HUF)        350 non-null    int64  
 3   Year               350 non-null    object 
 4   Condition          350 non-null    object 
 5   Design             350 non-null    object 
 6   Kilometers         350 non-null    int64  
 7   Persons            349 non-null    float64
 8   Doors              349 non-null    float64
 9   Colour             299 non-null    object 
 10  Own_weight (kg)    347 non-null    float64
 11  Full_weight (kg)   347 non-null    float64
 12  Trunk (liter)      344 non-null    float64
 13  AirCon_Type        347 non-null    object 
 14  Fuel               350 non-null    object 
 15  Cylinder (cm3)     346 non-null    float64
 16  Layout             342 non

We are going to exclude the imbalance columns, because our dataset is very small.
Having just a few rows of data for one class degrades model quality. If possible, we should have at least 100 rows of data for every class.

In [8]:
# Choose relevant columns
# We dont need columns which are highly correlated with others, such as weight. We also exclude the imbalance columns, such as Document, Colour, Wheel_drive etc.
df_model = cars_df_cleaned[['Brand','Year','Condition','Design','Kilometers','Trunk (liter)','AirCon_Type_short','Fuel','Cylinder (cm3)','Gearbox_short','Power (kW)','HorsePower (hp)','Price (HUF)']]

In [9]:
df_model.head()

Unnamed: 0,Brand,Year,Condition,Design,Kilometers,Trunk (liter),AirCon_Type_short,Fuel,Cylinder (cm3),Gearbox_short,Power (kW),HorsePower (hp),Price (HUF)
0,RENAULT,2011,Újszerű,Kombi,231000,491.0,Digitális,Dízel,1461.0,Manuális,66,90,1488000
1,MAZDA,2012,Normál,Ferdehátú,188000,,Manuális,Dízel,1560.0,Manuális,70,95,1490000
2,OPEL,2014,Megkímélt,Ferdehátú,199000,285.0,Automata,Benzin,1229.0,Manuális,63,86,1499000
3,FIAT,2012,Normál,Ferdehátú,108260,275.0,Manuális,Benzin,1368.0,Manuális,57,77,1499999
4,FIAT,2011,Normál,Egyterű,188000,,Manuális,Benzin,1400.0,Manuális,70,95,1515000


In [18]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 0 to 359
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              350 non-null    object 
 1   Year               350 non-null    object 
 2   Condition          350 non-null    object 
 3   Design             350 non-null    object 
 4   Kilometers         350 non-null    int64  
 5   Trunk (liter)      344 non-null    float64
 6   AirCon_Type_short  350 non-null    object 
 7   Fuel               350 non-null    object 
 8   Cylinder (cm3)     346 non-null    float64
 9   Gearbox_short      345 non-null    object 
 10  Power (kW)         350 non-null    int64  
 11  HorsePower (hp)    350 non-null    int64  
 12  Price (HUF)        350 non-null    int64  
dtypes: float64(2), int64(4), object(7)
memory usage: 38.3+ KB


## Handling missing values
We are going to use regression model to predict the missing numerical values.

For the categorical ones, we are going to replace the null values with the most common value

### a) Categorical variables

In [17]:
most_common_val = df_model['AirCon_Type_short'].value_counts().sort_values(ascending = False).index[0]
df_model['AirCon_Type_short'].fillna(value = most_common_val, inplace=True)

most_common_val_gb = df_model['Gearbox_short'].value_counts().sort_values(ascending = False).index[0]
df_model['Gearbox_short'].fillna(value = most_common_val_gb, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


### b) Numerical variables

In [70]:
# One hot encoding, drop first to avoid dummy variable trap
df_model_dummies = pd.get_dummies(df_model, drop_first=True)

In [71]:
# avoid dummy variable trap
df_model_dummies

Unnamed: 0,Kilometers,Trunk (liter),Cylinder (cm3),Power (kW),HorsePower (hp),Price (HUF),Brand_AUDI,Brand_BMW,Brand_CHEVROLET,Brand_CITROEN,Brand_DACIA,Brand_FIAT,Brand_FORD,Brand_HONDA,Brand_HYUNDAI,Brand_KIA,Brand_MAZDA,Brand_MERCEDES-BENZ,Brand_MINI,Brand_MITSUBISHI,Brand_NISSAN,Brand_OPEL,Brand_PEUGEOT,Brand_RENAULT,Brand_SEAT,Brand_SKODA,Brand_SMART,Brand_SUZUKI,Brand_TOYOTA,Brand_VOLKSWAGEN,Brand_VOLVO,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Condition_Megkímélt,Condition_Normál,Condition_Sérülésmentes,Condition_Újszerű,Design_Egyterű,Design_Ferdehátú,Design_Kisbusz,Design_Kombi,Design_Sedan,Design_Városi terepjáró (crossover),AirCon_Type_short_Digitális,AirCon_Type_short_Manuális,Fuel_Dízel,Fuel_Elektromos,Fuel_Hibrid,Fuel_Hibrid (Dízel),Gearbox_short_Manuális,Gearbox_short_Szekvenciális
0,231000,491.0,1461.0,66,90,1488000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0
1,188000,,1560.0,70,95,1490000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0
2,199000,285.0,1229.0,63,86,1499000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,108260,275.0,1368.0,57,77,1499999,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
4,188000,,1400.0,70,95,1515000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
5,176028,430.0,1198.0,51,69,1550000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
6,190000,480.0,1197.0,63,86,1590000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
7,347310,550.0,1598.0,66,90,1590000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,0
8,206000,482.0,1596.0,74,101,1599000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0
9,247000,140.0,1686.0,81,110,1599000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


#### Trunk variable

In [72]:
df_train_temp = df_model_dummies.loc[~df_model_dummies['Trunk (liter)'].isnull()]
df_test_temp = df_model_dummies.loc[df_model_dummies['Trunk (liter)'].isnull()]

In [82]:
train_cols = [x for x in df_train_temp.columns if x != 'Trunk (liter)' and x != 'Cylinder (cm3)']
test_col = 'Trunk (liter)'

In [84]:
X_train = np.array(df_train_temp[train_cols])
y_train = np.array(df_train_temp[test_col])
X_test = np.array(df_test_temp[train_cols])
y_test = np.array(df_test_temp[test_col])

In [85]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': np.arange(100,500,50)
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           cv = 5, n_jobs = -1, verbose = 2)

In [86]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  4.2min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': array([100, 150, 200, 250, 300, 350, 400, 450])},
             verbose=2)

In [96]:
trunk_pred = grid_search.predict(X_test)

In [99]:
df_model.loc[df_model['Trunk (liter)'].isnull(),'Trunk (liter)'] = trunk_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


#### Cylinder variable

In [119]:
# Assign new value, because trunk has no missing anymore
df_model_dummies = pd.get_dummies(df_model, drop_first=True)

In [120]:
df_train_temp = df_model_dummies.loc[~df_model_dummies['Cylinder (cm3)'].isnull()]
df_test_temp = df_model_dummies.loc[df_model_dummies['Cylinder (cm3)'].isnull()]

In [122]:
train_cols = [x for x in df_train_temp.columns if x != 'Cylinder (cm3)']
test_col = 'Cylinder (cm3)'

In [123]:
X_train = np.array(df_train_temp[train_cols])
y_train = np.array(df_train_temp[test_col])
X_test = np.array(df_test_temp[train_cols])
y_test = np.array(df_test_temp[test_col])

In [124]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': np.arange(100,500,50)
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           cv = 5, n_jobs = -1, verbose = 2)

In [125]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  4.0min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': array([100, 150, 200, 250, 300, 350, 400, 450])},
             verbose=2)

In [126]:
cylinder_pred = grid_search.predict(X_test)

In [127]:
cylinder_pred

array([1523.89062202, 1485.32068177, 1432.3907544 , 1475.74459447])

In [128]:
df_model.loc[df_model['Cylinder (cm3)'].isnull(),'Cylinder (cm3)'] = cylinder_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [130]:
df_model.to_pickle('./dataset/df_model.pkl')

## One hot encoding

## Train - Test split

## Feature scaling

## Model building

### Ensemble with regression?

## Model evaluation