#### Importing, Preprocessing

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import os 

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_validate, cross_val_score

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../scr/clean_df.csv', header=0)

In [3]:
df['date_created'] = pd.to_datetime(df['date_created'],format="%Y-%m-%d %H:%M:%S.%f%z", errors='coerce')
df['year_created'] = df['date_created'].dt.year

df['man_period'] = df['year_created'] - df['manufacture_year']
df['stk_period'] = df['year_created'] - df['stk_year']

df = df.drop(['manufacture_year', 'stk_year', 'date_created', 'date_last_seen', 'year_created'], axis=1)

In [4]:
df

Unnamed: 0,maker,model,mileage,engine_displacement,engine_power,body_type,transmission,door_count,seat_count,fuel_type,price_eur,man_period,stk_period
0,ford,galaxy,151000.0,2000.0,138.12506,compact,man,5.0,7.0,diesel,10584.75,4.0,
1,skoda,octavia,143476.0,2000.0,108.62262,compact,man,5.0,5.0,diesel,8882.31,3.0,
2,bmw,,97676.0,1995.0,113.98670,compact,man,5.0,5.0,diesel,12065.06,5.0,
3,skoda,fabia,111970.0,1200.0,84.48426,compact,man,5.0,5.0,gasoline,2960.77,11.0,
4,skoda,fabia,128886.0,1200.0,84.48426,compact,man,5.0,5.0,gasoline,2738.71,11.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2834466,skoda,yeti,69267.0,1197.0,103.25854,offroad,auto,5.0,5.0,,24981.50,2.0,
2834467,chevrolet,matiz,59000.0,995.0,65.70998,compact,man,5.0,5.0,,24981.50,10.0,
2834468,chevrolet,aveo,86000.0,1200.0,71.07406,sedan,man,5.0,5.0,,24981.50,6.0,
2834469,ford,ka,123000.0,1299.0,68.39202,compact,man,3.0,4.0,,24981.50,11.0,


In [10]:
m1 = df.memory_usage(deep=True).sum()
m1

956108316

In [11]:
df2 = df.copy(deep=True)

In [12]:
# Filled NaN values with zeroes

df2.fillna(0, inplace=True)

In [13]:
numerical_columns = ['engine_power', 'mileage', 'engine_displacement']
conv_to_string_columns = ['door_count', 'seat_count', 'man_period', 'stk_period']
categorical_columns = ['maker', 'model', 'body_type', 'transmission', 'fuel_type']+conv_to_string_columns


In [14]:
def datatype_converter(x):

    for i in conv_to_string_columns:
        x[i] = x[i].apply(lambda x: str(x) if not pd.isnull(x) else x)
    
    for i in categorical_columns:
        x[i] = x[i].astype('category')
    
    x[numerical_columns] = x[numerical_columns].apply(pd.to_numeric, downcast="float")
    x['price_eur'] = x['price_eur'].apply(pd.to_numeric, downcast='float')

    return x

In [15]:
df = datatype_converter(df)

In [16]:
df2 = datatype_converter(df2)

m2 = df2.memory_usage(deep=True).sum()
m2

85111439

In [28]:
X = df2.drop('price_eur', axis=1)
y = df2['price_eur'].to_numpy()

In [29]:
encoded2 = pd.get_dummies(X, columns=categorical_columns)

In [30]:
encoded2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834471 entries, 0 to 2834470
Columns: 921 entries, mileage to stk_period_0.0
dtypes: bool(918), float32(3)
memory usage: 2.5 GB


In [21]:
#df3 = encoded2.fillna(0)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(encoded2, y, test_size=0.2, random_state=10)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2267576, 921), (566895, 921), (2267576,), (566895,))

#### Random Forest

In [39]:
params = {
    "n_estimators": 10, 
    "max_depth": 10,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "ccp_alpha": 0,
    "random_state": 10,
    "n_jobs": -1,
    "bootstrap": True,
    "max_samples": 0.01
}

rf = RandomForestRegressor(**params)

rf = rf.fit(X_train, y_train)

In [33]:
y_pred = rf.predict(X_test)
print(metrics.r2_score(y_pred, y_test)) #0.6485229513084686

0.6485229513084686


#### Randomized Search CV for Random Forest

In [78]:
rs_param_grid = {
    "n_estimators": list((range(10, 20))),
    "max_depth": list((range(5, 15))),
    "min_samples_split": list((range(2, 10))),
    "min_samples_leaf": list((range(1, 7))),
    "ccp_alpha": [0, 0.001, 0.01, 0.1, 0.5],
    "max_samples": [0.1]
}


In [79]:
scoring_list = {'R2': metrics.make_scorer(metrics.r2_score)}

In [80]:
rf = RandomForestRegressor(random_state=10, n_jobs=-1)

rf_rs = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rs_param_grid,
    scoring = 'r2',
    n_iter=3,  # Number of parameter candidate settings to sample
    verbose=2,  # The higher this is, the more messages are outputed
    random_state=10,
    refit = True
)

In [81]:
rf_rs.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ccp_alpha=0.1, max_depth=11, max_samples=0.1, min_samples_leaf=5, min_samples_split=9, n_estimators=13; total time= 1.9min
[CV] END ccp_alpha=0.1, max_depth=11, max_samples=0.1, min_samples_leaf=5, min_samples_split=9, n_estimators=13; total time= 1.6min
[CV] END ccp_alpha=0.1, max_depth=11, max_samples=0.1, min_samples_leaf=5, min_samples_split=9, n_estimators=13; total time= 1.8min
[CV] END ccp_alpha=0.1, max_depth=11, max_samples=0.1, min_samples_leaf=5, min_samples_split=9, n_estimators=13; total time= 1.6min
[CV] END ccp_alpha=0.1, max_depth=11, max_samples=0.1, min_samples_leaf=5, min_samples_split=9, n_estimators=13; total time= 1.6min
[CV] END ccp_alpha=0.001, max_depth=10, max_samples=0.1, min_samples_leaf=2, min_samples_split=3, n_estimators=13; total time= 1.5min
[CV] END ccp_alpha=0.001, max_depth=10, max_samples=0.1, min_samples_leaf=2, min_samples_split=3, n_estimators=13; total time= 1.5min
[CV] END ccp

In [82]:
rf_rs.best_score_ 

# ms=0.001 -> 
# 0.759607066616578

# ms=0.01
# 0.8019836374427675

# ms=0.1
# 0.8198670082118591

0.8198670082118591

In [83]:
rf_rs.best_params_

# ms=0.001 ->

# {'n_estimators': 13,
#  'min_samples_split': 3,
#  'min_samples_leaf': 2,
#  'max_samples': 0.001,
#  'max_depth': 10,
#  'ccp_alpha': 0.001}

# ms=0.01
# {'n_estimators': 18,
#  'min_samples_split': 6,
#  'min_samples_leaf': 6,
#  'max_samples': 0.01,
#  'max_depth': 11,
#  'ccp_alpha': 0.1}

# ms=0.1
# {'n_estimators': 18,
#  'min_samples_split': 6,
#  'min_samples_leaf': 6,
#  'max_samples': 0.1,
#  'max_depth': 11,
#  'ccp_alpha': 0.1}

{'n_estimators': 18,
 'min_samples_split': 6,
 'min_samples_leaf': 6,
 'max_samples': 0.1,
 'max_depth': 11,
 'ccp_alpha': 0.1}

In [84]:
y_pred = rf_rs.best_estimator_.predict(X_test)
print(metrics.r2_score(y_pred, y_test)) 

# ms=0.01
# 0.6485229513084686

# ms=0.1
# 0.7717527292153455

0.7717527292153455


#### Conveting df to CSR matrix

In [47]:
df_csr = scipy.sparse.csr_matrix(df2.values)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df_csr, y, test_size=0.3, random_state=10)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1984129, 914), (850342, 914), (1984129,), (850342,))

In [50]:
rf.fit(X_train, y_train) #4 minutes running time

In [51]:
y_pred = rf.predict(X_test)

print(metrics.r2_score(y_pred, y_test))

0.7245555619208068

In [52]:
df_csr

<2834471x914 sparse matrix of type '<class 'numpy.float64'>'
	with 25661236 stored elements in Compressed Sparse Row format>