## Pre-processing & Training Data Development

### <a href='#dummy'> Dummy Variables </a>

### <a href='#scaler'> Standardize the Data </a>

### <a href='#split'> The Train Test Split </a>

In [1]:
import pandas as pd

## import local files
file_name = './data/knn_cleaned.csv'
df = pd.read_csv(file_name)

In [12]:
import warnings
warnings.filterwarnings("ignore")

###  <span id="dummy"> Dummy Variables </span> 
The current df is already in a dummy variable form due to using simple imputer and knn imputer in earlier stages to fill missing values. 

In [2]:
# Check for dtypes
df.dtypes

Unnamed: 0        int64
id              float64
url             float64
region          float64
region_url      float64
price           float64
year            float64
manufacturer    float64
model           float64
condition       float64
cylinders       float64
fuel            float64
odometer        float64
title_status    float64
transmission    float64
vin             float64
drive           float64
size            float64
type            float64
paint_color     float64
image_url       float64
description     float64
state           float64
lat             float64
long            float64
vin_year        float64
dtype: object

###  <span id="scaler"> Standardize the Data </span> 

I standardize the data so that the magnitude and range of each feature are similar enough for modeling.

In [3]:
# standardize the columns individually
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss

StandardScaler()

In [4]:
ss.fit(df)
scaled = pd.DataFrame(ss.transform(df), columns=df.columns)
scaled.describe()

Unnamed: 0.1,Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,...,drive,size,type,paint_color,image_url,description,state,lat,long,vin_year
count,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,...,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0,266524.0
mean,-6.824862e-18,8.29515e-14,-6.824862e-18,-5.4598890000000005e-17,1.450283e-17,2.3033910000000003e-17,1.67175e-14,2.4740120000000002e-17,2.9112300000000005e-17,1.442818e-16,...,3.6683630000000005e-17,-8.517747000000001e-17,-1.7168790000000002e-17,-8.371119e-17,-5.4598890000000005e-17,3.7536740000000004e-17,-1.177289e-16,2.815255e-16,-2.098645e-15,-5.443382e-14
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-1.732044,-2.25909,-1.732044,-1.571951,-1.60127,-1.197031,-3.653691,-1.211226,-0.5876796,-1.315222,...,-1.139715,-0.604695,-0.9379663,-1.792803,-1.464109,-1.588215,-1.391315,-20.09984,-3.79672,-4.708434
25%,-0.8660222,-0.7552625,-0.8660222,-0.9812909,-0.9820284,-0.7191911,-0.7265959,-0.6941509,-0.5517599,-1.315222,...,-1.139715,-0.604695,-0.9379663,-0.7522108,-0.8840063,-0.8859741,-0.7718667,-0.6272349,-0.7089952,-0.5342272
50%,0.0,0.2020244,0.0,-0.002482323,0.01701446,-0.2411078,0.1343145,-0.4356132,-0.4333465,0.8323373,...,-0.303639,-0.604695,-0.2900774,-0.05848265,-0.1186057,-0.05942372,-0.2900739,0.1106797,0.334571,0.1732654
75%,0.8660222,0.9009389,0.8660222,0.8244421,0.8344132,0.4110818,0.8230427,0.7278062,0.05248323,0.8323373,...,1.368512,0.423948,0.3578115,0.9821095,0.8388249,0.8555812,0.811167,0.6395197,0.716567,0.7392595
max,1.732044,1.302025,1.732044,1.794813,1.767404,37.6426,1.511771,3.959527,4.93635,2.979897,...,1.368512,3.509877,3.273312,2.369566,1.95994,1.832388,2.050063,7.002963,11.76741,1.446752


### <span id="split"> Split the Data into Train & Test  </span>

This step is important so that the model we create can be tested against new data. 

In [5]:
from sklearn.model_selection import train_test_split

In [22]:
keep = ["fuel","odometer","year"]
more = ["fuel","odometer","year", "model","manufacturer","region","cylinders","condition","drive"]

X = scaled.loc[:,scaled.columns.isin(more)]
y = scaled.price.values


X = scaled.drop(columns="price")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



## <span id="split"> Modelling  </span>

### Linear Regression

In [23]:
# linear regression
from sklearn.linear_model import LinearRegression

lr = LinearRegression(fit_intercept=False,)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr.score(X_test, y_test)


# random forest

0.39745356359581807

### Support Vector Machine

In [8]:
# from sklearn.svm import SVR

# svm = SVR(C=1.0, epsilon=0.2)
# svm.fit(X_train, y_train)
# svm.predict(X_test)
# svm.score(X_test, y_test)

SVM r^2 score = 0.47239055872658975 ## took 30 min

### MLP Regressor

In [24]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
mlp.predict(X_test)
mlp.score(X_test, y_test)

0.6855475504017188

### XGB

In [25]:
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train,y_train)
reg.predict(X_test)
reg.score(X_test, y_test)

0.6814373419783262

### Decision Tree 

In [26]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.predict(X_test)
dt.score(X_test, y_test)

0.6921737228453941

In [27]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt, X_train, y_train, cv=5)
scores

array([0.67275574, 0.67725938, 0.68611966, 0.68126496, 0.69013781])

### HyperParameters for Decision Tree

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params = {"max_depth": range(16,22),
              "max_features": ["auto"],
              "min_samples_leaf": randint(1, 9),
              "splitter":["best","random"],
              "random_state": [0, None]}

dt = DecisionTreeRegressor()

tree_cv = RandomizedSearchCV(dt, params, cv=5)


# Fit it to the data
tree_cv.fit(X_train,y_train)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'max_depth': 18, 'max_features': 'auto', 'min_samples_leaf': 7, 'random_state': 0, 'splitter': 'best'}
Best score is 0.7410168458168531
