### Import libraries and features 

In [20]:
import gc
import sys
import os 
import pandas as pd 
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" would also hide any solver/convergence
# warnings raised by the ElasticNet fits below - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Put the project's scripts directory on the import path
# (presumably where the `util` module imported in the next cell lives).
home = os.path.expanduser("~")
sys.path.append(f"{home}/Documents/projects/CarPriceRegression/Machine_Learning/scripts/")

In [3]:
# NOTE(review): the star import hides which names come from `util`; none appear
# to be used in the cells visible here - consider importing explicit names.
from util import *
# Enable autoreload in mode 2: modules are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

### Import data and split into train and test 

In [9]:
# Read the processed car data set (presumably one-hot encoded, per the file name).
data = pd.read_csv(
    f"{home}/Documents/projects/CarPriceRegression/Processed_Data/car_onehot_data.csv"
)

# Separate the predictors from the regression target (`price`).
features = data.drop(columns="price")
y = data["price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=2020,
)

In [10]:
# Sanity check: display the shapes of the train and test features/targets.
X_train.shape,y_train.shape, X_test.shape,y_test.shape

((79348, 206), (79348,), (19838, 206), (19838,))

### Garbage collection

In [11]:
# Run a garbage-collection pass, then flush IPython's output cache
# (presumably so cached cell outputs stop holding on to memory).
gc.collect()
%reset -f out

Flushing output cache (1 entries)


### Generate a base model first 

In [30]:
# Baseline: ElasticNet with the library's default hyperparameters; only the seed is set.
base_model = ElasticNet(random_state=2020)

In [13]:
# Fit the baseline model on the training split.
base_model.fit(X_train,y_train)

ElasticNet(random_state=2020)

In [16]:
# Baseline predictions on both splits, kept for the metric cells that follow.
train_pred = base_model.predict(X_train)
test_pred = base_model.predict(X_test)

In [18]:
# Baseline (MSE, R^2) on the training split.
mean_squared_error(y_train,train_pred), r2_score(y_train,train_pred)

(43477278.05742842, 0.5521688857933424)

In [19]:
# Baseline (MSE, R^2) on the held-out test split.
mean_squared_error(y_test,test_pred), r2_score(y_test,test_pred)

(44652950.42503811, 0.5463218542056005)

### Key parameters

* alpha
* l1_ratio
* max_iter
* tol

In [69]:
# Hyperparameter grid for the search:
# 4 alphas x 3 l1_ratios x 2 max_iters x 2 tols = 48 candidates.
params = dict(
    alpha=[0.005, 0.05, 0.5, 1],
    l1_ratio=[1, 0.5, 0],
    max_iter=[1000, 2000],
    tol=[1e-4, 5e-5],
)

In [70]:
# Exhaustive search over `params`, scoring every candidate by R^2
# and running up to 20 jobs in parallel.
searchGrid = GridSearchCV(
    estimator=base_model,
    param_grid=params,
    scoring="r2",
    n_jobs=20,
    verbose=True,
)

In [71]:
# Run the grid search on the training split (the recorded run took about 15 minutes).
searchGrid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   22.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:  8.6min
[Parallel(n_jobs=20)]: Done 240 out of 240 | elapsed: 15.1min finished


GridSearchCV(estimator=ElasticNet(random_state=2020), n_jobs=20,
             param_grid={'alpha': [0.005, 0.05, 0.5, 1],
                         'l1_ratio': [1, 0.5, 0], 'max_iter': [1000, 2000],
                         'tol': [0.0001, 5e-05]},
             scoring='r2', verbose=True)

In [72]:
# Cross-validated R^2 of the best candidate found by the search.
searchGrid.best_score_

0.8380027741135663

In [73]:
# Hyperparameter combination that achieved the best cross-validated score.
# NOTE(review): in the recorded run the best alpha (0.005) is the smallest value
# in the grid, so the optimum may lie below the searched range - consider
# extending the grid towards smaller alphas.
searchGrid.best_params_

{'alpha': 0.005, 'l1_ratio': 1, 'max_iter': 1000, 'tol': 5e-05}

In [74]:
# Keep the estimator built with the best parameters as the final model.
final_model = searchGrid.best_estimator_

In [75]:
# Predictions of the tuned model on both splits, kept for the metric cells that follow.
train_pred_tuned = final_model.predict(X_train)
test_pred_tuned = final_model.predict(X_test)

In [76]:
# Tuned-model (MSE, R^2) on the training split.
mean_squared_error(y_train,train_pred_tuned), r2_score(y_train,train_pred_tuned)

(15474563.998141116, 0.8406065985088567)

In [77]:
# Tuned-model (MSE, R^2) on the held-out test split.
mean_squared_error(y_test,test_pred_tuned), r2_score(y_test,test_pred_tuned)

(16397191.53880948, 0.8334030028754511)

### Inspect coefficients to rank features

In [78]:
# Fitted coefficients of the tuned model (the training features have 206 columns).
# NOTE(review): this displays the raw array without feature names; pairing it with
# X_train.columns and sorting would make the ranking readable.
final_model.coef_

array([ 3.31818624e+03, -2.06867801e+03, -1.46818249e+03,  2.78779592e+03,
        2.31312289e+02,  3.06234228e+03,  3.68548099e+03,  4.94388892e+03,
        4.56292285e+03,  1.45789082e+04,  3.51691421e+04,  2.27760533e+03,
       -5.70613054e+02, -3.69415513e+01,  1.81611935e+03,  3.58338465e+03,
        4.98239031e+03,  5.76504045e+03,  8.19172614e+03,  1.24986482e+04,
       -4.99978542e+03, -6.79714785e+03, -5.29101792e+03,  5.26478403e+03,
       -6.39478919e+03, -1.14624857e+03, -6.84012842e+03,  3.65789237e+03,
       -4.31117898e+03, -2.64702137e+03, -1.63295643e+03, -6.58484354e+03,
       -1.27297936e+02, -4.92288257e+03, -1.60295320e+03,  4.26146773e+03,
        1.23023450e+03, -3.48769993e+03, -9.00605048e+02,  3.12372145e+03,
        3.91703186e+03, -3.88384214e+03, -7.15328540e+03,  6.93244305e+03,
       -2.15702291e+01, -1.65685026e+03, -1.10967254e+03, -1.84212084e+03,
        3.31711160e+04, -1.47586051e+03,  1.69288904e+04, -2.32105287e+03,
       -5.86293767e+03, -

In [79]:
# Fitted intercept term of the tuned model.
final_model.intercept_

15020.955811657714