# Car Price Prediction Model Selection

## Imports

In [1983]:
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_selection import f_regression,mutual_info_regression,\
                                      SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin,clone
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso,SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score
from mltoolkit.preprocessing import MyScaler
from mltoolkit.utils import dump_model

## Classes

In [1968]:
class MyScaler(TransformerMixin,BaseEstimator):
    def __init__(self,scalertype:str=None,**kwargs):
        '''
        Optional scaling method in one class. Allowing user to pass argument
        to decide which scaling method to use, or to skip it.

        Parameters:
        ----------
        scalertype: str | {'std','minmax','skip'}, Default: None
        specifying the scaler to transform the data
        '''
        if scalertype: 
            if scalertype in ['std','minmax','skip']:
                self.scalertype = scalertype
            else:
                raise ValueError('Specify scaler correctly. Options:{"std","minmax","skip"}')
        else: self.scalertype = None
        self._kwargs = kwargs
    def fit(self,X,y=None,scalertype:str=None):
        '''
        scalertype: str | {'std','minmax','skip'}, Default: None
        specifying the scaler to transform the data
        '''
        if scalertype:
            if self.scalertype != scalertype:
                self.scalertype = scalertype
        if self.scalertype == 'std':
            self.scaler = StandardScaler(**self._kwargs)
        elif self.scalertype == 'minmax':
            self.scaler = MinMaxScaler(**self._kwargs)
        elif self.scalertype == 'skip':
            return self
        else:
            raise ValueError('Specify scaler correctly. Options:{"std","minmax","skip"}')
        return self
    def transform(self,X,scalertype:str=None):
        '''
        scalertype: str | {'std','minmax'}, Default: None
        specifying the scaler to transform the data
        '''
        if scalertype:
            if self.scalertype != scalertype:
                self.scalertype = scalertype
        if self.scalertype == 'std':
            self.scaler = StandardScaler(**self._kwargs)
        elif self.scalertype == 'minmax':
            self.scaler = MinMaxScaler(**self._kwargs)
        elif self.scalertype == 'skip':
            if type(X) == pd.DataFrame:
                return X.values
            elif type(X) == np.ndarray:
                return X
        else:
            raise ValueError('Specify scaler correctly. Options:{"std","minmax","skip"}')
        return self.scaler.fit_transform(X)

In [277]:
class CustomTransform(BaseEstimator,TransformerMixin):
    def __init__(self,out_df=False):
        self.door_dict = {"two":2,"four":4}
        self.cylinder_dict = {"four":4,"six":6,"five":5,"three":3,
                              "twelve":12,"two":2,"eight":8,}
        self.out_df = out_df
        return None
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        if type(X) is not pd.DataFrame:
            raise ValueError("Please input only DataFrame containing 'doornumber' and 'cylindernumber' columns")
        X = X.copy()
        X['doornumber'] = X['doornumber'].apply(lambda x: self.door_dict[x])
        X['cylindernumber'] = X['cylindernumber'].apply(lambda x: self.cylinder_dict[x])
        X['make'] = X['CarName'].str.split().apply(self.extract_namufacturer)
        X.drop('CarName',axis=1,inplace=True)
        
        if self.out_df: return X
        else: return X.values
    def extract_namufacturer(self,carname_strs):
        typo_dict = {
            "maxda":"mazda",
            "porcshce":"porsche",
            "toyouta":"toyota",
            "vokswagen":"volkswagen",
            "vw":"volkswagen",
        }

        if carname_strs[0] in typo_dict.keys():
            return typo_dict[carname_strs[0]].lower()
        else:
            return carname_strs[0].lower()

In [286]:
class TrainedTransformer(BaseEstimator,TransformerMixin):
    def __init__(self,estimator,**kwargs):
        self.estimator = estimator
        self.kwargs = kwargs
        if kwargs is not None: self.estimator.set_params(**self.kwargs)
        return None
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return self.estimator.transform(X)

## Functions

## Loading Dataset

In [2]:
rawdata = pd.read_csv("Datasets\\Raw Data\\CarPrice_Assignment.csv",index_col=0)

In [5]:
X_raw = rawdata.loc[:,rawdata.columns != 'price']
y_raw = rawdata.loc[:,'price']

In [6]:
X_raw.shape,y_raw.shape

((205, 24), (205,))

## Splitting Train and Test Datasets

In [114]:
X_train,X_test,y_train,y_test = train_test_split(X_raw,y_raw,test_size=0.2,random_state=19,stratify=pd.cut(y_raw,[5000,7000,9000,11000,13000,15000,17000,19000,21000,26000,31000,36000,50000]))

In [84]:
savingpath = "Datasets\\"
joblib.dump(X_train,savingpath+"X_train.pkl",compress=9)
joblib.dump(X_test,savingpath+"X_test.pkl",compress=9)
joblib.dump(y_train,savingpath+"y_train.pkl",compress=9)
joblib.dump(y_test,savingpath+"y_test.pkl",compress=9)

['Datasets\\y_test.pkl']

## Pipeline

In [115]:
cus_trans = CustomTransform(out_df=True)

In [116]:
X_play = cus_trans.fit_transform(X_train)
X_play.head()

Unnamed: 0_level_0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,make
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
189,2,gas,std,4,sedan,fwd,front,97.3,171.7,65.5,...,109,mpfi,3.19,3.4,10.0,100,5500,26,32,volkswagen
92,1,gas,std,2,sedan,fwd,front,94.5,165.3,63.8,...,97,2bbl,3.15,3.29,9.4,69,5200,31,37,nissan
192,0,gas,std,4,sedan,fwd,front,100.4,180.2,66.9,...,136,mpfi,3.19,3.4,8.5,110,5500,19,24,volkswagen
1,3,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,alfa-romero
45,1,gas,std,2,sedan,fwd,front,94.5,155.9,63.6,...,90,2bbl,3.03,3.11,9.6,70,5400,38,43,isuzu


In [118]:
num_cols = X_play.columns[X_play.dtypes.apply(lambda x: True if x == np.int64 or x == np.float64 else False)].values
cat_cols = X_play.columns[~X_play.dtypes.apply(lambda x: True if x == np.int64 or x == np.float64 else False)].values

In [119]:
joblib.dump(num_cols,savingpath+"num_cols.pkl",compress=9)
joblib.dump(cat_cols,savingpath+"cat_cols.pkl",compress=9)

['Datasets\\cat_cols.pkl']

In [293]:
make_categories = joblib.load("System\\make_categories")

In [309]:
col_trans = ColumnTransformer([
    ("onehot",OneHotEncoder(),cat_cols),
],remainder='passthrough')

Training col_trans with raw data so that the one-hot encoder is exposed to every possible categories

In [575]:
col_trans.fit(CustomTransform(out_df=True).fit_transform(X_raw.copy()))

In [319]:
joblib.dump(col_trans,"Trained Models\\Transformers\\col_trans.pkl")

['Trained Models\\Transformers\\col_trans.pkl']

In [725]:
pipe_train = Pipeline([
    ("custrans",CustomTransform(out_df=True)),
    ("col_trans",TrainedTransformer(col_trans)),
    ("fea_sel",SelectPercentile(f_regression,percentile=100)),
    ("std_scale",MyScaler('std',with_mean=False))
])

In [727]:
X_train_trans = pipe_train.fit_transform(X_train.copy(),y_train.copy())
X_train_trans

array([[ 0.        ,  3.46900627,  2.55409579, ..., 11.53382903,
         4.03884412,  4.69054043],
       [ 0.        ,  3.46900627,  2.55409579, ..., 10.90471108,
         4.81554492,  5.42343737],
       [ 0.        ,  3.46900627,  2.55409579, ..., 11.53382903,
         2.95146301,  3.51790532],
       ...,
       [ 0.        ,  3.46900627,  2.55409579, ..., 11.32412304,
         2.48544254,  3.22474654],
       [ 0.        ,  3.46900627,  2.55409579, ..., 12.58235894,
         2.48544254,  3.37132593],
       [ 0.        ,  3.46900627,  2.55409579, ..., 12.05809398,
         2.6407827 ,  4.10422287]])

In [329]:
pd.DataFrame(X_train_trans).to_csv('X_train_trans.csv')

In [733]:
X_test_trans = pipe_train.transform(X_test.copy())
X_test_trans

array([[ 3.05595957,  0.        ,  0.        , ...,  8.81032831,
         4.08145692,  4.68927471],
       [ 0.        ,  3.05595957,  2.82926879, ...,  9.55336804,
         2.04072846,  2.27358774],
       [ 0.        ,  3.05595957,  2.82926879, ..., 11.03944751,
         4.51875587,  5.25767164],
       ...,
       [ 0.        ,  3.05595957,  2.82926879, ..., 11.67633872,
         3.49839164,  4.12087777],
       [ 0.        ,  3.05595957,  2.82926879, ..., 12.73782406,
         4.37298955,  4.83137394],
       [ 3.05595957,  0.        ,  2.82926879, ...,  9.87181364,
         5.24758747,  5.96816781]])

In [328]:
pd.DataFrame(X_test_trans).to_csv('X_test_trans.csv')

## Models Testing

Due to the lack of samples (only 205 samples in total), so I decided not to conduct cross-validation.\
We will directly evaluate the model's performance with the test dataset.

### Linear Regression

In [330]:
lin = LinearRegression()

In [331]:
lin.fit(X_train_trans,y_train.copy())

In [332]:
lin_train_yhat = lin.predict(X_train_trans)

In [333]:
lin_train_r2 = r2_score(y_train,lin_train_yhat)
lin_train_r2

0.9638392532431698

In [334]:
lin_train_rmse = mean_squared_error(y_train,lin_train_yhat,squared=False)
lin_train_rmse

1509.0132559906228

In [335]:
lin_test_yhat = lin.predict(X_test_trans)

In [336]:
lin_test_r2 = r2_score(y_test,lin_test_yhat)
lin_test_r2

-8.767796968919184e+24

In [337]:
lin_test_rmse = mean_squared_error(y_test,lin_test_yhat,squared=False)
lin_test_rmse

2.3993540366363348e+16

Linear Regression is definitely overfitting for training dataset.

#### Saving the model

In [1984]:
lin_yhats = {
    "lin_train_yhat":lin_train_yhat,
    "lin_test_yhat":lin_test_yhat,
}

In [1985]:
lin_scores = {
    "lin_train_r2":lin_train_r2,
    "lin_train_rmse":lin_train_rmse,
    "lin_test_r2":lin_test_r2,
    "lin_test_rmse":lin_test_rmse,
}

In [2000]:
dump_model(lin,yhat=lin_yhats,scores=lin_scores)

### Ridge Regression

In [381]:
ridge = Ridge(alpha=150)
ridge.fit(X_train_trans,y_train.copy())

In [382]:
ridge_train_yhat = ridge.predict(X_train_trans)

In [383]:
ridge_train_r2 = r2_score(y_train,ridge_train_yhat)
ridge_train_r2

0.9131826268493264

In [384]:
ridge_train_rmse = mean_squared_error(y_train,ridge_train_yhat,squared=False)
ridge_train_rmse

2338.1785937883624

In [385]:
ridge_test_yhat = ridge.predict(X_test_trans)

In [386]:
ridge_test_r2 = r2_score(y_test,ridge_test_yhat)
ridge_test_r2

0.8634984337328037

In [387]:
ridge_test_rmse = mean_squared_error(y_test,ridge_test_yhat,squared=False)
ridge_test_rmse

2993.7664014909474

Ridge Regression did so much better than Linear Regression due to the regularization property that it carries.

In [395]:
ridge.coef_

array([ 121.31777597, -121.31777597,  -59.28297181,   59.28297181,
        348.81096502,  190.53212531, -204.97145819,  103.73123827,
       -174.14764309,  -65.07637452, -315.3006517 ,  352.74505356,
       -475.64144254,  475.64144254,  -58.9709092 ,   63.0630709 ,
        -92.55198188,  -41.16621205,    9.50036994,  119.102269  ,
        152.0739118 ,  -36.99765067, -174.51550733,   43.23024563,
        121.31777597,  -59.01262885,  176.59013864, -129.73439843,
        -73.89565512,  -37.45578727,  156.0899001 ,  808.18335859,
        706.70785324,  148.16736074, -117.8141489 ,  -23.42691837,
         -3.68700906,  429.70990381,  -17.67771986,    0.        ,
       -160.69232783, -209.20333493, -208.03017733, -131.14443675,
        493.96358525, -127.64344761,  145.33513021, -255.64858819,
       -421.28503703, -205.39784743,  -89.3317325 ,   54.42867566,
        -69.56133514,  348.57944796,  379.77848156,  663.71199692,
          8.03410276,  676.38019845,  827.10685359,  892.35344

Most of the coefficients of the trained Ridge Regression model are capped to be a very low value.

#### Saving the model

In [1987]:
ridge_yhats = {
    "ridge_train_yhat":ridge_train_yhat,
    "ridge_test_yhat":ridge_test_yhat,
}

In [1988]:
ridge_scores = {
    "ridge_train_r2":ridge_train_r2,
    "ridge_train_rmse":ridge_train_rmse,
    "ridge_test_r2":ridge_test_r2,
    "ridge_test_rmse":ridge_test_rmse,
}

In [1989]:
dump_model(ridge,yhat=ridge_yhats,scores=ridge_scores)

### Lasso Regression

In [438]:
lasso = Lasso(alpha=1100)
lasso.fit(X_train_trans,y_train.copy())

In [439]:
lasso_train_yhat = lasso.predict(X_train_trans)

In [440]:
lasso_train_r2 = r2_score(y_train,lasso_train_yhat)
lasso_train_r2

0.8432555833897956

In [441]:
lasso_train_rmse = mean_squared_error(y_train,lasso_train_yhat,squared=False)
lasso_train_rmse

3141.741143579494

In [442]:
lasso_test_yhat = lasso.predict(X_test_trans)

In [443]:
lasso_test_r2 = r2_score(y_test,lasso_test_yhat)
lasso_test_r2

0.8451198430210743

In [444]:
lasso_test_rmse = mean_squared_error(y_test,lasso_test_yhat,squared=False)
lasso_test_rmse

3188.944660899877

In [445]:
lasso.coef_

array([   0.        ,   -0.        ,   -0.        ,    0.        ,
          0.        ,    0.        ,   -0.        ,    0.        ,
         -0.        ,   -0.        ,   -0.        ,   81.37938962,
       -546.04503837,    0.        ,    0.        ,    0.        ,
          0.        ,   -0.        ,    0.        ,    0.        ,
          0.        ,   -0.        ,   -0.        ,    0.        ,
          0.        ,   -0.        ,    0.        ,   -0.        ,
         -0.        ,    0.        ,    0.        ,  574.67809688,
        362.37029421,    0.        ,   -0.        ,   -0.        ,
         -0.        ,    0.        ,    0.        ,    0.        ,
         -0.        ,   -0.        ,   -0.        ,   -0.        ,
          0.        ,   -0.        ,    0.        ,   -0.        ,
         -0.        ,   -0.        ,    0.        ,    0.        ,
         -0.        ,    0.        ,    0.        ,  635.92127137,
          0.        , 1120.16150142,    0.        , 2794.09193

Lasso Regression restricted vast majority of the coefficient of the features to zero.\
We can gain quite a good insights on the importance of the features from this.

In [453]:
lasso.coef_ >0

array([False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False,  True, False, False, False,
        True, False, False, False])

In [455]:
col_trans.get_feature_names_out()[lasso.coef_ >0]

array(['onehot__drivewheel_rwd', 'onehot__make_bmw', 'onehot__make_buick',
       'remainder__carwidth', 'remainder__curbweight',
       'remainder__enginesize', 'remainder__horsepower'], dtype=object)

We can see that only the following features actually contribute most to the car price prediction:
1. drivewheel (rwd)
2. make (bmw)
3. make (buick)
4. carwidth
5. curbweight
6. enginesize
7. horsepower

But after a few round of hyperparameter testing, Lasso Regression seems to be performing slightly worse than Ridge Regression.\
This could mean that some of the features zeroed by the Lasso Regression's regularization could still be somewhat useful.

#### Saving the model

In [1990]:
lasso_yhats = {
    "lasso_train_yhat":lasso_train_yhat,
    "lasso_test_yhat":lasso_test_yhat,
}

In [1991]:
lasso_scores = {
    "lasso_train_r2":lasso_train_r2,
    "lasso_train_rmse":lasso_train_rmse,
    "lasso_test_r2":lasso_test_r2,
    "lasso_test_rmse":lasso_test_rmse,
}

In [1993]:
dump_model(lasso,yhat=lasso_yhats,scores=lasso_scores)

### Ridge + Lasso

Let's see how much is the difference between the yhat of Ridge and Lasso Regression

In [456]:
ridge_test_yhat - lasso_test_yhat

array([ -620.10510541, -1462.05887068, -1690.50659712, -2401.80219289,
       -2983.13410664, -3110.07093617, -2539.90727933, -1529.70063018,
         -27.20881983, -1424.26607451, -1005.61172228,   211.1256399 ,
        -441.01417031, -3077.81410346, -3190.83051641, -1305.94108473,
       -2897.15475465, -2997.60870744,   964.27353698, -3490.58985892,
       -3025.02656983,  -576.78505545, -2374.84110729,   -56.63212237,
         -76.82996797, -1641.67308625, -1584.2186208 ,   248.91854803,
       -2606.33669046, -2810.5395972 ,   567.05887828,  -984.81372542,
       -2414.91110656,  -847.24226852, -1760.43339843,  -871.79491434,
       -1023.89005653, -2131.56067411, -2397.63213396, -1773.03559478,
        -673.54579052])

The difference is quite noticable, perhaps we can do a combination of the two models and see if it is possible to provide even more accurate results.

In [493]:
ridgelasso_test_yhat = ridge_test_yhat*0.65 + lasso_test_yhat*0.35

In [494]:
ridgelasso_test_r2 = r2_score(y_test,ridgelasso_test_yhat)
ridgelasso_test_r2

0.8701962679137263

In [495]:
ridgelasso_test_rmse = mean_squared_error(y_test,ridgelasso_test_yhat,squared=False)
ridgelasso_test_rmse

2919.3938071612893

Well after applying different weights between the two models, it actually improves by a bit, not very noticeable.

#### Saving the model

In [1994]:
ridgelasso_yhats = {
    "ridgelasso_test_yhat":ridgelasso_test_yhat,
}

In [1995]:
ridgelasso_scores = {
    "ridgelasso_test_r2":ridgelasso_test_r2,
    "ridgelasso_test_rmse":ridgelasso_test_rmse,
}

In [2012]:
joblib.dump(ridgelasso_yhats,"Trained Models\\ridgelasso_yhat.pkl")
joblib.dump(ridgelasso_scores,"Trained Models\\ridgelasso_scores.pkl")

['Trained Models\\ridgelasso_scores.pkl']

### Features Testing

In [584]:
col_trans.get_feature_names_out().shape

(67,)

In [717]:
pipe_train2 = Pipeline([
    ("custrans",CustomTransform(out_df=True)),
    ("col_trans",TrainedTransformer(col_trans)),
    ("fea_sel",SelectPercentile(f_regression,percentile=30)),
    ("std_scale",MyScaler('std',with_mean=False)),
    ("ridge",Lasso(alpha=1100))
])

In [718]:
fea = pipe_train2.fit(X_train.copy(),y_train.copy())

In [719]:
fea_train_yhat = fea.predict(X_train)

In [720]:
fea_train_r2 = r2_score(y_train,fea_train_yhat)
fea_train_r2

0.8432544815873506

In [721]:
fea_train_rmse = mean_squared_error(y_train,fea_train_yhat,squared=False)
fea_train_rmse

3141.7521856697485

In [722]:
fea_test_yhat = fea.predict(X_test)

In [723]:
fea_test_r2 = r2_score(y_test,fea_test_yhat)
fea_test_r2

0.8451207629750277

In [724]:
fea_test_rmse = mean_squared_error(y_test,fea_test_yhat,squared=False)
fea_test_rmse

3188.9351900720403

In [702]:
f = pipe_train2['fea_sel']

In [708]:
pipe_train2['col_trans'].estimator.get_feature_names_out()[f.get_support()]

array(['onehot__drivewheel_fwd', 'onehot__drivewheel_rwd',
       'onehot__enginelocation_front', 'onehot__enginelocation_rear',
       'onehot__enginetype_ohc', 'onehot__fuelsystem_2bbl',
       'onehot__fuelsystem_mpfi', 'onehot__make_bmw',
       'onehot__make_buick', 'onehot__make_porsche',
       'remainder__wheelbase', 'remainder__carlength',
       'remainder__carwidth', 'remainder__curbweight',
       'remainder__cylindernumber', 'remainder__enginesize',
       'remainder__boreratio', 'remainder__horsepower',
       'remainder__citympg', 'remainder__highwaympg'], dtype=object)

With Lasso Regression, we need to keep only 30 percentile of the features evaluated by f_regression method, while maintaining the exact same accuracy.\
But in the case of computational resources is not a limitation, we should just go for a Ridge Regression with full features (Ridge Regression with feature selection does not show satisfied results)

### Polynomial Regression

In [734]:
poly = PolynomialFeatures(degree=2)

In [740]:
pipe_train.set_params(fea_sel__percentile=30)

In [741]:
X_train_trans2 = pipe_train.transform(X_train.copy())
X_train_trans2

array([[2.01824181, 0.        , 7.46225839, ..., 2.62308291, 4.03884412,
        4.69054043],
       [2.01824181, 0.        , 7.46225839, ..., 1.80992721, 4.81554492,
        5.42343737],
       [2.01824181, 0.        , 7.46225839, ..., 2.8853912 , 2.95146301,
        3.51790532],
       ...,
       [0.        , 2.06228136, 7.46225839, ..., 4.7740109 , 2.48544254,
        3.22474654],
       [0.        , 2.06228136, 7.46225839, ..., 3.54116193, 2.48544254,
        3.37132593],
       [0.        , 2.06228136, 7.46225839, ..., 7.55447879, 2.6407827 ,
        4.10422287]])

In [748]:
X_train_trans2_poly = poly.fit_transform(X_train_trans2)

In [758]:
X_test_trans2_poly = poly.transform(pipe_train.transform(X_test.copy()))

#### Linear

In [752]:
poly_lin = LinearRegression(fit_intercept=False,n_jobs=4).fit(X_train_trans2_poly,y_train)

In [753]:
poly_lin_train_yhat = poly_lin.predict(X_train_trans2_poly)

In [755]:
poly_lin_train_r2 = r2_score(y_train,poly_lin_train_yhat)
poly_lin_train_r2

0.9974657854591514

In [756]:
poly_lin_train_rmse = mean_squared_error(y_train,poly_lin_train_yhat,squared=False)
poly_lin_train_rmse

399.4809268720823

In [759]:
poly_lin_test_yhat = poly_lin.predict(X_test_trans2_poly)

In [760]:
poly_lin_test_r2 = r2_score(y_test,poly_lin_test_yhat)
poly_lin_test_r2

-14129498607.229387

In [761]:
poly_lin_test_rmse = mean_squared_error(y_test,poly_lin_test_yhat,squared=False)
poly_lin_test_rmse

963191116.0404202

Linear Regression model is still seriously overfit (even more than before polynomial transforming)

##### Saving the model

In [1997]:
poly_lin_yhats = {
    "poly_lin_train_yhat":poly_lin_train_yhat,
    "poly_lin_test_yhat":poly_lin_test_yhat,
}

In [1998]:
poly_lin_scores = {
    "poly_lin_train_r2":poly_lin_train_r2,
    "poly_lin_train_rmse":poly_lin_train_rmse,
    "poly_lin_test_r2":poly_lin_test_r2,
    "poly_lin_test_rmse":poly_lin_test_rmse,
}

In [2001]:
dump_model(poly_lin,filename="poly_lin",yhat=poly_lin_yhats,scores=poly_lin_scores)

#### Ridge

In [860]:
poly_ridge = Ridge(alpha=150000,fit_intercept=False).fit(X_train_trans2_poly,y_train)

In [861]:
poly_ridge_train_yhat = poly_ridge.predict(X_train_trans2_poly)

In [862]:
poly_ridge_train_r2 = r2_score(y_train,poly_ridge_train_yhat)
poly_ridge_train_r2

0.9063886123151322

In [863]:
poly_ridge_train_rmse = mean_squared_error(y_train,poly_ridge_train_yhat,squared=False)
poly_ridge_train_rmse

2427.9441990702444

In [864]:
poly_ridge_test_yhat = poly_ridge.predict(X_test_trans2_poly)

In [865]:
poly_ridge_test_r2 = r2_score(y_test,poly_ridge_test_yhat)
poly_ridge_test_r2

0.8807875829936561

In [866]:
poly_ridge_test_rmse = mean_squared_error(y_test,poly_ridge_test_yhat,squared=False)
poly_ridge_test_rmse

2797.7560231775697

With Polynomial degree = 2, Ridge Regression is able to get even higher performance than that without polynomial feature transform

##### Saving the model

In [2002]:
poly_ridge_yhats = {
    "poly_ridge_train_yhat":poly_ridge_train_yhat,
    "poly_ridge_test_yhat":poly_ridge_test_yhat,
}

In [2003]:
poly_ridge_scores = {
    "poly_ridge_train_r2":poly_ridge_train_r2,
    "poly_ridge_train_rmse":poly_ridge_train_rmse,
    "poly_ridge_test_r2":poly_ridge_test_r2,
    "poly_ridge_test_rmse":poly_ridge_test_rmse,
}

In [2004]:
dump_model(poly_ridge,filename="poly_ridge",yhat=poly_ridge_yhats,scores=poly_ridge_scores)

#### Lasso

In [1081]:
poly_lasso = Lasso(alpha=1000,max_iter=1000,selection='cyclic',fit_intercept=False).fit(X_train_trans2_poly,y_train)

  model = cd_fast.enet_coordinate_descent(


In [1082]:
poly_lasso_train_yhat = poly_lasso.predict(X_train_trans2_poly)

In [1083]:
poly_lasso_train_r2 = r2_score(y_train,poly_lasso_train_yhat)
poly_lasso_train_r2

0.9379341170596723

In [1084]:
poly_lasso_train_rmse = mean_squared_error(y_train,poly_lasso_train_yhat,squared=False)
poly_lasso_train_rmse

1976.9734120095347

In [1085]:
poly_lasso_test_yhat = poly_lasso.predict(X_test_trans2_poly)

In [1086]:
poly_lasso_test_r2 = r2_score(y_test,poly_lasso_test_yhat)
poly_lasso_test_r2

0.8899874232995395

In [1087]:
poly_lasso_test_rmse = mean_squared_error(y_test,poly_lasso_test_yhat,squared=False)
poly_lasso_test_rmse

2687.6348331401823

Although the lasso regression failed to converge to the optimum coefficients, but it still performs better than Ridge Regression under the same condition.

##### Saving the model

In [2005]:
poly_lasso_yhats = {
    "poly_lasso_train_yhat":poly_lasso_train_yhat,
    "poly_lasso_test_yhat":poly_lasso_test_yhat,
}

In [2006]:
poly_lasso_scores = {
    "poly_lasso_train_r2":poly_lasso_train_r2,
    "poly_lasso_train_rmse":poly_lasso_train_rmse,
    "poly_lasso_test_r2":poly_lasso_test_r2,
    "poly_lasso_test_rmse":poly_lasso_test_rmse,
}

In [2007]:
dump_model(poly_lasso,filename="poly_lasso",yhat=poly_lasso_yhats,scores=poly_lasso_scores)

#### Ridge + Lasso

In [1964]:
poly_ridgelasso_test_yhat = poly_ridge_test_yhat*0.2 + poly_lasso_test_yhat*0.8

In [1965]:
poly_ridgelasso_test_r2 = r2_score(y_test,poly_ridgelasso_test_yhat)
poly_ridgelasso_test_r2

0.8903694359586657

In [1966]:
poly_ridgelasso_test_rmse = mean_squared_error(y_test,poly_ridgelasso_test_yhat,squared=False)
poly_ridgelasso_test_rmse

2682.9644426716727

Well after applying different weights between the two models, it actually improves by a tiny bit, not very noticeable.

#### Saving the model

In [2008]:
poly_ridgelasso_yhats = {
    "poly_ridgelasso_test_yhat":poly_ridgelasso_test_yhat,
}

In [2009]:
poly_ridgelasso_scores = {
    "poly_ridgelasso_test_r2":poly_ridgelasso_test_r2,
    "poly_ridgelasso_test_rmse":poly_ridgelasso_test_rmse,
}

In [2011]:
joblib.dump(poly_ridgelasso_yhats,"Trained Models\\poly_ridgelasso_yhats.pkl")
joblib.dump(poly_ridgelasso_scores,"Trained Models\\poly_ridgelasso_scores.pkl")

['Trained Models\\poly_ridgelasso_scores.pkl']

### SVR

#### SVR Standard

In [1236]:
svr = SVR(C=100000).fit(X_train_trans,y_train)

In [1237]:
svr_train_yhat = svr.predict(X_train_trans)

In [1238]:
svr_train_r2 = r2_score(y_train,svr_train_yhat)
svr_train_r2

0.9388572005526047

In [1239]:
svr_train_rmse = mean_squared_error(y_train,svr_train_yhat,squared=False)
svr_train_rmse

1962.2169332464657

In [1240]:
svr_test_yhat = svr.predict(X_test_trans)

In [1241]:
svr_test_r2 = r2_score(y_test,svr_test_yhat)
svr_test_r2

0.8857250097641215

In [1242]:
svr_test_rmse = mean_squared_error(y_test,svr_test_yhat,squared=False)
svr_test_rmse

2739.2059677342954

#### SVR Poly

In [1621]:
svr_poly = SVR(C=77500,kernel='poly',degree=2).fit(X_train_trans,y_train)

In [1622]:
svr_poly_train_yhat = svr_poly.predict(X_train_trans)

In [1623]:
svr_poly_train_r2 = r2_score(y_train,svr_poly_train_yhat)
svr_poly_train_r2

0.9404184645679474

In [1624]:
svr_poly_train_rmse = mean_squared_error(y_train,svr_poly_train_yhat,squared=False)
svr_poly_train_rmse

1937.0026062307147

In [1625]:
svr_poly_test_yhat = svr_poly.predict(X_test_trans)

In [1626]:
svr_poly_test_r2 = r2_score(y_test,svr_poly_test_yhat)
svr_poly_test_r2

0.8769174052421456

In [1627]:
svr_poly_test_rmse = mean_squared_error(y_test,svr_poly_test_yhat,squared=False)
svr_poly_test_rmse

2842.807250010135

No much point going for more complex model like SVR

### Random Forest Regression

In [1835]:
rforest = RandomForestRegressor(n_estimators=100,n_jobs=4).fit(X_train_trans,y_train)

In [1836]:
rforest_train_yhat = rforest.predict(X_train_trans)

In [1837]:
rforest_train_r2 = r2_score(y_train,rforest_train_yhat)
rforest_train_r2

0.9861582593759282

In [1838]:
rforest_train_rmse = mean_squared_error(y_train,rforest_train_yhat,squared=False)
rforest_train_rmse

933.6190679932748

In [1839]:
rforest_test_yhat = rforest.predict(X_test_trans)

In [1840]:
rforest_test_r2 = r2_score(y_test,rforest_test_yhat)
rforest_test_r2

0.7772391791422922

In [1841]:
rforest_test_rmse = mean_squared_error(y_test,rforest_test_yhat,squared=False)
rforest_test_rmse

3824.445685853927

I have retried for a lot of times, it seems like the current number of samples is not enough to regularize well the Random Forest Regressor model

#### Saving the model

In [2013]:
rforest_yhats = {
    "rforest_train_yhat":rforest_train_yhat,
    "rforest_test_yhat":rforest_test_yhat,
}

In [2014]:
rforest_scores = {
    "rforest_train_r2":rforest_train_r2,
    "rforest_train_rmse":rforest_train_rmse,
    "rforest_test_r2":rforest_test_r2,
    "rforest_test_rmse":rforest_test_rmse,
}

In [2015]:
dump_model(rforest,yhat=rforest_yhats,scores=rforest_scores)

### K-Nearest Neighbors Regression

In [1927]:
knn = KNeighborsRegressor(n_neighbors=2,n_jobs=4).fit(X_train_trans,y_train)

In [1928]:
knn_train_yhat = knn.predict(X_train_trans)

In [1929]:
knn_train_r2 = r2_score(y_train,knn_train_yhat)
knn_train_r2

0.9607282643336948

In [1930]:
knn_train_rmse = mean_squared_error(y_train,knn_train_yhat,squared=False)
knn_train_rmse

1572.5860091904863

In [1931]:
knn_test_yhat = knn.predict(X_test_trans)

In [1932]:
knn_test_r2 = r2_score(y_test,knn_test_yhat)
knn_test_r2

0.8141148727609268

In [1933]:
knn_test_rmse = mean_squared_error(y_test,knn_test_yhat,squared=False)
knn_test_rmse

3493.585804641163

It doesn't seem like K-Nearest Neighbors model works well too

#### Saving the model

In [2016]:
knn_yhats = {
    "rforest_train_yhat":knn_train_yhat,
    "rforest_test_yhat":knn_test_yhat,
}

In [2017]:
knn_scores = {
    "knn_train_r2":knn_train_r2,
    "knn_train_rmse":knn_train_rmse,
    "knn_test_r2":knn_test_r2,
    "knn_test_rmse":knn_test_rmse,
}

In [2018]:
dump_model(knn,yhat=knn_yhats,scores=knn_scores)

## Pipeline Wrap Up

In [1977]:
pipe_final = Pipeline([
    ("custrans",CustomTransform(out_df=True)),
    ("col_trans",TrainedTransformer(col_trans)),
    ("fea_sel",SelectPercentile(f_regression,percentile=30)),
    ("std_scale",MyScaler('std',with_mean=False)),
    ("poly",PolynomialFeatures(degree=2)),
])

In [1978]:
X_train_trans_final = pipe_final.fit_transform(X_train,y_train)
X_train_trans_final,X_train_trans_final.shape

(array([[ 1.        ,  2.01824181,  0.        , ..., 16.31226185,
         18.94436164, 22.0011695 ],
        [ 1.        ,  2.01824181,  0.        , ..., 23.18947284,
         26.11680625, 29.41367289],
        [ 1.        ,  2.01824181,  0.        , ...,  8.71113392,
         10.38296744, 12.37565784],
        ...,
        [ 1.        ,  0.        ,  2.06228136, ...,  6.17742461,
          8.01492223, 10.39899027],
        [ 1.        ,  0.        ,  2.06228136, ...,  6.17742461,
          8.37923688, 11.36583854],
        [ 1.        ,  0.        ,  2.06228136, ...,  6.97373325,
         10.83836075, 16.8446454 ]]),
 (164, 231))

In [1981]:
X_test_trans_final = pipe_final.transform(X_test)
X_test_trans_final,X_test_trans_final.shape

(array([[ 1.        ,  0.        ,  2.10881245, ..., 16.65829057,
         19.1390727 , 21.9892973 ],
        [ 1.        ,  0.        ,  2.10881245, ...,  4.16457264,
          4.6397752 ,  5.1692012 ],
        [ 1.        ,  2.10881245,  0.        , ..., 20.41915464,
         23.75813462, 27.64311111],
        ...,
        [ 1.        ,  2.10881245,  0.        , ..., 12.23874409,
         14.41644437, 16.98163363],
        [ 1.        ,  2.10881245,  0.        , ..., 19.12303764,
         21.12754778, 23.34217417],
        [ 1.        ,  2.10881245,  0.        , ..., 27.5371742 ,
         31.3184826 , 35.61902703]]),
 (41, 231))

In [1982]:
pd.DataFrame(X_train_trans_final).to_csv("X_train_trans_final.csv")
pd.DataFrame(X_test_trans_final).to_csv("X_test_trans_final.csv")

In [1980]:
joblib.dump(pipe_final,"Trained Models\\pipe_final.pkl")

['Trained Models\\pipe_final.pkl']

## Conclusion

After testing for several kinds of regression models, the best model appears to be a Ridge(0.2) + Lasso(0.8) mixed regression model after polynomial transforming the X dataset.\
The failure in the more complex model could possibly suggests that there is insufficient data to even make the test RMSE reach close to that of the train set.\
There are still a lot of options that we can trial and error for better performance, such as conduct each model after a feature selection.\
But there are permutations of feature combinations, and it's just repetitive work and is considered out of the scope of this exercise.\
So I have decided to stop right here.

**Scores (Test)**:\
R2: 0.8904\
RMSE: 2682.9644