# Pre-processing after feat.eng.

In [20]:
import pandas as pd 

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import joblib

In [2]:
mode_imp = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()
encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')

In [3]:
numeric_transform = Pipeline([('impute-mode', mode_imp),
                                ('scaling', scaler)])
categoric_transform = Pipeline([('impute-mode', mode_imp),
                                          ('one-hot-encode', encoder)])

In [4]:
pre_processing = ColumnTransformer([('numeric_cols',numeric_transform,selector(dtype_include="float64")),
                                    ('categoric_cols',categoric_transform,selector(dtype_include="object"))]
                                   ,remainder='drop')

In [5]:
X_train = pd.read_csv('train-test-1/X_train.csv')
y_train = pd.read_csv('train-test-1/y_train.csv')
X_test = pd.read_csv('train-test-1/X_test.csv')
y_test = pd.read_csv('train-test-1/y_test.csv')

In [18]:
def compare_model(base_model,model2):
    y_pred1 = base_model.predict(X_test)
    y_pred2 = model2.predict(X_test)
    
    r2_model1 = r2_score(y_test,y_pred1)
    r2_model2 = r2_score(y_test,y_pred2)
    
    if r2_model1 > r2_model2:
        return print('Base model has better r2 score')
    else:
        return print('Improved model has better r2 score')

# Support Vector Regressor

### K Best added (v2 models)

In [24]:
svr_pipeline = Pipeline(steps=[('processing', pre_processing),
                               ('k_best',SelectKBest()),
                           ('svr',SVR())])

In [25]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svr__kernel':['rbf', 'sigmoid'],
              'svr__C':[1,1.5,2,2.2,2.5,3],
             'k_best__k':[4,5,6,7,8,9,10]
             }
svr_grid = GridSearchCV(svr_pipeline, param_grid=param_grid, cv=5)
svr_grid.fit(X_train, y_train)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **

CPU times: user 1min 21s, sys: 1.07 s, total: 1min 22s
Wall time: 1min 28s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f981ef03048>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [23]:
best_svr = svr_grid.best_estimator_

In [33]:
joblib.dump(best_svr, 'models-2/svr_v2.pkl')

['models-1/baseline_svr.pkl']

In [19]:
svr_base = joblib.load('models-1/baseline_svr.pkl')
svr_imprv = joblib.load('models-2/svr_v2.pkl')

compare_model(svr_base,svr_imprv)

Improved model has better r2 score


### PCA added (v3 models)

In [21]:
svr_pipeline = Pipeline(steps=[('processing', pre_processing),
                               ('PCA',PCA()),
                           ('svr',SVR())])

In [22]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svr__kernel':['rbf', 'sigmoid'],
              'svr__C':[1,1.5,2,2.2,2.5,3],
             'PCA__n_components':[4,5,6,7,8]
             }
svr_grid = GridSearchCV(svr_pipeline, param_grid=param_grid, cv=5)
svr_grid.fit(X_train, y_train)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **

CPU times: user 5min 13s, sys: 19.3 s, total: 5min 32s
Wall time: 5min 16s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fec9254ddd8>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [24]:
best_svr = svr_grid.best_estimator_
joblib.dump(best_svr, 'models-3/svr_v3.pkl')

['models-3/svr_v3.pkl']

In [25]:
# comparing baseline with PCA
svr_base = joblib.load('models-1/baseline_svr.pkl')
svr_imprv_pca = joblib.load('models-3/svr_v3.pkl')

compare_model(svr_base,svr_imprv)

Improved model has better r2 score


In [26]:
# comparing kbest with PCA
svr_imprv_kbest = joblib.load('models-2/svr_v2.pkl')

compare_model(svr_imprv_kbest,svr_imprv_pca)

Base model has better r2 score


PCA does worse than KBest

### PCA and K best combined (v4 models)

In [28]:
svr_pipeline = Pipeline(steps=[('processing', pre_processing),
                               ('k_best',SelectKBest()),
                               ('PCA',PCA()),
                               ('svr',SVR())])

In [29]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svr__kernel':['rbf', 'sigmoid'],
              'svr__C':[1,1.5,2,2.2,2.5,3],
             'PCA__n_components':[4,5,6,7,8],
              'k_best__k':[4,5,6,7,8,9,10]
             }
svr_grid = GridSearchCV(svr_pipeline, param_grid=param_grid, cv=5)
svr_grid.fit(X_train, y_train)

  return f(*args, **kwargs)
  f = msb / msw
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  f = msb / msw
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  f = msb / msw
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  f = msb / msw
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*

CPU times: user 23min 22s, sys: 57.5 s, total: 24min 20s
Wall time: 24min 26s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fec9254ddd8>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [35]:
best_svr = svr_grid.best_estimator_
joblib.dump(best_svr, 'models-4/svr_v4.pkl')

['models-4/svr_v4.pkl']

In [36]:
# comparing baseline with combined PCA and Kbest
svr_base = joblib.load('models-1/baseline_svr.pkl')

compare_model(svr_base,best_svr)

Improved model has better r2 score


In [37]:
# comparing kbest with combined PCA and Kbest
svr_imprv_kbest = joblib.load('models-2/svr_v2.pkl')

compare_model(svr_imprv_kbest,best_svr)

Improved model has better r2 score


In [38]:
# comparing PCA with combined PCA and Kbest
svr_imprv_pca = joblib.load('models-3/svr_v3.pkl')

compare_model(svr_imprv_pca,best_svr)

Improved model has better r2 score


# Random Forest Regressor

### K Best added (v2 models)

In [30]:
pipe_RandForest  = Pipeline(steps=[('processing', pre_processing),
                               ('k_best',SelectKBest()),
                               ('RForest',RandomForestRegressor())])

In [31]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[10,20,35,55,80,100],
             'RForest__min_samples_split':[4,6,8,10],
             'RForest__min_samples_leaf':[1,3,5,7,9],
              'k_best__k':[4,5,6,7,8,9,10]
             }
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_s

CPU times: user 26min 53s, sys: 38.5 s, total: 27min 31s
Wall time: 31min 17s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fec9254ddd8>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [32]:
best_rForest = rForest_grid.best_estimator_
joblib.dump(best_rForest, 'models-2/rForest_v2.pkl')

['models-2/rForest_v2.pkl']

In [33]:
rForest_base = joblib.load('models-1/baseline_rForest.pkl')
rForest_imprv = joblib.load('models-2/rForest_v2.pkl')

compare_model(rForest_base,rForest_imprv)

Improved model has better r2 score


In [40]:
y_pred = best_rForest.predict(X_test)
r2_score(y_test,y_pred)

0.44743014378851875

In [41]:
y_pred2 = rForest_base.predict(X_test)
r2_score(y_test,y_pred2)

0.31365686155465533

### PCA added (v3)

In [44]:
pipe_RandForest  = Pipeline(steps=[('processing', pre_processing),
                               ('PCA',PCA()),
                               ('RForest',RandomForestRegressor())])

In [45]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[10,20,35,55,80,100],
             'RForest__min_samples_split':[4,6,8,10],
             'RForest__min_samples_leaf':[1,3,5,7,9],
              'PCA__n_components':[4,5,6,7,8]
             }
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

CPU times: user 44min 24s, sys: 2min 50s, total: 47min 14s
Wall time: 46min 35s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fec9254ddd8>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [46]:
best_rForest = rForest_grid.best_estimator_
joblib.dump(best_rForest, 'models-3/rForest_v3.pkl')

['models-3/rForest_v3.pkl']

In [47]:
# comparing baseline with PCA
rForest_base = joblib.load('models-1/baseline_rForest.pkl')

compare_model(rForest_base,best_rForest)

Base model has better r2 score


In [48]:
# comparing kbest with PCA
rForest_imprv_kbest = joblib.load('models-2/rForest_v2.pkl')

compare_model(rForest_imprv_kbest,best_rForest)

Base model has better r2 score


### PCA and K best combined (v4)

In [49]:
pipe_RandForest  = Pipeline(steps=[('processing', pre_processing),
                                   ('k_best',SelectKBest()),
                               ('PCA',PCA()),
                               ('RForest',RandomForestRegressor())])

In [None]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[10,20,35,55,80,100],
             'RForest__min_samples_split':[4,6,8,10],
             'RForest__min_samples_leaf':[1,3,5,7,9],
              'PCA__n_components':[4,5,6,7,8],
              'k_best__k':[4,5,6,7,8,9,10]
             }
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  f = msb / msw
  self._final_estimator.fit(Xt, y, **fit_params_last_s

In [None]:
best_rForest = rForest_grid.best_estimator_
joblib.dump(best_rForest, 'models-4/rForest_v4.pkl')

In [None]:
# comparing baseline with combined
rForest_base = joblib.load('models-1/baseline_rForest.pkl')

compare_model(rForest_base,best_rForest)

In [None]:
# comparing kbest with combined
rForest_imprv_kbest = joblib.load('models-2/rForest_v2.pkl')

compare_model(rForest_imprv_kbest,best_rForest)

In [None]:
# comparing PCA with combined 
rForest_imprv_pca = joblib.load('models-3/rForest_v3.pkl')

compare_model(rForest_imprv_pca,best_rForest)

# XGBRegressor

In [260]:
pipe_GrBoost = Pipeline(steps=[('processing', pre_processing),
                           ('GrBoost',GradientBoostingRegressor(loss='lad',learning_rate=0.1,n_estimators=100,max_depth=6,min_samples_split=5))])

In [265]:
pipe_GrBoost.fit(X_train,y_train)

Pipeline(steps=[('processing',
                 ColumnTransformer(transformers=[('numeric_cols',
                                                  Pipeline(steps=[('impute-mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaling',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                 ('categoric_cols',
                                                  Pipeline(steps=[('impute-mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                  

In [278]:
y_pred = pipe_GrBoost.predict(X_test)

In [279]:
r2_score(y_test,y_pred)

0.4418261732614325

In [273]:
joblib.dump(pipe_GrBoost, 'models-1/baseline_GrBoost.pkl')

['models-1/baseline_GrBoost.pkl']

# Elastic net

In [284]:
pipe_eNet = Pipeline(steps=[('processing', pre_processing),
                           ('eNet',ElasticNet())])

In [285]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'eNet__alpha':[0.1,0.2,0.5,1],
             'eNet__l1_ratio':[0.1,0.5,0.7],
             'eNet__max_iter':[1000,2000]}
eNet_grid = GridSearchCV(pipe_eNet, param_grid=param_grid, cv=5)
eNet_grid.fit(X_train, y_train)

CPU times: user 17.4 s, sys: 3.27 s, total: 20.7 s
Wall time: 7.83 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [286]:
best_eNet = eNet_grid.best_estimator_
best_hyperparams_eNet = eNet_grid.best_params_

In [287]:
best_hyperparams_eNet

{'eNet__alpha': 0.1, 'eNet__l1_ratio': 0.1, 'eNet__max_iter': 1000}

In [288]:
y_pred = best_eNet.predict(X_test)
r2_score(y_test,y_pred)

0.3031358422738887

In [289]:
joblib.dump(best_eNet, 'models-1/baseline_eNet.pkl')

['models-1/baseline_eNet.pkl']