# Pre-processing after feat.eng.

In [281]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import joblib

In [140]:
mode_imp = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()
encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')

In [141]:
numeric_transform = Pipeline([('impute-mode', mode_imp),
                                ('scaling', scaler)])
categoric_transform = Pipeline([('impute-mode', mode_imp),
                                          ('one-hot-encode', encoder)])

In [142]:
pre_processing = ColumnTransformer([('numeric_cols',numeric_transform,selector(dtype_include="float64")),
                                    ('categoric_cols',categoric_transform,selector(dtype_include="object"))]
                                   ,remainder='drop')

In [143]:
y = final_df['seq']
X = final_df.drop(['seq'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [270]:
X_train.to_csv('train-test-1/X_train.csv',index=False)
y_train.to_csv('train-test-1/y_train.csv',index=False)
X_test.to_csv('train-test-1/X_test.csv',index=False)
y_test.to_csv('train-test-1/y_test.csv',index=False)

# Support Vector Regressor

In [217]:
svr_pipeline = Pipeline(steps=[('processing', pre_processing),
                           ('svr',SVR())])

In [218]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'svr__kernel':['rbf', 'sigmoid'],
              'svr__C':[1,1.5,2,2.2,2.5,3]}
svr_grid = GridSearchCV(svr_pipeline, param_grid=param_grid, cv=5)
svr_grid.fit(X_train, y_train)

CPU times: user 1min 15s, sys: 491 ms, total: 1min 15s
Wall time: 1min 16s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [219]:
best_svr = svr_grid.best_estimator_
best_hyperparams_svr = svr_grid.best_params_

In [220]:
best_hyperparams_svr

{'svr__C': 3, 'svr__kernel': 'rbf'}

In [221]:
y_pred = svr_grid.predict(X_test)

In [222]:
r2_score(y_test,y_pred)

0.31368271393013125

In [223]:
mean_absolute_error(y_test,y_pred)

3.117398537611225

In [224]:
mean_squared_error(y_test,y_pred)

48.52796766292824

In [226]:
joblib.dump(best_svr, 'models-1/baseline_svr.pkl')

['models-1/baseline_svr.pkl']

# Random Forest Regressor

In [198]:
pipe_RandForest = Pipeline(steps=[('processing', pre_processing),
                           ('RForest',RandomForestRegressor())])

In [205]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[10,20,35,55,80,100],
             'RForest__min_samples_split':[4,6,8,10],
             'RForest__min_samples_leaf':[1,3,5,7,9]}
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

CPU times: user 9min 47s, sys: 5.36 s, total: 9min 53s
Wall time: 10min 27s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [209]:
best_rForest = rForest_grid.best_estimator_
best_hyperparams_rForest = rForest_grid.best_params_

In [211]:
best_hyperparams_rForest

{'RForest__min_samples_leaf': 5,
 'RForest__min_samples_split': 8,
 'RForest__n_estimators': 35}

In [212]:
y_pred = rForest_grid.predict(X_test)

In [213]:
r2_score(y_test,y_pred)

0.4514790347792569

In [230]:
joblib.dump(best_rForest, 'models-1/baseline_rForest.pkl')

['models-1/baseline_rForest.pkl']

# XGBRegressor

In [260]:
pipe_GrBoost = Pipeline(steps=[('processing', pre_processing),
                           ('GrBoost',GradientBoostingRegressor(loss='lad',learning_rate=0.1,n_estimators=100,max_depth=6,min_samples_split=5))])

In [265]:
pipe_GrBoost.fit(X_train,y_train)

Pipeline(steps=[('processing',
                 ColumnTransformer(transformers=[('numeric_cols',
                                                  Pipeline(steps=[('impute-mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaling',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                 ('categoric_cols',
                                                  Pipeline(steps=[('impute-mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                  

In [278]:
y_pred = pipe_GrBoost.predict(X_test)

In [279]:
r2_score(y_test,y_pred)

0.4418261732614325

In [273]:
joblib.dump(pipe_GrBoost, 'models-1/baseline_GrBoost.pkl')

['models-1/baseline_GrBoost.pkl']

# Elastic net

In [284]:
pipe_eNet = Pipeline(steps=[('processing', pre_processing),
                           ('eNet',ElasticNet())])

In [285]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'eNet__alpha':[0.1,0.2,0.5,1],
             'eNet__l1_ratio':[0.1,0.5,0.7],
             'eNet__max_iter':[1000,2000]}
eNet_grid = GridSearchCV(pipe_eNet, param_grid=param_grid, cv=5)
eNet_grid.fit(X_train, y_train)

CPU times: user 17.4 s, sys: 3.27 s, total: 20.7 s
Wall time: 7.83 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fa366907780>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [286]:
best_eNet = eNet_grid.best_estimator_
best_hyperparams_eNet = eNet_grid.best_params_

In [287]:
best_hyperparams_eNet

{'eNet__alpha': 0.1, 'eNet__l1_ratio': 0.1, 'eNet__max_iter': 1000}

In [288]:
y_pred = best_eNet.predict(X_test)
r2_score(y_test,y_pred)

0.3031358422738887

In [289]:
joblib.dump(best_eNet, 'models-1/baseline_eNet.pkl')

['models-1/baseline_eNet.pkl']