In [168]:
import pandas as pd 

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import joblib

In [204]:
X_train = pd.read_csv('train-test-1/X_train.csv')
y_train = pd.read_csv('train-test-1/y_train.csv')
X_test = pd.read_csv('train-test-1/X_test.csv')
y_test = pd.read_csv('train-test-1/y_test.csv')

In [170]:
X_train.isnull().sum()

gender                         10
icustay_admit_age               0
icustay_first_careunit          0
height                       2547
weight_first                  747
sapsi_first                   786
sofa_first                    388
cost_weight                   339
marital_status_descr          286
ethnicity_descr                24
overall_payor_group_descr      24
religion_descr                107
med_dur_min                  2636
total_amt                    1122
io_dur_min                   2098
dtype: int64

In [172]:
mode_imp = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()
encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')

In [173]:
numeric_transform = Pipeline([('impute-mode', mode_imp),
                                ('scaling', scaler)])
categoric_transform = Pipeline([('impute-mode', mode_imp),
                                          ('one-hot-encode', encoder)])

In [174]:
pre_processing = ColumnTransformer([('numeric_cols',numeric_transform,selector(dtype_include="float64")),
                                    ('categoric_cols',categoric_transform,selector(dtype_include="object"))]
                                   ,remainder='drop')

In [175]:
#pipe_RandForest = Pipeline(steps=[('processing', pre_processing),
#                                  ('RForest',RandomForestRegressor())])

In [176]:
#pipe_RandForest.fit(X_train,y_train)

In [178]:
def progress_r2(model):
    y_pred=model.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2

In [188]:
r2_lst=[]
def lst_r2(r2):
    r2_lst.append(r2)

# FIRST GRID SEARCH R2

In [203]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[30,50,70,90,100,120]}
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


KeyboardInterrupt: 

In [183]:
rForest_grid.best_params_

{'RForest__n_estimators': 50}

In [185]:
progress_r2(rForest_grid.best_estimator_)

0.44929303475308324

In [187]:
joblib.dump(rForest_grid,'grid1.pkl')

['grid1.pkl']

In [189]:
lst_r2(progress_r2(rForest_grid.best_estimator_))

# SECOND GRID SEARCH 

In [None]:
X_train = pd.read_csv('train-test-1/X_train.csv')
y_train = pd.read_csv('train-test-1/y_train.csv')
X_test = pd.read_csv('train-test-1/X_test.csv')
y_test = pd.read_csv('train-test-1/y_test.csv')

In [205]:
X_train = X_train.drop(['height'],axis=1)
X_test = X_test.drop(['height'],axis=1)

In [206]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[30,50,70,90,100,120]}
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

CPU times: user 57.3 s, sys: 618 ms, total: 58 s
Wall time: 59.7 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f83d533ca90>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [207]:
joblib.dump(rForest_grid,'grid2.pkl')

['grid2.pkl']

In [211]:
lst_r2(progress_r2(rForest_grid.best_estimator_))

In [212]:
r2_lst

[0.44929303475308324, 0.4586655920519793]

In [213]:
rForest_grid.best_params_

{'RForest__n_estimators': 50}

# THIRD GRID SEARCH

In [221]:
X_train = pd.read_csv('train-test-1/X_train.csv')
y_train = pd.read_csv('train-test-1/y_train.csv')
X_test = pd.read_csv('train-test-1/X_test.csv')
y_test = pd.read_csv('train-test-1/y_test.csv')

In [222]:
X_train = X_train.drop(['height','marital_status_descr'],axis=1)
X_test = X_test.drop(['height','marital_status_descr'],axis=1)

In [223]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'RForest__n_estimators':[70,90,100,120,140,160,180,200]}
rForest_grid = GridSearchCV(pipe_RandForest, param_grid=param_grid, cv=5)
rForest_grid.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

CPU times: user 2min 1s, sys: 1.01 s, total: 2min 2s
Wall time: 2min 4s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(transformers=[('numeric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('scaling',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f83d533ca90>),
                                                                        ('categoric_cols',
                                                                         Pipeline(steps=[('impute-mode',
                                  

In [217]:
joblib.dump(rForest_grid,'grid3.pkl')

['grid3.pkl']

In [225]:
lst_r2(progress_r2(rForest_grid.best_estimator_))

In [229]:
r2_lst

[0.44929303475308324, 0.4586655920519793, 0.4458294041833306]

In [230]:
rForest_grid.best_params_

{'RForest__n_estimators': 200}

# FOURTH GRID SEARCH