In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

# Colab-only step: mount Google Drive at /content/drive so the CSV paths used
# by the read_csv / to_csv calls in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training split from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Practicum/v1/train_df.csv'
df = pd.read_csv(train_csv_path)

Define the features 'X' and target variable 'y'

In [None]:
# Features: everything except the target and LOG_TOTALPMT
# (presumably the log-transformed target -- it must not be used as a predictor).
X = df.drop(columns=['TOTALPMT_ADJ', 'LOG_TOTALPMT'])

# Target: kept as a single-column DataFrame (note the double brackets).
y = df.loc[:, ['TOTALPMT_ADJ']]

In [None]:
# Display the target frame as a quick sanity check of the selection above.
y

Unnamed: 0,TOTALPMT_ADJ
0,3.206399e+05
1,4.206172e+05
2,7.677933e+04
3,1.592415e+05
4,7.677933e+04
...,...
161133,1.166624e+06
161134,2.438100e+05
161135,4.826990e+05
161136,8.345579e+03


In [None]:
# Remove the 'index' column so it is not fed to the model as a feature.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
X = X.drop(columns='index', errors='ignore')

In [None]:
# Display the feature matrix to confirm that the target columns and 'index' are gone.
X

Unnamed: 0,LICNSTAT_AK,LICNSTAT_AL,LICNSTAT_AR,LICNSTAT_AZ,LICNSTAT_CA,LICNSTAT_CO,LICNSTAT_CT,LICNSTAT_DC,LICNSTAT_DE,LICNSTAT_FL,...,OUTCOME_Cannot Be Determined from Available Records,OUTCOME_Death,OUTCOME_Emotional Injury Only,OUTCOME_Insignificant Injury,OUTCOME_Major Permanent Injury,OUTCOME_Major Temporary Injury,OUTCOME_Minor Permanent Injury,OUTCOME_Minor Temporary Injury,"OUTCOME_Quadriplegic, Brain Damage, Lifelong Care",OUTCOME_Significant Permanent Injury
0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
161134,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
161135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
161136,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


I. Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for GridSearchCV: 3 tree depths x 3 forest sizes = 9 candidates.
search_space = dict(
    max_depth=[5, 10, 15],
    n_estimators=[50, 75, 100],
)

In [None]:
# Base estimator handed to the grid search. A fixed random_state makes the
# bootstrap sampling / feature sub-sampling deterministic, so the 9 candidates
# are compared on equal footing and the search is reproducible across runs.
rfr = RandomForestRegressor(criterion='squared_error', random_state=42)

In [None]:
# Exhaustive search over `search_space`. Every candidate is scored with both
# R^2 and (negated) RMSE; refit='r2' means the candidate with the best mean R^2
# is refit on all the data passed to .fit() once the search finishes.
GS_object = GridSearchCV(
    estimator=rfr,
    param_grid=search_space,
    cv=5,  # k for k-fold cross-validation
    scoring=['r2', 'neg_root_mean_squared_error'],
    refit='r2',
    n_jobs=-1,  # use all available cores
    verbose=4,
)

In [None]:
# GridSearchCV runs the k-fold split itself: each candidate is trained and
# validated on k train/validation partitions and the best one is then refit.
# `y` is a single-column DataFrame of shape (n, 1); flatten it to 1-D because
# scikit-learn expects a 1-D target and otherwise emits a DataConversionWarning
# on every fit.
GS_object.fit(X, np.ravel(y))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15],
                         'n_estimators': [50, 75, 100]},
             refit='r2', scoring=['r2', 'neg_root_mean_squared_error'],
             verbose=4)

Save the grid-search results to CSV

In [None]:
# Tabulate the cross-validation results, best R^2 rank first, and persist them.
gs_df = (
    pd.DataFrame(GS_object.cv_results_)
    .sort_values(by='rank_test_r2')
)
gs_df.to_csv('/content/drive/MyDrive/Practicum/v1/gs_df_rfr.csv')

II. Model Evaluation

Define the KFold splits

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Splitter for the manual evaluation loop: 5 folds, KFold defaults otherwise
# (i.e. no shuffling).
n_splits = 5
cv = KFold(n_splits=n_splits)

In [None]:
# Build the final model from the winning hyperparameters of the in-memory
# grid search (GS_object must have been fitted in this session).
# Keys are indexed directly rather than via .get(): a missing key should fail
# loudly instead of silently becoming None (max_depth=None means unlimited depth).
# random_state is fixed so the reported errors are reproducible.
best_params = GS_object.best_params_
rfr_optim = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    random_state=42,
)

Evaluate Errors

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Manual k-fold evaluation: refit the tuned forest on each training partition
# and record the RMSE on the matching held-out partition.
errors = []
for train_index, test_index in cv.split(X):
    X_train_kfold, X_test_kfold = X.iloc[train_index], X.iloc[test_index]
    y_train_kfold, y_test_kfold = y.iloc[train_index], y.iloc[test_index]

    # y is an (n, 1) DataFrame; scikit-learn expects a 1-D target and otherwise
    # emits a DataConversionWarning on every fold.
    rfr_optim.fit(X_train_kfold, np.ravel(y_train_kfold))

    y_pred = rfr_optim.predict(X_test_kfold)
    errors.append(np.sqrt(mean_squared_error(y_test_kfold, y_pred)))

  """
  """
  """
  """
  """


In [None]:
# Per-fold RMSE values collected by the loop above.
errors 

[260736.3447857048,
 253813.54483545202,
 251964.04964852394,
 252129.17334084478,
 257519.53281952592]

In [None]:
# Average RMSE across the folds.
np.mean(errors)

255232.5290860103

In [None]:
# Persist the per-fold RMSE values as a one-column table.
er_df = pd.DataFrame(errors, columns=['errors'])
er_df.to_csv('/content/drive/MyDrive/Practicum/v1/er_df_rfr.csv')

Final Evaluation

In [None]:
# Load the held-out test split from the mounted Drive folder.
test_csv_path = '/content/drive/MyDrive/Practicum/v1/test_df.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Split the test frame the same way as the training frame: drop the target and
# LOG_TOTALPMT from the features, keep the target as a one-column DataFrame.
X_test = test_df.drop(columns=['TOTALPMT_ADJ', 'LOG_TOTALPMT'])
y_test = test_df.loc[:, ['TOTALPMT_ADJ']]

In [None]:
# Give the test features the same treatment as the training features, which
# already had 'index' removed earlier in the notebook. The previous version
# dropped it from X again, which raises KeyError on a fresh top-to-bottom run
# and leaves X_test with a column the model was never trained on.
# errors='ignore' keeps the cell safe to re-run (and harmless if the test CSV
# has no 'index' column).
X_test = X_test.drop(columns='index', errors='ignore')

In [None]:
# Final evaluation: refit the tuned forest on the full training set and report
# the RMSE on the held-out test set.
# y is an (n, 1) DataFrame; flatten it because scikit-learn expects a 1-D
# target and otherwise emits a DataConversionWarning.
rfr_optim.fit(X, np.ravel(y))
y_pred = rfr_optim.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

  """Entry point for launching an IPython kernel.


253343.85143545613