# Creating Unfitted Models

The purpose of this notebook is to provide a tool for easily creating multiple models with some basic hyperparameter tuning - these models are saved as joblib files to the `models/unfitted` directory and can later be fit on your own input data using the Rock Predictor pipeline. As an example of the output, we provide a very basic random forest model as a joblib file under `models/unfitted` in the GitHub repository.

To use this notebook, you first need to run the first two steps of the training phase of the Rock Predictor pipeline. These steps calculate the features and save them as CSV files in the `data/pipeline` directory. This can be done by running the target: `make data/pipeline/train_features.csv data/pipeline/test_features.csv`.

At this point, the `data/pipeline` folder should contain the following files:

* train.csv
* train_features.csv

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump, load

import xgboost as xgb

import sys, os
sys.path.insert(0, os.path.abspath('../rock_predictor'))
from helpers.model import ColumnSelector

# Let pandas print up to 1000 rows before truncating displayed output.
pd.options.display.max_rows = 1000

Load the files created by the pipeline and inspect the head, columns, and dimensions of the data.

In [None]:
# Read the training table written by the pipeline; the first CSV column
# becomes the row index.
df = pd.read_csv(
    "../data/pipeline/train.csv",
    index_col=0,
)
# Preview the first five rows (the default for head()).
df.head(n=5)

In [None]:
# List the column names of the table loaded from train.csv.
df.columns

In [None]:
# Read the per-hole feature table written by the pipeline (default integer
# row index, no index column is taken from the file).
df_features = pd.read_csv(
    "../data/pipeline/train_features.csv",
)
# Preview the first five rows (the default for head()).
df_features.head(n=5)

In [None]:
# List every column of the table loaded from train_features.csv.
df_features.columns

In [None]:
# Remove rows whose target (litho_rock_class) is missing, printing the shape
# before and after so the number of dropped rows is visible.
#
# The filter must be applied to the DataFrame itself: calling
# dropna(inplace=True) on the column only alters that Series object and leaves
# the rows (and therefore the shape) of df_features unchanged.
print(df_features.shape)
df_features = df_features.dropna(subset=["litho_rock_class"])
print(df_features.shape)

Separate out non-feature columns from features

In [None]:
# Columns that are not model inputs (identifiers, labels, coordinates).
# Update with your own customized columns.
cols_to_exclude = [
    "hole_id",
    "exp_rock_type",
    "exp_rock_class",
    "litho_rock_type",
    "litho_rock_class",
    "ActualX_mean",
    "ActualY_mean",
]

# Target: the lithology rock class of each row.
y = df_features["litho_rock_class"]
# Features: everything that is left once the excluded columns are dropped.
X = df_features.drop(cols_to_exclude, axis=1)

X.columns

In [None]:
# Update with your own feature column names.
# The list is assembled group by group (one group per column-name prefix);
# the resulting order is the same as a single flat list.
cols = (
    # Position-lag features and the sample count
    ['pos_lagOfLag_median', 'pos_lag1_diff_median', 'time_count']
    # hvib_* features
    + ['hvib_std', 'hvib_max', 'hvib_min', 'hvib_sum', 'hvib_median',
       'hvib_10th_quant', 'hvib_25th_quant', 'hvib_75th_quant',
       'hvib_90th_quant', 'hvib_num_oscillations']
    # vvib_* features
    + ['vvib_std', 'vvib_max', 'vvib_min', 'vvib_sum', 'vvib_median',
       'vvib_10th_quant', 'vvib_25th_quant', 'vvib_75th_quant',
       'vvib_90th_quant', 'vvib_num_oscillations']
    # pull_* features
    + ['pull_std', 'pull_max', 'pull_min', 'pull_sum', 'pull_median',
       'pull_10th_quant', 'pull_25th_quant', 'pull_75th_quant',
       'pull_90th_quant', 'pull_prop_max', 'pull_prop_half',
       'pull_num_oscillations']
    # air_* features
    + ['air_std', 'air_max', 'air_min', 'air_sum', 'air_median',
       'air_10th_quant', 'air_25th_quant', 'air_75th_quant',
       'air_90th_quant', 'air_num_oscillations']
    # pos_* features
    + ['pos_std', 'pos_max', 'pos_min', 'pos_sum', 'pos_median',
       'pos_10th_quant', 'pos_25th_quant', 'pos_75th_quant',
       'pos_90th_quant']
    # depth_* features
    + ['depth_std', 'depth_max', 'depth_min', 'depth_sum', 'depth_median',
       'depth_10th_quant', 'depth_25th_quant', 'depth_75th_quant',
       'depth_90th_quant']
    # rot_* features
    + ['rot_std', 'rot_max', 'rot_min', 'rot_sum', 'rot_median',
       'rot_10th_quant', 'rot_25th_quant', 'rot_75th_quant',
       'rot_90th_quant', 'rot_num_oscillations']
    # water_* features
    + ['water_std', 'water_max', 'water_min', 'water_sum', 'water_median',
       'water_10th_quant', 'water_25th_quant', 'water_75th_quant',
       'water_90th_quant', 'water_prop_zero']
    # Penetration rate
    + ['penetration_rate']
    # One-hot encoded exp_rock_type columns
    + ['exp_rock_type_onehot_AMP', 'exp_rock_type_onehot_GN',
       'exp_rock_type_onehot_IF', 'exp_rock_type_onehot_LIMO',
       'exp_rock_type_onehot_QR', 'exp_rock_type_onehot_SIF']
)

### Simple Random Forest

In [None]:
# Two-step pipeline: ColumnSelector(cols) followed by a random forest classifier.
# NOTE(review): ColumnSelector comes from helpers.model; presumably it keeps
# only the columns named in `cols` -- confirm against its implementation.
pipe_rfc = Pipeline(steps=[
    ("col_selector", ColumnSelector(cols)),
    ("rfc", RandomForestClassifier())])

# Candidate hyperparameters. Keys use the "<step name>__<parameter>" form so the
# search can route each value to the "rfc" step of the pipeline.
param_dist = {
    'rfc__n_estimators': range(10,501),
    'rfc__max_features': ['sqrt', 'log2'],
    'rfc__max_depth' : range(2,31),
    'rfc__criterion' :['gini', 'entropy']
}

# Randomized search: 10 sampled parameter settings, 5-fold cross-validation.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# Those releases always average the per-fold scores, which is what iid=False
# requested.
cv_rfc = RandomizedSearchCV(estimator=pipe_rfc, param_distributions=param_dist, n_iter=10, cv=5)
# We're fitting just to find the best hyperparameters, but the fit itself will
# be overwritten when the model is re-fit in the pipeline
cv_rfc.fit(X, y)

# Pipeline configured with the best hyperparameters found by the search.
best_rfc = cv_rfc.best_estimator_
# Free-form label stored as an attribute on the saved pipeline object.
best_rfc.description = "Simple Random Forest"

dump(best_rfc, "../models/unfitted/randomforest.joblib")

### XGBoost

In [None]:
# Two-step pipeline: ColumnSelector(cols) followed by an XGBoost classifier.
# NOTE(review): ColumnSelector comes from helpers.model; presumably it keeps
# only the columns named in `cols` -- confirm against its implementation.
pipe_xgb = Pipeline(steps=[
    ("col_selector", ColumnSelector(cols)),
    ("xgb", xgb.XGBClassifier())])

# Candidate hyperparameters. Keys use the "<step name>__<parameter>" form so the
# search can route each value to the "xgb" step of the pipeline.
param_xgb = {
        'xgb__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
        'xgb__max_depth': np.arange(2, 31),
        'xgb__n_estimators': np.arange(10, 501, 2),
        'xgb__gamma': [0.5, 1, 1.5, 2, 5, 10],
        'xgb__reg_lambda': [0.1, 1, 25, 50, 100],
        'xgb__reg_alpha': [0, 0.1, 1, 25, 50, 100],
        'xgb__min_child_weight': [1, 5, 10],
        'xgb__subsample': [0.25, 0.5, 0.6, 0.8, 1.0],
        'xgb__colsample_bytree': [0.5, 0.8, 1.0],
        }

# Randomized search: 10 sampled parameter settings, 5-fold cross-validation.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# Those releases always average the per-fold scores, which is what iid=False
# requested.
cv_xgb = RandomizedSearchCV(estimator=pipe_xgb, param_distributions=param_xgb, n_iter=10, cv=5)
# We're fitting just to find the best hyperparameters, but the fit itself will
# be overwritten when the model is re-fit in the pipeline
cv_xgb.fit(X, y)

# Pipeline configured with the best hyperparameters found by the search.
best_xgb = cv_xgb.best_estimator_
# Free-form label stored as an attribute on the saved pipeline object.
best_xgb.description = "Simple XGBoost"

dump(best_xgb, "../models/unfitted/xgboost.joblib")