# Stage Three Quiz Session

In [48]:
# import relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [49]:
# Read the quiz dataset from the CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [50]:
# Discard the 'stab' column; only 'stabf' is kept as the target further down.
df = df.drop(columns=['stab'])

In [51]:
# Replace the 'stabf' labels with integer codes. The fitted encoder is kept so
# the codes can be mapped back to the original labels via encoder.classes_.
encoder = LabelEncoder()
encoder.fit(df['stabf'])
df['stabf'] = encoder.transform(df['stabf'])

In [52]:
# Separate the encoded target column from the feature columns.
y = df['stabf']
x = df.drop(columns='stabf')

In [53]:
# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [54]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so test-set statistics never
# influence the transformation.
scaler = StandardScaler()

# Keep each split's original row index so the scaled frames stay aligned with
# y_train / y_test; building a DataFrame from the bare array would otherwise
# reset the index to 0..n-1 and silently break any index-aligned operation.
scaled_df = scaler.fit_transform(x_train)
x_train = pd.DataFrame(scaled_df, columns=x_train.columns, index=x_train.index)

x_test = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

### Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
# Random forest with a fixed seed and silent fitting.
rfc = RandomForestClassifier(
    verbose=0,
    random_state=1,
)

In [57]:
# Train the forest on the scaled training split.
rfc_model = rfc.fit(X=x_train, y=y_train)

In [58]:
# Predict class labels for the held-out test split.
y_pred_rfc = rfc_model.predict(X=x_test)

In [59]:
# Peek at the first five predictions.
y_pred_rfc[:5]

array([1, 1, 0, 0, 1])

In [60]:
# Confusion matrix for the random forest (rows: true labels, columns: predictions).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_rfc)

array([[ 625,   87],
       [  55, 1233]], dtype=int64)

In [61]:
# Build the classification report as a dict, then tabulate it with one row
# per class / aggregate and one column per metric.
rfc_report = metrics.classification_report(y_test, y_pred_rfc, output_dict=True)
rfc_summary = pd.DataFrame(rfc_report).transpose()

In [62]:
# Display the random forest precision / recall / f1 table.
rfc_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.919118,0.877809,0.897989,712.0
1,0.934091,0.957298,0.945552,1288.0
accuracy,0.929,0.929,0.929,0.929
macro avg,0.926604,0.917554,0.92177,2000.0
weighted avg,0.92876,0.929,0.928619,2000.0


### ExtraTrees without Tuning

In [63]:
from sklearn.ensemble import ExtraTreesClassifier

In [64]:
# Baseline ExtraTrees: only the seed is set, every other hyperparameter is left at its default.
extra_no_tune = ExtraTreesClassifier(random_state=1)

In [65]:
# Train the untuned ExtraTrees model on the scaled training split.
no_tuned_model = extra_no_tune.fit(X=x_train, y=y_train)

In [66]:
# Predict class labels for the test split with the untuned model.
y_pred_extra_no_tune = no_tuned_model.predict(X=x_test)

In [67]:
# Tabulate the untuned ExtraTrees classification report (one row per class / aggregate).
extra_no_tune_report = metrics.classification_report(
    y_test, y_pred_extra_no_tune, output_dict=True
)
extra_no_tune_summary = pd.DataFrame(extra_no_tune_report).transpose()

In [68]:
# Display the untuned ExtraTrees precision / recall / f1 table.
extra_no_tune_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.940994,0.851124,0.893805,712.0
1,0.921829,0.970497,0.945537,1288.0
accuracy,0.928,0.928,0.928,0.928
macro avg,0.931411,0.91081,0.919671,2000.0
weighted avg,0.928652,0.928,0.927121,2000.0


### ExtraTrees with Tuning

In [69]:
# Search space for the randomized ExtraTrees hyperparameter search.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers
# it was an alias of 'sqrt'. It is replaced in place rather than dropped so the
# grid keeps its size and ordering, which should leave the candidates sampled
# for a fixed random_state unchanged.
max_features = ['sqrt', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [70]:
# Base estimator handed to the randomized search; only the seed is fixed here.
est = ExtraTreesClassifier(random_state=1)

In [71]:
# Randomized search over the grid: 5-fold CV, all available cores, seeded sampling.
# n_iter is not passed, so the library default number of candidates is used.
rcv = RandomizedSearchCV(
    estimator=est,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=1,
)

In [72]:
# Run the search on the scaled training split.
extra_model = rcv.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.2min finished


In [73]:
# Predict test-split labels with the fitted search object.
y_pred_extra = extra_model.predict(X=x_test)

In [74]:
# Confusion matrix for the tuned ExtraTrees model (rows: true labels, columns: predictions).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_extra)

array([[ 619,   93],
       [  53, 1235]], dtype=int64)

In [75]:
# Tabulate the tuned ExtraTrees classification report (one row per class / aggregate).
extra_report = metrics.classification_report(y_test, y_pred_extra, output_dict=True)
extra_summary = pd.DataFrame(extra_report).transpose()

In [76]:
# Display the tuned ExtraTrees precision / recall / f1 table.
extra_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.921131,0.869382,0.894509,712.0
1,0.92997,0.958851,0.94419,1288.0
accuracy,0.927,0.927,0.927,0.927
macro avg,0.92555,0.914116,0.919349,2000.0
weighted avg,0.926823,0.927,0.926503,2000.0


### XGBoost Classifier

In [77]:
# XGBoost classifier with a fixed seed and logging silenced.
xgb = XGBClassifier(
    random_state=1,
    verbosity=0,
)

In [78]:
# Train XGBoost on the scaled training split.
xgb_model = xgb.fit(X=x_train, y=y_train)

In [79]:
# Predict class labels for the test split with XGBoost.
y_pred_xgb = xgb_model.predict(X=x_test)

In [80]:
# Confusion matrix for XGBoost (rows: true labels, columns: predictions).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_xgb)

array([[ 648,   64],
       [  45, 1243]], dtype=int64)

In [81]:
# Tabulate the XGBoost classification report (one row per class / aggregate).
xgboost_report = metrics.classification_report(y_test, y_pred_xgb, output_dict=True)
xgboost_summary = pd.DataFrame(xgboost_report).transpose()

In [82]:
# Display the XGBoost precision / recall / f1 table.
xgboost_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.935065,0.910112,0.92242,712.0
1,0.951033,0.965062,0.957996,1288.0
accuracy,0.9455,0.9455,0.9455,0.9455
macro avg,0.943049,0.937587,0.940208,2000.0
weighted avg,0.945348,0.9455,0.945331,2000.0


In [93]:
# Refit XGBoost without the verbosity override and tabulate its test-set report.
# NOTE(review): this cell rebinds `xgb` and carries an out-of-sequence execution
# count; the displayed metrics match the XGBoost run above.
xgb = XGBClassifier(random_state=1)
xgb.fit(x_train, y_train)
xgb_rerun_report = metrics.classification_report(
    y_test, xgb.predict(x_test), output_dict=True
)
result = pd.DataFrame(xgb_rerun_report).transpose()
result

Unnamed: 0,precision,recall,f1-score,support
0,0.935065,0.910112,0.92242,712.0
1,0.951033,0.965062,0.957996,1288.0
accuracy,0.9455,0.9455,0.9455,0.9455
macro avg,0.943049,0.937587,0.940208,2000.0
weighted avg,0.945348,0.9455,0.945331,2000.0


### LightGBM Classifier

In [83]:
# LightGBM classifier: only the seed is set, every other hyperparameter is left at its default.
lgbm = LGBMClassifier(random_state=1)

In [84]:
# Train LightGBM on the scaled training split.
lgbm_model = lgbm.fit(X=x_train, y=y_train)

In [85]:
# Predict class labels for the test split with LightGBM.
y_pred_lgbm = lgbm_model.predict(X=x_test)

In [86]:
# Confusion matrix for LightGBM (rows: true labels, columns: predictions).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_lgbm)

array([[ 635,   77],
       [  48, 1240]], dtype=int64)

In [87]:
# Tabulate the LightGBM classification report (one row per class / aggregate).
lgbm_report = metrics.classification_report(y_test, y_pred_lgbm, output_dict=True)
lgbm_summary = pd.DataFrame(lgbm_report).transpose()

In [88]:
# Display the LightGBM precision / recall / f1 table.
lgbm_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.929722,0.891854,0.910394,712.0
1,0.941534,0.962733,0.952015,1288.0
accuracy,0.9375,0.9375,0.9375,0.9375
macro avg,0.935628,0.927293,0.931205,2000.0
weighted avg,0.937329,0.9375,0.937198,2000.0


### Question 12

### Question 13

### Question 14

### Question 15

### Question 16

### Question 18

### Question 19

### Question 20