# Stage Three Quiz Session

In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# load dataset into dataframe
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'
df = pd.read_csv(url)

In [3]:
df.drop('stab', axis=1, inplace=True)

In [4]:
encoder = LabelEncoder()
df['stabf'] = encoder.fit_transform(df['stabf'])

In [5]:
x = df.drop('stabf', axis=1)
y = df['stabf']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [7]:
scaler = StandardScaler()

scaled_df = scaler.fit_transform(x_train, y_train)
x_train = pd.DataFrame(scaled_df, columns=x.columns)

x_test = scaler.transform(x_test)
x_test = pd.DataFrame(x_test, columns=x.columns)

### Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
rfc = RandomForestClassifier(random_state=1, verbose=0)

In [10]:
rfc_model = rfc.fit(x_train, y_train)

In [11]:
y_pred_rfc = rfc_model.predict(x_test)

In [12]:
y_pred_rfc[0:5]

array([1, 1, 0, 0, 1])

In [13]:
metrics.confusion_matrix(y_test, y_pred_rfc)

array([[ 625,   87],
       [  55, 1233]])

In [14]:
rfc_summary = pd.DataFrame(metrics.classification_report(y_test, y_pred_rfc, output_dict=True)).T

In [49]:
rfc_summary

Unnamed: 0,precision,recall,f1-score,support
0,0.919118,0.877809,0.897989,712.0
1,0.934091,0.957298,0.945552,1288.0
accuracy,0.929,0.929,0.929,0.929
macro avg,0.926604,0.917554,0.92177,2000.0
weighted avg,0.92876,0.929,0.928619,2000.0


### ExtraTrees without Tuning

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

In [17]:
extra_no_tune = ExtraTreesClassifier(random_state=1)

In [18]:
no_tuned_model = extra_no_tune.fit(x_train, y_train)

In [19]:
y_pred_extra_no_tune = no_tuned_model.predict(x_test)

In [20]:
extra_no_tune_summary = pd.DataFrame(metrics.classification_report(y_test, y_pred_extra_no_tune, output_dict=True)).T

In [44]:
extra_no_tune_summary.round(4)

Unnamed: 0,precision,recall,f1-score,support
0,0.941,0.8511,0.8938,712.0
1,0.9218,0.9705,0.9455,1288.0
accuracy,0.928,0.928,0.928,0.928
macro avg,0.9314,0.9108,0.9197,2000.0
weighted avg,0.9287,0.928,0.9271,2000.0


### ExtraTrees with Tuning

In [22]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [23]:
est = ExtraTreesClassifier(random_state=1)

In [50]:
RandomizedSearchCV?

In [51]:
rcv = RandomizedSearchCV(est, hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

In [52]:
extra_model = rcv.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


In [53]:
y_pred_extra = extra_model.predict(x_test)

In [54]:
metrics.confusion_matrix(y_test, y_pred_extra)

array([[ 619,   93],
       [  53, 1235]])

In [55]:
extra_summary = pd.DataFrame(metrics.classification_report(y_test, y_pred_extra, output_dict=True)).T

In [56]:
extra_summary.round(4)

Unnamed: 0,precision,recall,f1-score,support
0,0.9211,0.8694,0.8945,712.0
1,0.93,0.9589,0.9442,1288.0
accuracy,0.927,0.927,0.927,0.927
macro avg,0.9256,0.9141,0.9193,2000.0
weighted avg,0.9268,0.927,0.9265,2000.0


In [57]:
extra_model.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [60]:
est = ExtraTreesClassifier(max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=1000, random_state=1)

In [61]:
optimal = est.fit(x_train, y_train)

In [64]:
optimal.feature_importances_

array([0.13723975, 0.1405075 , 0.13468029, 0.13541676, 0.00368342,
       0.00533686, 0.00542927, 0.00496249, 0.10256244, 0.10757765,
       0.11306268, 0.10954089])

In [65]:
x.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4'],
      dtype='object')

### XGBoost Classifier

In [30]:
xgb = XGBClassifier(random_state=1)

In [31]:
xgb_model = xgb.fit(x_train, y_train)

In [32]:
y_pred_xgb = xgb_model.predict(x_test)

In [33]:
metrics.confusion_matrix(y_test, y_pred_xgb)

array([[ 603,  109],
       [  52, 1236]])

In [34]:
xgboost_summary = pd.DataFrame(metrics.classification_report(y_test, y_pred_xgb, output_dict=True)).T

In [46]:
xgboost_summary.round(4)

Unnamed: 0,precision,recall,f1-score,support
0,0.9206,0.8469,0.8822,712.0
1,0.919,0.9596,0.9389,1288.0
accuracy,0.9195,0.9195,0.9195,0.9195
macro avg,0.9198,0.9033,0.9105,2000.0
weighted avg,0.9195,0.9195,0.9187,2000.0


In [47]:
xgb = XGBClassifier(random_state=1).fit(x_train, y_train)
result = pd.DataFrame(metrics.classification_report(y_test, xgb.predict(x_test), output_dict=True)).T
result.round(4)

Unnamed: 0,precision,recall,f1-score,support
0,0.9206,0.8469,0.8822,712.0
1,0.919,0.9596,0.9389,1288.0
accuracy,0.9195,0.9195,0.9195,0.9195
macro avg,0.9198,0.9033,0.9105,2000.0
weighted avg,0.9195,0.9195,0.9187,2000.0


### LightGBM Classifier

In [37]:
lgbm = LGBMClassifier(random_state=1)

In [38]:
lgbm_model = lgbm.fit(x_train, y_train)

In [39]:
y_pred_lgbm = lgbm_model.predict(x_test)

In [40]:
metrics.confusion_matrix(y_test, y_pred_lgbm)

array([[ 635,   77],
       [  48, 1240]])

In [41]:
lgbm_summary = pd.DataFrame(metrics.classification_report(y_test, y_pred_lgbm, output_dict=True)).T

In [48]:
lgbm_summary.round(4)

Unnamed: 0,precision,recall,f1-score,support
0,0.9297,0.8919,0.9104,712.0
1,0.9415,0.9627,0.952,1288.0
accuracy,0.9375,0.9375,0.9375,0.9375
macro avg,0.9356,0.9273,0.9312,2000.0
weighted avg,0.9373,0.9375,0.9372,2000.0


### Question 1

In [66]:
recall = 355 / (355 + 45)
precision = 355 / (355 + 1480)

f1 = 2 * ((recall * precision) / (recall + precision))
f1

0.3176733780760626