In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [4]:
# Load the dataset from a CSV file located next to the notebook.
df = pd.read_csv('electricity_grid.csv')

In [5]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [6]:
# Class balance of the target column.
df.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [7]:
# Remove `stab` so that `stabf` is the only target-like column left.
# NOTE(review): `stab` looks like the numeric counterpart of `stabf` - confirm.
# Rebinding with errors='ignore' keeps this cell re-runnable; the previous
# in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns='stab', errors='ignore')

In [8]:
# Features are every column except the label; the label is `stabf`.
X = df.drop(columns='stabf')
y = df['stabf']

In [9]:
# Keep the feature names: scaling below returns plain arrays, and the names
# are needed later to label the feature importances.
col_name = X.columns

In [10]:
# Hold out 20% of the rows as a test set; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [11]:
# Standardising scaler, fitted in the next cell.
scaler = StandardScaler()

In [12]:
# Fit the scaler on the training split only, then reuse those statistics for
# the test split. Note the naming: lower-case x_* are the unscaled splits,
# upper-case X_* are the scaled arrays used by the models.
X_train = scaler.fit_transform(x_train, y_train)
X_test = scaler.transform(x_test)

In [13]:
def the_score(pred, output, label='unstable'):
    """Print the confusion matrix and summary classification metrics.

    pred   -- predicted class labels.
    output -- true class labels (passed as the first argument to each metric).
    label  -- class treated as positive for precision, recall and F1.

    Confusion-matrix rows/columns are ordered ['stable', 'unstable'].
    """
    print(confusion_matrix(output, pred, labels=['stable', 'unstable']))
    print('Accuracy: ', accuracy_score(output, pred))
    # The remaining metrics share a signature, so report them in one loop.
    positive_class_metrics = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1_score: ', f1_score),
    )
    for caption, metric in positive_class_metrics:
        print(caption, metric(output, pred, pos_label=label))

In [14]:
def train_model(model, features, target, new_features, new_target):
    """Fit `model` on (features, target) and print its scores on the new data.

    Predictions for `new_features` are compared with `new_target` via
    `the_score`. Nothing is returned; `model` is fitted in place.
    """
    model.fit(features, target)
    the_score(model.predict(new_features), new_target)

# QUESTION 1

F1_Score = 2*((precision × recall)/(precision + recall))

precision = TP/(TP+FP)

recall = TP/(TP+FN)


In [15]:
# Counts used for Question 1: true positives, false positives, false negatives.
tp, fp, fn = 355, 1480, 45

p = tp/(tp+fp)  # precision
r = tp/(tp+fn)  # recall

print('F1_SCORE: ', 2*((p*r)/(p+r)))

F1_SCORE:  0.3176733780760626


In [16]:
# One instance of each classifier compared below, all seeded with
# random_state=1 and otherwise left at their library defaults.
rfc = RandomForestClassifier(random_state=1)
etc = ExtraTreesClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
lgbm = LGBMClassifier(random_state=1)

# QUESTION 14
### RANDOMFOREST CLASSIFIER

In [17]:
# Fit the random forest on the scaled training split and print its test-set scores.
train_model(rfc, X_train, y_train, X_test, y_test)

[[ 625   87]
 [  55 1233]]
Accuracy:  0.929
Precision:  0.9340909090909091
Recall:  0.9572981366459627
F1_score:  0.9455521472392638


- Accuracy on the test set for the RandomForestClassifier is **0.9290** (to 4 d.p.)

# QUESTION 15
### XGBOOST CLASSIFIER

In [18]:
# Fit the XGBoost classifier on the scaled training split and print its test-set scores.
train_model(xgb, X_train, y_train, X_test, y_test)

[[ 603  109]
 [  52 1236]]
Accuracy:  0.9195
Precision:  0.9189591078066914
Recall:  0.9596273291925466
F1_score:  0.9388530193695404


- Accuracy on the test set for the XGBoost Classifier is **0.9195** (to 4 d.p.)

# QUESTION 16
### LGBM CLASSIFIER

In [19]:
# Fit the LightGBM classifier on the scaled training split and print its test-set scores.
train_model(lgbm, X_train, y_train, X_test, y_test)

[[ 635   77]
 [  48 1240]]
Accuracy:  0.9375
Precision:  0.9415337889141989
Recall:  0.9627329192546584
F1_score:  0.9520153550863725


- Accuracy on the test set for the LGBM Classifier is **0.9375** (to 4 d.p.)

### EXTRATREE CLASSIFIER

In [20]:
# Fit the untuned extra-trees classifier; this is the baseline the tuned
# search is compared against later.
train_model(etc, X_train, y_train, X_test, y_test)

[[ 606  106]
 [  38 1250]]
Accuracy:  0.928
Precision:  0.9218289085545722
Recall:  0.9704968944099379
F1_score:  0.9455370650529501


# QUESTION 17
### OPTIMIZED/TUNED EXTRA TREE CLASSIFIER

In [21]:
# Candidate values for each ExtraTreesClassifier hyperparameter sampled by the
# randomized search.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' was removed in scikit-learn 1.3 and makes those candidates fail to
# fit; for classifiers it was an alias of 'sqrt'. The slot is kept (as a second
# 'sqrt') so the grid keeps its size and ordering, which should leave the
# candidates drawn with a fixed random_state unchanged.
max_features = ['sqrt', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [22]:
def train_opti_model(model, features, target, new_features, new_target):
    """Fit `model`, print its scores on the new data, and return it.

    Fitting and scoring are delegated to `train_model` so the two helpers
    cannot drift apart; the only addition here is returning the fitted
    object so the caller can inspect it afterwards.
    """
    train_model(model, features, target, new_features, new_target)
    return model

In [23]:

# Randomized search over the extra-trees grid: 10 sampled candidates, each
# scored by 5-fold cross-validated accuracy, using all available cores.
model = RandomizedSearchCV(
    estimator=etc,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

In [24]:
# Run the search on the scaled training split and print test-set scores for
# the fitted search object; `model` is rebound to the value returned.
model = train_opti_model(model, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.8min finished


[[ 619   93]
 [  53 1235]]
Accuracy:  0.927
Precision:  0.9299698795180723
Recall:  0.9588509316770186
F1_score:  0.9441896024464832


In [27]:
# The fitted search already exposes the parameters of its top-ranked
# candidate. Reading them directly replaces a manual sort of cv_results_,
# which can misorder candidates when a score is NaN (e.g. after a failed fit).
model.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

The best hyperparameters found by the randomized search CV are:

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

# QUESTION 18

Accuracy of the initial ExtraTreesClassifier model: 0.928

Accuracy of the new optimal model: 0.927

- Therefore, the accuracy of the new optimal model is **Lower**

# QUESTION 20

In [28]:
# Pair each feature name with its importance from the search's best
# estimator, listed from least to most important.
importances = model.best_estimator_.feature_importances_
sorted(zip(col_name, importances), key=lambda pair: pair[1])

[('p1', 0.003683422151688322),
 ('p4', 0.004962486591192238),
 ('p2', 0.005336864710946151),
 ('p3', 0.005429268421191957),
 ('g1', 0.10256244080927947),
 ('g2', 0.10757764577478764),
 ('g4', 0.10954089174337298),
 ('g3', 0.11306267999167334),
 ('tau3', 0.13468028520386593),
 ('tau4', 0.1354167630909727),
 ('tau1', 0.13723974766109256),
 ('tau2', 0.14050750384993677)]

- The most and least important features are 'tau2' and 'p1', respectively.