In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
elect_grid = pd.read_csv('Data_for_UCI_named.csv')

In [3]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
elect_grid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


## Following the Instructions.

In [5]:
# Drop the `stab` column so that `stabf` is the only stability column left.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after `stab` is already gone is a no-op rather
# than a KeyError.
elect_grid = elect_grid.drop(columns='stab', errors='ignore')

In [6]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [7]:
# consistency check of the stabf column
elect_grid.stabf.unique()

array(['unstable', 'stable'], dtype=object)

In [8]:
# Changing stable and unstable to 1 and 0 respectively
def change_funct(x):
    """Encode a stability label as an integer.

    Parameters
    ----------
    x : str
        Raw label, expected to be 'unstable' or 'stable'.

    Returns
    -------
    int
        0 for 'unstable', 1 for 'stable'.

    Raises
    ------
    ValueError
        If ``x`` is neither label. A catch-all ``else`` would silently encode
        NaN, misspellings, or already-encoded 0/1 values (e.g. when the mapping
        cell is re-run) as 1.
    """
    if x == 'unstable':
        return 0
    if x == 'stable':
        return 1
    raise ValueError(
        "unexpected stability label {!r}; expected 'stable' or 'unstable'".format(x)
    )
elect_grid['stabf'] = elect_grid.stabf.map(change_funct)

In [9]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0


In [10]:
# Features: every column of elect_grid except the encoded target `stabf`.
X = elect_grid.drop('stabf', axis= 1)
# Target: selected with double brackets, so `y` is a one-column DataFrame
# (not a Series).
y = elect_grid[['stabf']]

In [11]:
y.stabf.unique()

array([0, 1], dtype=int64)

In [12]:
# Split features and target into train/test parts: 20% of the rows are held out
# for testing (test_size=0.2) and random_state=1 fixes the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
scale_train = StandardScaler()

In [14]:
# Fit the scaler on the training features and replace X_train with the scaled
# values, re-wrapped as a DataFrame that keeps the original column names and
# row index.
# NOTE(review): X_train is overwritten, so re-running this cell re-fits the
# scaler on already-scaled data; re-run from the split cell instead.
X_train= pd.DataFrame(scale_train.fit_transform(X_train), columns = X.columns, index = X_train.index)

In [15]:
X_train

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,0.367327,-0.986042,0.650447,1.547527,-0.291490,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
5140,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2568,-1.467850,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.682560,-0.855302,1.399350,1.451534,-1.045743,0.492489
3671,0.820081,0.529920,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.658780,-0.958319,1.361958,1.604140,0.275303
7427,0.665424,-1.425627,0.312300,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.695660,1.137504,-1.312575
...,...,...,...,...,...,...,...,...,...,...,...,...
2895,1.551314,0.007408,-1.177640,1.016898,-0.397177,0.759820,-0.636951,0.572703,-1.209413,0.313976,-1.625728,-0.637401
7813,1.015925,-0.223483,-1.489381,-1.479078,0.451468,-0.731994,0.990355,-1.048148,-1.094647,-0.755209,0.734821,-0.304433
905,0.657609,-0.722756,-1.405888,-0.274301,-0.012584,1.438694,-0.364266,-1.046683,1.253539,0.293100,-1.550587,0.810344
5192,-0.059316,-1.260532,-1.010471,-0.877808,-0.779769,0.828824,0.516923,0.018984,-0.182448,-0.388255,-0.726781,1.667916


In [16]:
y_train

Unnamed: 0,stabf
2694,0
5140,0
2568,0
3671,0
7427,0
...,...
2895,1
7813,1
905,0
5192,0


In [17]:
X_test

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,6.877876,4.113820,9.356768,8.299753,4.056779,-1.897470,-1.590581,-0.568728,0.276567,0.845536,0.112440,0.822562
3850,5.802841,6.271371,4.731540,3.819867,3.579569,-1.709480,-1.067511,-0.802579,0.077527,0.416478,0.912846,0.861306
4962,2.286998,4.385142,2.830232,5.293880,3.035814,-1.202764,-0.902011,-0.931039,0.924216,0.130186,0.703887,0.063811
3886,5.019920,2.209962,6.266080,0.578901,4.322584,-1.960207,-1.074561,-1.287815,0.546910,0.065992,0.427349,0.814648
5437,7.646145,9.187896,5.484219,9.934313,3.634226,-1.254541,-1.335366,-1.044319,0.561528,0.121611,0.787318,0.300314
...,...,...,...,...,...,...,...,...,...,...,...,...
3919,8.320531,3.394352,1.468729,8.766874,3.940123,-1.599081,-0.931023,-1.410020,0.657445,0.493384,0.178907,0.173240
162,2.290102,3.028512,8.071234,6.169944,3.624592,-1.104003,-0.820951,-1.699638,0.785595,0.770194,0.234290,0.225070
7903,7.848945,0.684889,0.728223,6.702948,2.599685,-0.563497,-1.210069,-0.826119,0.185122,0.559799,0.682418,0.886902
2242,2.173967,5.776797,4.597638,6.399330,2.869349,-0.931891,-1.237955,-0.699503,0.942680,0.743103,0.152161,0.305183


In [18]:
# Scale the test features with the scaler that was already fitted on X_train:
# only transform() is called here (no re-fit), so the test rows are scaled with
# the statistics learned from the training split and do not influence them.
# NOTE(review): the earlier remark here called this approach wrong; fitting a
# separate scaler on the test split (or on train+test combined) would put the
# two splits on different scales or leak test information, so transform-only
# is the intended usage.
# NOTE(review): X_test is overwritten, so re-running this cell scales it twice.
X_test= pd.DataFrame(scale_train.transform(X_test), columns = X.columns, index= X_test.index)
X_test

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.507330,1.084726
3850,0.202190,0.374416,-0.188800,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.395660,1.414651,1.226011
4962,-1.079044,-0.313745,-0.884634,0.017080,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168
3886,-0.083120,-1.107327,0.372805,-1.708152,0.753990,-1.637972,0.403805,-0.088036,0.083322,-1.672322,-0.357714,1.055865
5437,0.873921,1.438466,0.086662,1.715037,-0.153880,-0.007015,-0.197053,0.472315,0.136549,-1.469731,0.956396,-0.819727
...,...,...,...,...,...,...,...,...,...,...,...,...
3919,1.119679,-0.675220,-1.382912,1.287865,0.249565,-0.803325,0.734497,-0.369263,0.485786,-0.115528,-1.264683,-1.283117
162,-1.077913,-0.808691,1.033449,0.337636,-0.166587,0.340913,0.988085,-1.035753,0.952386,0.892766,-1.062502,-1.094114
7903,0.947825,-1.663727,-1.653920,0.532665,-1.518329,1.590144,0.091613,0.974455,-1.233963,0.126391,0.573445,1.319350
2242,-1.120235,0.193979,-0.237805,0.421570,-1.162671,0.738702,0.027367,1.265833,1.524336,0.794087,-1.362323,-0.801971


In [19]:
y_test

Unnamed: 0,stabf
9953,0
3850,0
4962,1
3886,1
5437,0
...,...
3919,1
162,1
7903,1
2242,0


In [20]:
#xscaler = StandardScaler()
#tri = pd.DataFrame(xscaler.fit_transform(X))

## Training models

In [21]:
Random_f = RandomForestClassifier(random_state= 1)

In [22]:
Extra_t = ExtraTreesClassifier(random_state= 1)

In [23]:
XG_boost = XGBClassifier(random_state = 1)

In [24]:
light_gb = lgb.LGBMClassifier(random_state = 1)

In [25]:
# Train the random forest. y_train is a one-column DataFrame; flatten it to a
# 1-D array so scikit-learn does not emit its column-vector conversion warning.
Random_f.fit(X_train, y_train.values.ravel())

  Random_f.fit(X_train, y_train)


RandomForestClassifier(random_state=1)

In [26]:
# Train the extra-trees model. y_train is a one-column DataFrame; flatten it to
# a 1-D array so scikit-learn does not emit its column-vector conversion warning.
Extra_t.fit(X_train, y_train.values.ravel())

  Extra_t.fit(X_train, y_train)


ExtraTreesClassifier(random_state=1)

In [27]:
# Train XGBoost. y_train is a one-column DataFrame; flatten it to a 1-D array
# so the label-encoding step does not emit the column-vector conversion warning.
XG_boost.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
# Train LightGBM. y_train is a one-column DataFrame; flatten it to a 1-D array
# so the label-encoding step does not emit the column-vector conversion warning.
light_gb.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(random_state=1)

In [29]:
rf_pred = Random_f.predict(X_test)

In [30]:
ex_pred = Extra_t.predict(X_test)

In [31]:
XG_pred = XG_boost.predict(X_test)

In [32]:
lg_pred = light_gb.predict(X_test)

In [33]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix


In [34]:
xg_acc = accuracy_score(y_test, XG_pred)
xg_acc

0.9455

In [35]:
ex = accuracy_score(y_test, ex_pred)
ex

0.9285

In [36]:
rf_acc = accuracy_score(y_test, rf_pred) 
rf_acc

0.9295

In [37]:
from sklearn.model_selection import RandomizedSearchCV

In [38]:
# Search space for the randomized hyper-parameter search.
# min_samples_split starts at 2: an integer value of 1 is rejected by the tree
# ensembles, so every sampled candidate using it would fail to fit.
params = {
    'n_estimators': range(500, 1000, 50),
    'criterion': ['gini', 'entropy'],
    'max_depth': range(60, 80),
    'min_samples_split': range(2, 15),
    'min_samples_leaf': range(2, 15),
}

In [39]:
# Base estimator for the randomized search. Seeded like the other estimators in
# this notebook (random_state=1) so the search results are reproducible.
extra_tree2 = ExtraTreesClassifier(random_state=1)

In [40]:
clf = RandomizedSearchCV(extra_tree2, params, n_iter=100, cv=5, random_state=1)

In [None]:
# Run the randomized search and keep the fitted search object as `model`; the
# next cell reads model.best_estimator_ and fails with a NameError otherwise.
# Expensive: clf samples 100 configurations and evaluates each with 5-fold CV.
# The target is flattened to 1-D to avoid the column-vector warning on each fit.
model = clf.fit(X_train, y_train.values.ravel())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [None]:
from pprint import pprint
pprint(model.best_estimator_.get_params())

In [None]:
# 5-fold cross-validated macro-F1 of a logistic regression on the scaled
# training split. Uses this notebook's own X_train / y_train; the names in the
# pasted template (log_reg, normalised_train_df, y_balanced) are not defined here.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(), X_train, y_train.values.ravel(), cv=5, scoring='f1_macro')
scores


In [None]:
# K-Fold Cross Validation
# Splits the training data into 5 consecutive folds, fits a logistic regression
# on 4 of them and records the F1 score (as a percentage) on the held-out fold.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
y_train_1d = y_train.values.ravel()  # flat labels, indexable by position
f1_scores = []
# run for every split
for train_index, test_index in kf.split(X_train):
    x_fold_train, x_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train_1d[train_index], y_train_1d[test_index]
    # fold-local name so the fitted search object `model` is not overwritten
    fold_model = LogisticRegression().fit(x_fold_train, y_fold_train)
    # save result to list; pos_label=1 is the encoded 'stable' class
    f1_scores.append(
        f1_score(y_true=y_fold_val, y_pred=fold_model.predict(x_fold_val), pos_label=1) * 100
    )
f1_scores

In [None]:
# Stratified K-Fold Cross Validation
# Same as plain K-Fold, but each of the 5 shuffled folds keeps the class balance
# of the training labels. Fold-local names are used so the notebook-level
# y_train / y_test split is not overwritten.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
y_train_1d = y_train.values.ravel()  # flat labels, indexable by position
f1_scores = []
# run for every split
for train_index, test_index in skf.split(X_train, y_train_1d):
    x_fold_train, x_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train_1d[train_index], y_train_1d[test_index]
    fold_model = LogisticRegression().fit(x_fold_train, y_fold_train)
    # save result to list; pos_label=1 is the encoded 'stable' class
    f1_scores.append(
        f1_score(y_true=y_fold_val, y_pred=fold_model.predict(x_fold_val), pos_label=1)
    )
f1_scores

In [None]:
# Leave One Out Cross Validation (LOOCV)
# Every training row is held out once, so this performs one fit per row of
# X_train and is slow on the full training split.
from sklearn.model_selection import LeaveOneOut, cross_val_score
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), X_train, y_train.values.ravel(), cv=loo,
                         scoring='f1_macro')
average_score = scores.mean() * 100
average_score


In [None]:
from sklearn.metrics import (recall_score, accuracy_score, precision_score, f1_score,
                             confusion_matrix)

# Fit a logistic-regression baseline on the scaled training split and evaluate
# it on the scaled test split. Labels are the encoded classes: 0 = 'unstable',
# 1 = 'stable'; 'stable' (1) is treated as the positive class.
log_reg = LogisticRegression().fit(X_train, y_train.values.ravel())
new_predictions = log_reg.predict(X_test)
y_true_1d = y_test.values.ravel()  # local flat copy; y_test itself is untouched

# Confusion matrix: rows are true classes, columns are predicted classes,
# both in the order given by `labels`.
cnf_mat = confusion_matrix(y_true=y_true_1d, y_pred=new_predictions, labels=[0, 1])
print(cnf_mat)

# Accuracy
accuracy = accuracy_score(y_true=y_true_1d, y_pred=new_predictions)
print('Accuracy: {}'.format(round(accuracy * 100, 2)))

# Precision
precision = precision_score(y_true=y_true_1d, y_pred=new_predictions, pos_label=1)
print('Precision: {}'.format(round(precision * 100, 2)))

# Recall
recall = recall_score(y_true=y_true_1d, y_pred=new_predictions, pos_label=1)
print('Recall: {}'.format(round(recall * 100, 2)))

# F1-Score
f1 = f1_score(y_true=y_true_1d, y_pred=new_predictions, pos_label=1)
print('F1: {}'.format(round(f1 * 100, 2)))