In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
elect_grid = pd.read_csv('Data_for_UCI_named.csv')

In [3]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
elect_grid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


## Following the Instructions.

In [5]:
# dropping stab column
elect_grid.drop('stab', axis=1, inplace= True )

In [6]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [7]:
# consistency check of the stabf column
elect_grid.stabf.unique()

array(['unstable', 'stable'], dtype=object)

In [8]:
# Changing stable and unstable to 1 and 0 respectively
def change_funct(x):
    if x == 'unstable':
        return 0
    else:
        return 1
elect_grid['stabf'] = elect_grid.stabf.map(change_funct)

In [9]:
elect_grid.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0


In [10]:
X = elect_grid.drop('stabf', axis= 1)
y = elect_grid[['stabf']]

In [11]:
y.stabf.unique()

array([0, 1], dtype=int64)

In [12]:
# splitting the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
scale_train = StandardScaler()

In [14]:
X_train= pd.DataFrame(scale_train.fit_transform(X_train), columns = X.columns, index = X_train.index)

In [15]:
X_train

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,0.367327,-0.986042,0.650447,1.547527,-0.291490,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
5140,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2568,-1.467850,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.682560,-0.855302,1.399350,1.451534,-1.045743,0.492489
3671,0.820081,0.529920,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.658780,-0.958319,1.361958,1.604140,0.275303
7427,0.665424,-1.425627,0.312300,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.695660,1.137504,-1.312575
...,...,...,...,...,...,...,...,...,...,...,...,...
2895,1.551314,0.007408,-1.177640,1.016898,-0.397177,0.759820,-0.636951,0.572703,-1.209413,0.313976,-1.625728,-0.637401
7813,1.015925,-0.223483,-1.489381,-1.479078,0.451468,-0.731994,0.990355,-1.048148,-1.094647,-0.755209,0.734821,-0.304433
905,0.657609,-0.722756,-1.405888,-0.274301,-0.012584,1.438694,-0.364266,-1.046683,1.253539,0.293100,-1.550587,0.810344
5192,-0.059316,-1.260532,-1.010471,-0.877808,-0.779769,0.828824,0.516923,0.018984,-0.182448,-0.388255,-0.726781,1.667916


In [16]:
y_train

Unnamed: 0,stabf
2694,0
5140,0
2568,0
3671,0
7427,0
...,...
2895,1
7813,1
905,0
5192,0


In [17]:
X_test

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,6.877876,4.113820,9.356768,8.299753,4.056779,-1.897470,-1.590581,-0.568728,0.276567,0.845536,0.112440,0.822562
3850,5.802841,6.271371,4.731540,3.819867,3.579569,-1.709480,-1.067511,-0.802579,0.077527,0.416478,0.912846,0.861306
4962,2.286998,4.385142,2.830232,5.293880,3.035814,-1.202764,-0.902011,-0.931039,0.924216,0.130186,0.703887,0.063811
3886,5.019920,2.209962,6.266080,0.578901,4.322584,-1.960207,-1.074561,-1.287815,0.546910,0.065992,0.427349,0.814648
5437,7.646145,9.187896,5.484219,9.934313,3.634226,-1.254541,-1.335366,-1.044319,0.561528,0.121611,0.787318,0.300314
...,...,...,...,...,...,...,...,...,...,...,...,...
3919,8.320531,3.394352,1.468729,8.766874,3.940123,-1.599081,-0.931023,-1.410020,0.657445,0.493384,0.178907,0.173240
162,2.290102,3.028512,8.071234,6.169944,3.624592,-1.104003,-0.820951,-1.699638,0.785595,0.770194,0.234290,0.225070
7903,7.848945,0.684889,0.728223,6.702948,2.599685,-0.563497,-1.210069,-0.826119,0.185122,0.559799,0.682418,0.886902
2242,2.173967,5.776797,4.597638,6.399330,2.869349,-0.931891,-1.237955,-0.699503,0.942680,0.743103,0.152161,0.305183


In [50]:
X_test= pd.DataFrame(scale_train.transform(X_test), columns = X.columns, index= X_test.index)
X_test

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,-1.696020,-2.064177,-1.370032,-1.511280,-4.414965,-0.558082,1.070836,6.481174,-5.188601,2.338892,-7.420497,2.040740
3850,-1.838784,-1.776998,-1.989529,-2.111075,-5.245062,0.446119,3.847171,5.242734,-7.827314,-3.353915,3.246549,2.555956
4962,-2.305687,-2.028063,-2.244187,-1.913725,-6.190913,3.152882,4.725604,4.562425,3.397391,-7.152493,0.461746,-8.049119
3886,-1.942756,-2.317589,-1.783994,-2.544996,-3.952603,-0.893212,3.809750,2.672985,-1.604621,-8.004219,-3.223686,1.935497
5437,-1.593994,-1.388795,-1.888716,-1.292435,-5.149988,2.876302,2.425458,3.962508,-1.410819,-7.266269,1.573634,-4.904110
...,...,...,...,...,...,...,...,...,...,...,...,...
3919,-1.504435,-2.159942,-2.426545,-1.448739,-4.617886,1.035847,4.571618,2.025806,-0.139232,-2.333521,-6.534685,-6.593925
162,-2.305275,-2.208636,-1.542214,-1.796432,-5.166746,3.680442,5.155850,0.492029,1.559675,1.339239,-5.796600,-5.904699
7903,-1.567062,-2.520583,-2.525728,-1.725070,-6.949551,6.567701,3.090505,5.118069,-6.400908,-1.452317,0.175625,2.896330
2242,-2.320698,-1.842828,-2.007463,-1.765720,-6.480475,4.599824,2.942491,5.788610,3.642169,0.979798,-6.891131,-4.839361


In [19]:
y_test

Unnamed: 0,stabf
9953,0
3850,0
4962,1
3886,1
5437,0
...,...
3919,1
162,1
7903,1
2242,0


## Training models

In [21]:
Random_f = RandomForestClassifier(random_state= 1)

In [22]:
Extra_t = ExtraTreesClassifier(random_state= 1)

In [23]:
XG_boost = XGBClassifier(random_state = 1)

In [24]:
light_gb = lgb.LGBMClassifier(random_state = 1)

In [25]:
# rf_ train
Random_f.fit(X_train, y_train)

  Random_f.fit(X_train, y_train)


RandomForestClassifier(random_state=1)

In [26]:
Extra_t.fit(X_train, y_train)

  Extra_t.fit(X_train, y_train)


ExtraTreesClassifier(random_state=1)

In [27]:
XG_boost.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
light_gb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(random_state=1)

In [29]:
rf_pred = Random_f.predict(X_test)

In [30]:
ex_pred = Extra_t.predict(X_test)

In [31]:
XG_pred = XG_boost.predict(X_test)

In [32]:
lg_pred = light_gb.predict(X_test)

## QUIZ SOLUTIONS

#### Number 3 solution.

In [33]:
xg_acc = accuracy_score(y_test, XG_pred)
xg_acc

0.9455

#### Number 7 solutions.

In [34]:
params ={'n_estimators': range(100, 1001, 100), 'criterion': ['gini', 'entropy'], 'max_depth': range(60, 80),
         'min_samples_split': range(1, 15), 'min_samples_leaf': range(2,15), 'max_features': [None, 'log2',
                                                                                             'auto']}

In [35]:
extra_tree2 = ExtraTreesClassifier()

In [36]:
clf = RandomizedSearchCV(extra_tree2, params, random_state=1)

In [37]:
model = clf.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [38]:
from pprint import pprint
pprint(model.best_estimator_.get_params())

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 60,
 'max_features': None,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [39]:
# normal extra tree
accuracy_score(y_test, ex_pred)

0.9285

In [40]:
extra_tre_opt = model.predict(X_test)
accuracy_score(y_test, extra_tre_opt)
#higher

0.935

#### Number 8 solutions.

In [41]:
extra_tree3 = ExtraTreesClassifier()

In [42]:
clf1 = RandomizedSearchCV(extra_tree3, params, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, 
                          random_state = 1)

In [43]:
model1= clf1.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [44]:
pprint(model1.best_estimator_.get_params())

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 60,
 'max_features': None,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


#### Number 9 solution.

In [45]:
feat_imp = pd.DataFrame({'feature':X.columns, 'val':model.best_estimator_.feature_importances_})

In [46]:
#maximum
feat_imp[feat_imp.val == feat_imp.val.max()]

Unnamed: 0,feature,val
1,tau2,0.124718


In [47]:
# minimum 
feat_imp[feat_imp.val == feat_imp.val.min()]

Unnamed: 0,feature,val
4,p1,0.015485


#### Number 10

In [48]:
rf_acc = accuracy_score(y_test, rf_pred) 
rf_acc

0.9295

#### Number 20

In [49]:
light_acc = accuracy_score(y_test, lg_pred)
light_acc

0.9395