### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

#### Loading the Stability of the Grid System Dataset

In [2]:
data = pd.read_csv("Data_for_UCI_named.csv")
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
#Checking For Null Values
data.isnull().sum().sort_values(ascending=True)

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

#### Dropping the Stab Column

In [4]:
data2 = data.drop("stab", axis='columns')
data2.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [5]:
data2.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [6]:
X = data2.drop('stabf', axis='columns')
y = data2['stabf']

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


#### Data Normalization using the Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

In [9]:
x_test_scaled = scaler.transform(x_test)

### Question 9. Random Forest Classifier

Accuracy score 0.929


In [10]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=1)
forest.fit(x_train_scaled, y_train)

RandomForestClassifier(random_state=1)

In [11]:
from sklearn.metrics import accuracy_score
pred = forest.predict(x_test_scaled)
## Accuracy
print("Accuracy score {}".format(round(accuracy_score(y_test, pred), 4)))

Accuracy score 0.929


### Question 7. XGboost Classifier

Accuracy score 0.9455

In [12]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(x_train_scaled, y_train)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
pred = xgb_model.predict(x_test_scaled)
## Accuracy
print("Accuracy score {}".format(round(accuracy_score(y_test, pred), 4)))

Accuracy score 0.9455


### Question 8

F1_score = 0.26357

In [14]:
Precision = (255/ ( 255+45)) 
Recall =  (255 /(255+1380)) 
F1_Score = 2 * (Precision*Recall)/(Precision + Recall)
print(round(F1_Score,5))


0.26357


### Question 4. LGBM

Accuracy score 0.9395

In [15]:
import lightgbm as lgb
clf = lgb.LGBMClassifier(random_state=1)
clf.fit(x_train_scaled, y_train)

LGBMClassifier(random_state=1)

In [16]:
pred = clf.predict(x_test_scaled)
## Accuracy
print("Accuracy score {}".format(round(accuracy_score(y_test, pred), 4)))

Accuracy score 0.9395


### Question 5. ExtraTreesClassifier

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [17]:
from sklearn.ensemble import ExtraTreesClassifier
ETC_model = ExtraTreesClassifier(random_state = 1)


In [18]:
n_estimators = [50, 100, 300, 500, 1000]   

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}


In [19]:
from sklearn.model_selection import RandomizedSearchCV
Random_search = RandomizedSearchCV(estimator = ETC_model, param_distributions= hyperparameter_grid, random_state=1,cv = 5, n_iter=10,scoring='accuracy',n_jobs=1, verbose=1)


In [20]:
model = Random_search.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [21]:
model.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

### Question 3

->> Lower

The accuracy of the new optimal model is Lower than the initial ExtraTreesClassifier model with no hyperparameter tuning

In [22]:
model2 = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, 
                                 min_samples_leaf=8, max_features=None)

In [23]:
model2.fit(x_train_scaled, y_train)
pred = model2.predict(x_test_scaled)
## Accuracy
print("Accuracy score {}".format(round(accuracy_score(y_test, pred), 4)))

Accuracy score 0.9275


In [24]:
##Comparing result with the originall extratreeclassifier without tuning
ETC_model = ExtraTreesClassifier(random_state = 1)
ETC_model.fit(x_train_scaled, y_train)
pred = ETC_model.predict(x_test_scaled)
## Accuracy
print("Accuracy score {}".format(round(accuracy_score(y_test, pred), 4)))

Accuracy score 0.928


### Question 17

The most important feature is "tau2" and the least important feature is "p1"

In [25]:
important_features = model.best_estimator_.feature_importances_
print ('Important_features :\n', important_features) 

Important_features :
 [0.13723975 0.1405075  0.13468029 0.13541676 0.00368342 0.00533686
 0.00542927 0.00496249 0.10256244 0.10757765 0.11306268 0.10954089]


In [26]:
sorted (zip(important_features,X), reverse = True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]