In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
egrids_data = pd.read_csv('Data_for_UCI_named.csv')
egrids_data.drop(columns=['stab'], inplace=True)
def categorical_value_to_binary(value):
    if value == 'unstable':
        return 1
    else:
        return 0
egrids_data['stabf'] = egrids_data['stabf'].apply(categorical_value_to_binary)

In [3]:
"""
this extracts all of the data to a corresponding number
of dimensions, and size and shape of the data frame as a numpy array
"""
array = egrids_data.values
X = array[:,:12]
y = array[:,12:13]


In [4]:
X#.shape

array([[2.95906002, 3.0798852 , 8.38102539, ..., 0.85957811, 0.88744492,
        0.95803399],
       [9.30409723, 4.90252411, 3.04754073, ..., 0.86241408, 0.56213905,
        0.78175991],
       [8.97170691, 8.84842842, 3.04647875, ..., 0.76668866, 0.83944402,
        0.10985324],
       ...,
       [2.36403419, 2.84203025, 8.77639096, ..., 0.98650532, 0.14928646,
        0.14598403],
       [9.63151069, 3.9943976 , 2.75707093, ..., 0.58755755, 0.88911835,
        0.81839133],
       [6.53052662, 6.7817899 , 4.34969522, ..., 0.50544105, 0.37876093,
        0.94263083]])

In [5]:
y#.shape

array([[1.],
       [0.],
       [1.],
       ...,
       [0.],
       [1.],
       [1.]])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# ADA BOOST CLASSIFIER

In [8]:
parameters = {
    'n_estimators' : [10,15,20,25,30,35,40,45,50,55,60]
}

In [9]:
"""
the adaboost uses the decision tree classifier as it default classifier
"""
model_1 = AdaBoostClassifier(random_state=1)
clf = GridSearchCV(model_1,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=10)
clf.fit(X_train,y_train)

In [10]:
clf.best_score_

0.8491250000000001

In [11]:
clf.best_params_

{'n_estimators': 60}

In [12]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs

In [13]:
ada_model = AdaBoostClassifier(random_state=1, n_estimators=clf.best_params_['n_estimators'])
ada_model.fit(X_train,y_train)
y_pred = ada_model.predict(X_test)
cnf_mat_1 = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[ 1 , 0 ])
precision_1 = precision_score(y_true=y_test, y_pred=y_pred, pos_label= 1)
recall_1 = recall_score(y_true=y_test, y_pred=y_pred, pos_label= 1)
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label= 1)
accuracy_1 = accuracy_score(y_true=y_test, y_pred=y_pred)
print('cnf_mat_1:',cnf_mat_1,'\n')
print('precision_1:',precision_1,'\n')
print('recall_1:',recall_1,'\n')
print('f1:',f1,'\n')
print('accuracy_1:',accuracy_1,'\n')
print('train_data_score:',ada_model.score(X_train, y_train),'\n')
print('test_data_score:',ada_model.score(X_test, y_test))

cnf_mat_1: [[1157  131]
 [ 187  525]] 

precision_1: 0.8608630952380952 

recall_1: 0.8982919254658385 

f1: 0.8791793313069908 

accuracy_1: 0.841 

train_data_score: 0.86525 

test_data_score: 0.841


# XGBOOST CLASSIFIER

In [14]:
from xgboost import XGBClassifier

In [15]:
parameters = {
    'n_estimators' : [10,15,20,25,30,35,40,45,50,55,60,65,70]
}

In [16]:
model_2 = XGBClassifier(random_state=1)
clf = GridSearchCV(model_2,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=10)
clf.fit(X_train,y_train)









In [17]:
clf.best_score_

0.9415000000000001

In [18]:
clf.best_params_

{'n_estimators': 70}

In [19]:
xgb_model = XGBClassifier(random_state=1, n_estimators=clf.best_params_['n_estimators'])
xgb_model.fit(X_train,y_train)
y_pred_2 = xgb_model.predict(X_test)
cnf_mat_2 = confusion_matrix(y_true=y_test, y_pred=y_pred_2, labels=[ 1 , 0 ])
precision_2 = precision_score(y_true=y_test, y_pred=y_pred_2, pos_label= 1)
recall_2 = recall_score(y_true=y_test, y_pred=y_pred_2, pos_label= 1)
f2 = f1_score(y_true=y_test, y_pred=y_pred_2, pos_label= 1)
accuracy_2 = accuracy_score(y_true=y_test, y_pred=y_pred_2)
print('cnf_mat_2:',cnf_mat_2,'\n')
print('precision_2:',precision_2,'\n')
print('recall_2:',recall_2,'\n')
print('f2:',f2,'\n')
print('accuracy_2:',accuracy_2,'\n')
print('train_data_score:',xgb_model.score(X_train, y_train),'\n')
print('test_data_score:',xgb_model.score(X_test, y_test))

cnf_mat_2: [[1246   42]
 [  70  642]] 

precision_2: 0.9468085106382979 

recall_2: 0.967391304347826 

f2: 0.956989247311828 

accuracy_2: 0.944 

train_data_score: 1.0 

test_data_score: 0.944


# GRADIENT BOOSTING

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
parameters = {'n_estimators' : [10,20,30,40,50],
              'learning_rate':[0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1],
              'max_features':[4,5,6,7],
              'max_depth':[4,5,6,7]
}

In [22]:
model_3 = GradientBoostingClassifier(random_state=1)
clf = RandomizedSearchCV(model_3,                    # model
                         parameters,   # hyperparameters
                         scoring='accuracy',        # metric for scoring
                         cv=10,
                         return_train_score=False,
                         n_iter=3)
clf.fit(X_train,y_train)

In [23]:
clf.best_score_

0.9257500000000001

In [24]:
clf.best_params_

{'n_estimators': 50, 'max_features': 6, 'max_depth': 7, 'learning_rate': 0.75}

In [25]:
gb_model =  GradientBoostingClassifier(random_state=1, n_estimators=clf.best_params_['n_estimators'],
                         learning_rate=clf.best_params_['learning_rate'],
                         max_features=clf.best_params_['max_features'],
                         max_depth=clf.best_params_['max_depth'])

gb_model.fit(X_train,y_train)
y_pred_3 = gb_model.predict(X_test)
cnf_mat_3 = confusion_matrix(y_true=y_test, y_pred=y_pred_3, labels=[ 1 , 0 ])
precision_3 = precision_score(y_true=y_test, y_pred=y_pred_3, pos_label= 1)
recall_3 = recall_score(y_true=y_test, y_pred=y_pred_3, pos_label= 1)
f3 = f1_score(y_true=y_test, y_pred=y_pred_3, pos_label= 1)
accuracy_3 = accuracy_score(y_true=y_test, y_pred=y_pred_3)
print('cnf_mat_3:',cnf_mat_3,'\n')
print('precision_3:',precision_3,'\n')
print('recall_3:',recall_3,'\n')
print('f3:',f3,'\n')
print('accuracy_3:',accuracy_3,'\n')
print('train_data_score:',gb_model.score(X_train, y_train),'\n')
print('test_data_score:',gb_model.score(X_test, y_test))

cnf_mat_3: [[1240   48]
 [  86  626]] 

precision_3: 0.9351432880844646 

recall_3: 0.9627329192546584 

f3: 0.9487375669472075 

accuracy_3: 0.933 

train_data_score: 1.0 

test_data_score: 0.933


In [26]:
from sklearn.ensemble import ExtraTreesClassifier

In [27]:
parameters = {'n_estimators' : [100,300,500,1000],
              'criterion':['gini','entropy','log_loss'],
              'min_samples_split':[2,5,6,7],
              'max_depth':[4,5,6,7],
              'min_samples_leaf':[2,4,6,8],
              'max_features':['sqrt', 'log2','auto', None]
}

In [28]:
model_4 = ExtraTreesClassifier(random_state=1, verbose=1, n_jobs=-1)
clf = RandomizedSearchCV(model_4,                    # model
                         parameters,   # hyperparameters
                         scoring='accuracy',        # metric for scoring
                         cv=5,
                         return_train_score=False,
                         n_iter=10)
clf.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    5.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks     

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 235 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend L

[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 220 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBacken

[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.9s finished


In [29]:
clf.best_score_

0.8657499999999999

In [30]:
clf.best_params_

{'n_estimators': 1000,
 'min_samples_split': 7,
 'min_samples_leaf': 6,
 'max_features': None,
 'max_depth': 5,
 'criterion': 'log_loss'}

In [36]:
ext_model =  ExtraTreesClassifier(random_state=1, n_estimators=clf.best_params_['n_estimators'],
                         min_samples_split=clf.best_params_['min_samples_split'],
                         min_samples_leaf=clf.best_params_['min_samples_leaf'],
                         max_features=clf.best_params_['max_features'],
                         max_depth=clf.best_params_['max_depth'], 
                         criterion=clf.best_params_['criterion'],
                         verbose=1,
                         n_jobs=-1
                                 )

ext_model.fit(X_train,y_train)
y_pred_4 = ext_model.predict(X_test)
cnf_mat_4 = confusion_matrix(y_true=y_test, y_pred=y_pred_4, labels=[ 1 , 0 ])
precision_4 = precision_score(y_true=y_test, y_pred=y_pred_4, pos_label= 1)
recall_4 = recall_score(y_true=y_test, y_pred=y_pred_4, pos_label= 1)
f4 = f1_score(y_true=y_test, y_pred=y_pred_4, pos_label= 1)
accuracy_4 = accuracy_score(y_true=y_test, y_pred=y_pred_4)
print('cnf_mat_4:',cnf_mat_4,'\n')
print('precision_4:',precision_4,'\n')
print('recall_4:',recall_4,'\n')
print('f4:',f4,'\n')
print('accuracy_4:',accuracy_4,'\n')
print('train_data_score:',ext_model.score(X_train, y_train),'\n')
print('test_data_score:',ext_model.score(X_test, y_test))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


cnf_mat_4: [[1264   24]
 [ 241  471]] 

precision_4: 0.8398671096345515 

recall_4: 0.9813664596273292 

f4: 0.9051199427139277 

accuracy_4: 0.8675 



[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s


train_data_score: 0.8795 

test_data_score: 0.8675


[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [31]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_iter

In [35]:
clf._select_best_index

<function sklearn.model_selection._search.BaseSearchCV._select_best_index(refit, refit_metric, results)>