In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("C:/Users/USER/Downloads/Data_for_UCI_named.csv")

In [3]:
df.drop('stab', axis=1, inplace = True)

In [4]:
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable


In [5]:
X = df.drop(columns= 'stabf' )
y = df[ 'stabf' ]

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [7]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.2 , random_state= 1 )

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)

## RandomForestClassifier model

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1)
clf.fit(x_train, y_train)
clf_y_pred = clf.predict(x_test)

from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score

clf_accuracy_score = accuracy_score(y_test, clf_y_pred)
clf_confusion_matrix = confusion_matrix(y_test, clf_y_pred)
clf_cross_validation_scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
clf_f1 = f1_score(clf_y_pred, y_test, average = 'macro')

In [10]:
clf_accuracy_score

0.929

In [11]:
clf_confusion_matrix

array([[ 625,   87],
       [  55, 1233]], dtype=int64)

In [12]:
clf_cross_validation_scores

array([0.90613656, 0.91620497, 0.9079931 , 0.91214574, 0.8990338 ])

In [13]:
clf_f1

0.9217703264931951

## ExtraTreesClassifier model

In [14]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1)
etc.fit(x_train, y_train)
etc_y_pred = etc.predict(x_test)

In [15]:
etc_accuracy_score = accuracy_score(y_test, etc_y_pred)
etc_accuracy_score

0.928

In [16]:
etc_confusion_matrix = confusion_matrix(y_test, etc_y_pred)
etc_confusion_matrix

array([[ 606,  106],
       [  38, 1250]], dtype=int64)

In [17]:
etc_cross_validation_scores = cross_val_score(etc, X, y, cv=5, scoring='f1_macro')
etc_cross_validation_scores

array([0.91291445, 0.91920023, 0.92117898, 0.92477044, 0.89466796])

In [18]:
etc_f1 = f1_score(etc_y_pred, y_test, average = 'macro')
etc_f1

0.9196711873937318

## XGBClassifier model

In [19]:
from xgboost import XGBClassifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=1)
xgb.fit(x_train, y_train)
xgb_y_pred = xgb.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score
xgb_accuracy_score = accuracy_score(y_test, xgb_y_pred)
xgb_accuracy_score

0.9455

In [21]:
xgb_confusion_matrix = confusion_matrix(y_test, xgb_y_pred)
xgb_confusion_matrix

array([[ 648,   64],
       [  45, 1243]], dtype=int64)

In [22]:
xgb_cross_validation_scores = cross_val_score(xgb, X, y, cv=5, scoring='f1_macro')
xgb_cross_validation_scores

array([0.94107885, 0.94040093, 0.94706337, 0.94276955, 0.93095032])

In [23]:
xgb_f1 = f1_score(xgb_y_pred, y_test, average = 'macro')
xgb_f1

0.9402080376305377

## lightgbm model

In [24]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
lgm = LGBMClassifier(random_state=1)
lgm.fit(x_train, y_train)

In [25]:
lgm_y_pred = lgm.predict(x_test)

In [26]:
lgm_accuracy = accuracy_score(y_test, lgm_y_pred)
lgm_accuracy

0.9395

In [27]:
lgm_confusion_matrix = confusion_matrix(y_test, lgm_y_pred)
lgm_confusion_matrix

array([[ 641,   71],
       [  50, 1238]], dtype=int64)

In [28]:
lgm_cross_validation_scores = cross_val_score(lgm, X, y, cv=5, scoring='f1_macro')
lgm_cross_validation_scores

array([0.94276955, 0.93829249, 0.94167944, 0.94115282, 0.92990607])

In [29]:
lgm_f1 = f1_score(lgm_y_pred, y_test, average = 'macro')
lgm_f1

0.9335820074207012

## Question 2: Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [30]:
def get_feature_importance(model,feat,col_name):
    importance = pd.Series(model.feature_importances_,feat.columns).sort_values()
    importance_df = pd.DataFrame(importance).reset_index()
    importance_df.columns = ['Features',col_name]
    importance_df[col_name].round(3)
    return importance_df

In [31]:
feature_importance = get_feature_importance(etc, x_train,'Feature_Importance')
feature_importance

Unnamed: 0,Features,Feature_Importance
0,p1,0.039507
1,p2,0.040371
2,p4,0.040579
3,p3,0.040706
4,g1,0.089783
5,g2,0.093676
6,g4,0.094019
7,g3,0.096883
8,tau3,0.113169
9,tau4,0.115466


## Answer: tau2 and p1

## Question 4:Train a new ExtraClassifier Model with the new Hyper parameters from the RandomizedSearchCV (with random state = 1). Is the accuracy of the new model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [32]:
new_etc = ExtraTreesClassifier(n_estimators = 1000,
                             min_samples_split = 2,
                             min_samples_leaf = 8,
                             max_features = None,
                             random_state = 1
                              )
new_etc.fit(x_train, y_train)

In [33]:
new_y_pred = new_etc.predict(x_test)

In [34]:
new_accuracy_score = accuracy_score(y_test, new_y_pred)
new_accuracy_score

0.927

## Answer: Lower

## Question 9: What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [35]:
print('Accuracy on random forest classifier is: {}'.format(clf_accuracy_score))

Accuracy on random forest classifier is: 0.929


## Answer: 0.929

## Question 10: Using the ExtraTreesClassifier as your estimator with cv = 5, n_jobs = -1, verbose = 1 and random state = 1. What are the best parameters from the randomized search CV?

In [36]:
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(etc,
                                   {'n_estimators': [50,100,300,500,1000],
                                    'min_samples_leaf': [1,2,4,6,8],
                                    'min_samples_split': [2,3,5,7,9],
                                    'max_features': ['auto','sqrt','log2', None]
                                   },
                                       cv=5, scoring="accuracy",verbose=1,n_jobs=-1, 
                             n_iter=10, random_state = 1)
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [37]:
random_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

## Answer: 'n_estimators': 1000,
##                 'min_samples_split': 2,
##                'min_samples_leaf': 8,
##                'max_features': None

## Question 15: What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [38]:
print('Accuracy on LGBM classifier is: {}'.format(lgm_accuracy))

Accuracy on LGBM classifier is: 0.9395


## Answer: 0.9395

## Question 19: What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.


In [39]:
print('Accuracy on XGboost classifier is: {}'.format(xgb_accuracy_score))

Accuracy on XGboost classifier is: 0.9455


## Answer: 0.9455