## Set Default directory

In [1]:
%cd C:\Users\negas\OneDrive\Desktop\dDevice

C:\Users\negas\OneDrive\Desktop\dDevice


## Import libraries    

In [3]:
import pandas as pd
import numpy as np
from collections import Counter

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold

In [28]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [24]:
import warnings
warnings.filterwarnings('ignore')
# %matplotlib inline

## Read Data from csv

In [4]:
dev_data= pd.read_csv('dev_model.csv')
dev_data.head()

Unnamed: 0.1,Unnamed: 0,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric8,metric9,diff
0,0,0,1.323358,-0.047478,-0.053516,2.193905,-0.515755,1.485268,-0.039335,-0.028479,0
1,1,0,-1.71358,-0.047478,-0.053516,2.193905,-0.515755,1.485268,-0.039335,-0.028479,1
2,2,0,0.023124,-0.047478,-0.053516,2.193905,-0.515755,1.485268,-0.039335,-0.028479,2
3,3,0,0.080687,-0.047478,-0.053516,2.193905,-0.515755,1.485278,-0.039335,-0.028479,3
4,4,0,-0.35474,-0.047478,-0.053516,2.193905,-0.515755,1.492086,-0.039335,-0.028479,4


## Check for null values in the dataset

In [8]:
dev_data.isnull().sum()

Unnamed: 0    0
device        0
metric1       0
metric2       0
metric3       0
metric4       0
metric5       0
metric6       0
metric8       0
metric9       0
failure       0
diff          0
dtype: int64

## Target variable balance

In [5]:
dev_data['failure'].value_counts(dropna=False)

0    124388
1       106
Name: failure, dtype: int64

## Split Target and Feature variable columns

In [6]:
X= dev_data.iloc[:,2:]
y= dev_data.iloc[:,1]

## Apply SMOTE to address the inbalance of the of the target variable

In [11]:
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

In [12]:
X, y = over.fit_resample(X,y)
X, y = under.fit_resample(X, y)

## Split train and test data

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1234)

In [31]:
Counter(y)

Counter({0: 24876, 1: 12438})

## Hyperparameter Tunning 

In [16]:
'''Put the three models in dictionary as keys and their respective parameters as values'''
param_grid = {
    'logistic' : {'C': [1, 10, 100, 1000]},
    'rforest' : { 'max_features': [ 'sqrt', 'log2']},
    'gboost' : { "max_features":["log2","sqrt"],}
    
   }

In [19]:
''' Instantiate the models'''
logisticReg = LogisticRegression()
rforestM = RandomForestClassifier()
gboostM = GradientBoostingClassifier()

In [20]:
'''Put the models again in dictionary with their instance models as values'''
models = { "logistic": logisticReg, 
          "rforest": rforestM,
         "gboost" : gboostM
         }

In [25]:
'''Iterate through the models and their parameters and get the best parameter for each model'''
best_models = {}
for m in models.keys():
    print(m)
    grid = GridSearchCV(models[m], param_grid[m], cv=10, scoring='f1')
    grid.fit(X_train, y_train)
    best_models[m] = grid

logistic
rforest
gboost


## Model comparison

In [29]:
''' Compare best winner model'''
for m in best_models.keys():
    m_ = best_models[m]
    #train
    y_pred_train = m_.predict(X_train)
    f1_train = round(f1_score(y_train, y_pred_train, average='macro'),2)
    precision_train = round(precision_score(y_train, y_pred_train, average='macro'),2)
    recall_train = round(recall_score(y_train, y_pred_train, average='macro'),2)
    
    #Test
    y_pred = m_.predict(X_test)
    f1 = round(f1_score(y_test, y_pred, average='macro'),2)
    precision = round(precision_score(y_test, y_pred, average='macro'),2)
    recall = round(recall_score(y_test, y_pred, average='macro'),2)
    print(m)
    print(f'f1_train={f1_train}, f1_test={f1}')
    print(f'precision_train={precision_train}, precision={precision}')
    print(f'recall_train={recall_train}, recall={recall}')
    
    print("+++++++")

logistic
f1_train=0.77, f1_test=0.76
precision_train=0.87, precision=0.86
recall_train=0.75, recall=0.74
+++++++
rforest
f1_train=1.0, f1_test=1.0
precision_train=1.0, precision=1.0
recall_train=1.0, recall=1.0
+++++++
gboost
f1_train=0.96, f1_test=0.96
precision_train=0.97, precision=0.96
recall_train=0.96, recall=0.96
+++++++
