Modelling

In [1]:
from imblearn.over_sampling import SMOTE

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc,ConfusionMatrixDisplay

Load the data from previous notebook

In [4]:
df=pd.read_csv('..\Capstone-2-Data-wranglling\preprocessed.csv',index_col='id')
X_train=pd.read_csv('..\Capstone-2-Data-wranglling\X_train.csv',index_col='id')
y_train=pd.read_csv('..\Capstone-2-Data-wranglling\y_train.csv',index_col='id')
X_test=pd.read_csv('..\Capstone-2-Data-wranglling\X_test.csv',index_col='id')
y_test=pd.read_csv('..\Capstone-2-Data-wranglling\y_test.csv',index_col='id')

In [5]:
X_train.shape

(3576, 20)

In [6]:
y_train.shape

(3576, 1)

In [7]:
y_train.value_counts()

stroke
0         3402
1          174
dtype: int64

Oversampling SMOTE and ADASYN, and Undersampling method was appplied for the imbalanced classification data.

In [8]:
sm=SMOTE()
X_train_sm,y_train_sm=sm.fit_resample(X_train,y_train)

In [26]:
from imblearn.under_sampling import RandomUnderSampler
RUS=RandomUnderSampler()
X_train_us,y_train_us=RUS.fit_resample(X_train,y_train)

In [10]:
from imblearn.over_sampling import ADASYN
AD=ADASYN()
X_train_ad,y_train_ad=AD.fit_resample(X_train,y_train)

In [11]:
X_train_sm.shape

(6804, 20)

In [12]:
y_train_sm.shape

(6804, 1)

In [13]:
X_train_us.shape

(348, 20)

In [14]:
y_train_us.value_counts()

stroke
0         174
1         174
dtype: int64

In [15]:
y_train_ad.value_counts()

stroke
0         3402
1         3334
dtype: int64

LogisticRegression Model used to compare the result with the imbalanced dataset from the previous notebook.

In [16]:
LR=LogisticRegression()
LR.fit(X_train_sm,y_train_sm)
ypred=LR.predict(X_test)

In [17]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.97      0.88      0.93      1458
           1       0.19      0.53      0.28        75

    accuracy                           0.87      1533
   macro avg       0.58      0.71      0.60      1533
weighted avg       0.94      0.87      0.89      1533



In [18]:
print(accuracy_score(y_test,ypred))

0.8662752772341813


In [19]:
cf_mat=confusion_matrix(y_test,ypred)
cf_mat

array([[1288,  170],
       [  35,   40]], dtype=int64)

Modelling with Oversampled Data(SMOTE)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
RF=RandomForestClassifier(random_state=0,oob_score=True)
RF.fit(X_train_sm,y_train_sm)
ypredRF=RF.predict(X_test)

In [22]:
print(RF.oob_score_)

0.9609053497942387


In [23]:
print(accuracy_score(y_test,ypredRF))

0.9125896934116112


In [24]:
print(classification_report(y_test,ypredRF))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1458
           1       0.16      0.19      0.17        75

    accuracy                           0.91      1533
   macro avg       0.56      0.57      0.56      1533
weighted avg       0.92      0.91      0.92      1533



In [28]:
cfrf=confusion_matrix(y_test,ypredRF)
cfrf

array([[1389,   69],
       [  62,   13]], dtype=int64)

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
param_grid={
    'n_estimators':np.arange(1,50),
    'max_depth':[4,5,6,7,8],
    'criterion':['gini','entropy'],
    'max_features':[3,5,7],
    'min_samples_split':[1,2,3]
           }

In [31]:
RF=RandomForestClassifier()
rf_grid=GridSearchCV(estimator=RF,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
rf_grid.fit(X_train_sm,y_train_sm)
y_pred_rf=rf_grid.predict(X_test)
print(classification_report(y_test,y_pred_rf))

Fitting 3 folds for each of 4410 candidates, totalling 13230 fits
              precision    recall  f1-score   support

           0       0.97      0.80      0.88      1458
           1       0.11      0.49      0.18        75

    accuracy                           0.79      1533
   macro avg       0.54      0.65      0.53      1533
weighted avg       0.93      0.79      0.84      1533



In [32]:
print(rf_grid.best_score_)
print(rf_grid.best_params_)

0.8872721928277484
{'criterion': 'gini', 'max_depth': 8, 'max_features': 3, 'min_samples_split': 2, 'n_estimators': 38}


In [33]:
from sklearn.model_selection import ParameterGrid

In [34]:
rf=RandomForestClassifier(oob_score=True)
param_grid={
    'n_estimators':np.arange(1,50),
    'max_depth':[5,6,7,8],
    'criterion':['gini','entropy'],
    'max_features':[3,5,7],
    #'min_samples_split':[2,3,4]
           }

In [35]:
best_score=0
for g in ParameterGrid(param_grid):
    rf.set_params(**g)
    rf.fit(X_train_sm,y_train_sm)
    # save if best
    if rf.oob_score_ > best_score:
        best_score = rf.oob_score_
        best_grid = g

print ("OOB: %0.5f" % best_score) 
print ("Grid:", best_grid)

OOB: 0.88566
Grid: {'criterion': 'gini', 'max_depth': 8, 'max_features': 3, 'n_estimators': 40}


In [45]:
rf=RandomForestClassifier(criterion= 'gini', max_depth= 8, max_features= 3, n_estimators=40)
rf.fit(X_train_sm,y_train_sm)
y_pred_rf=rf.predict(X_test)
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.97      0.77      0.86      1458
           1       0.11      0.56      0.19        75

    accuracy                           0.76      1533
   macro avg       0.54      0.67      0.52      1533
weighted avg       0.93      0.76      0.83      1533



In [46]:
print(confusion_matrix(y_test, y_pred_rf))

[[1127  331]
 [  33   42]]


In [None]:
#from bayes_opt import BayesianOptimization


In [194]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2
Note: you may need to restart the kernel to use updated packages.


In [47]:
from xgboost import XGBClassifier

In [48]:
xgb=XGBClassifier(learning_rate=0.1,objective='binary:logistic',random_state=0,eval_metric='error')
xgb.fit(X_train_sm,y_train_sm)
ypred_xgb=xgb.predict(X_test)


In [49]:
print(confusion_matrix(y_test,ypred_xgb))

[[1339  119]
 [  53   22]]


In [50]:
print(classification_report(y_test,ypred_xgb))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1458
           1       0.16      0.29      0.20        75

    accuracy                           0.89      1533
   macro avg       0.56      0.61      0.57      1533
weighted avg       0.92      0.89      0.90      1533



In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [56]:
xgb=XGBClassifier(objective='binary:logistic',random_state=0,eval_metric='auc',)
params={ 'max_depth': [3, 5, 6, 10],
           'learning_rate': [0.1, 0.2, 0.3, 0.4],
           'subsample': np.arange(0.6, 0.8, 1.0),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
random_search=RandomizedSearchCV(xgb,param_distributions=params,n_iter=5,n_jobs=4,verbose=2)
random_search.fit(X_train_sm,y_train_sm)
rs_ypred=random_search.predict(X_test)
print(random_search.best_params_)
print(confusion_matrix(y_test,rs_ypred))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
{'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7999999999999999}
[[1401   57]
 [  63   12]]


In [54]:
#xgb=XGBClassifier(objective='binary:logistic',random_state=0,eval_metric='auc',subsample= 0.6, n_estimators= 500, max_depth= 10, learning_rate= 0.3, colsample_bytree= 0.7999999999999999)
#xgb.fit(X_train_sm,y_train_sm)
#ypred_xgb=xgb.predict(X_test)
#print(confusion_matrix(y_test,ypred_xgb))

[[1394   64]
 [  67    8]]


In [62]:
from lightgbm import LGBMClassifier

In [233]:
lgbm=LGBMClassifier()
lgbm.fit(X_train_sm,y_train_sm)
ypred_lgbm=lgbm.predict(X_test)

In [234]:
print(confusion_matrix(y_test,ypred_lgbm))

[[1392   66]
 [  62   13]]


In [63]:
lgbm=LGBMClassifier()
params_lgbm = {
    'learning_rate': [0.05,0.01,0.0001],
    'num_leaves': [90,140,200],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'max_depth' : [4,5,6,7],
    'random_state' : [42], 
    'colsample_bytree' : [0.6,0.7,0.8,1.0],
    'subsample' : [0.5,0.6,0.7,0.8,1.0],
    'min_split_gain' : [0.01],
    'min_data_in_leaf':[10],
    'metric':['auc']
    }
clf=LGBMClassifier()
LGBMRS=RandomizedSearchCV(clf,param_distributions=params_lgbm,verbose=2,cv=10,n_jobs=-1,n_iter=10)
LGBMRS.fit(X_train_sm,y_train_sm)
ypred_lgbm=LGBMRS.predict(X_test)
print(LGBMRS.best_params_)
print(confusion_matrix(y_test,ypred_lgbm))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
{'subsample': 0.8, 'random_state': 42, 'objective': 'binary', 'num_leaves': 200, 'min_split_gain': 0.01, 'min_data_in_leaf': 10, 'metric': 'auc', 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.6, 'boosting_type': 'gbdt'}
[[1312  146]
 [  49   26]]


In [244]:
clf=LGBMClassifier()
LGBMRS=RandomizedSearchCV(clf,param_distributions=params_lgbm,verbose=2,cv=10,n_jobs=-1,n_iter=10)
LGBMRS.fit(X_train_sm,y_train_sm)
ypred_lgbm=LGBMRS.predict(X_test)
print(LGBMRS.best_params_)
print(confusion_matrix(y_test,ypred_lgbm)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
{'subsample': 0.8, 'random_state': 42, 'objective': 'binary', 'num_leaves': 140, 'min_split_gain': 0.01, 'min_data_in_leaf': 10, 'metric': 'auc', 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.8, 'boosting_type': 'gbdt'}


In [245]:
lgbm=LGBMClassifier(subsample= 0.8, random_state= 42, objective= 'binary', num_leaves= 140, min_split_gain= 0.01, min_data_in_leaf =10, metric= 'auc', max_depth= 7, learning_rate= 0.05, colsample_bytree= 0.8, boosting_type= 'gbdt')
lgbm.fit(X_train_sm,y_train_sm)
ypred_lgbm=lgbm.predict(X_test)
print(confusion_matrix(y_test,ypred_lgbm))

[[1323  135]
 [  50   25]]


In [246]:
print(classification_report(y_test,ypred_lgbm))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93      1458
           1       0.16      0.33      0.21        75

    accuracy                           0.88      1533
   macro avg       0.56      0.62      0.57      1533
weighted avg       0.92      0.88      0.90      1533



In [65]:
params = {
    'learning_rate': [0.05,0.01,0.0001],
    'num_leaves': [90,140,200],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'max_depth' : [3,4,5,6,7,8],
    'random_state' : [42], 
    'colsample_bytree' : [0.5,0.6,0.7,0.8,1.0],
    'subsample' : [0.5,0.6,0.7,0.8,1.0],
    'min_split_gain' : [0.01],
    'min_data_in_leaf':[10],
    'metric':['auc']
    }
clf = LGBMClassifier()
RSCV = RandomizedSearchCV(clf,params,verbose=3,cv=10,n_jobs = -1,n_iter=10)
RSCV.fit(X_train_sm,y_train_sm)
RSCV_pred=RSCV.predict(X_test)
print(classification_report(y_test,RSCV_pred))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1458
           1       0.16      0.29      0.21        75

    accuracy                           0.89      1533
   macro avg       0.56      0.61      0.57      1533
weighted avg       0.92      0.89      0.90      1533

