In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import log_loss

import os

## Bernoulli NB

In [3]:
os.chdir(r"C:\CDAC\6_Practical_Machine_Learning\Datasets")

Telecom Datset

In [4]:
telecom = pd.read_csv("Telecom.csv")
telecom

Unnamed: 0,Gender,TT_gt_100,Response
0,F,Y,N
1,M,N,N
2,M,N,N
3,F,Y,Y
4,F,N,N
...,...,...,...
145,F,Y,Y
146,M,N,Y
147,M,N,N
148,M,N,N


In [5]:
dum_tel = pd.get_dummies(telecom, drop_first=True)
dum_tel

Unnamed: 0,Gender_M,TT_gt_100_Y,Response_Y
0,0,1,0
1,1,0,0
2,1,0,0
3,0,1,1
4,0,0,0
...,...,...,...
145,0,1,1
146,1,0,1
147,1,0,0
148,1,0,0


In [6]:
X = dum_tel.drop('Response_Y', axis=1)
X

Unnamed: 0,Gender_M,TT_gt_100_Y
0,0,1
1,1,0
2,1,0
3,0,1
4,0,0
...,...,...
145,0,1
146,1,0
147,1,0
148,1,0


In [7]:
y = dum_tel['Response_Y']
y

0      0
1      0
2      0
3      1
4      0
      ..
145    1
146    1
147    0
148    0
149    0
Name: Response_Y, Length: 150, dtype: uint8

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2023, test_size=0.3)

In [9]:
nb = BernoulliNB()
nb.fit(X_train, y_train) # Aprior Probs get calculated
y_pred_prob = nb.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))
print(log_loss(y_test, y_pred_prob))

0.8972332015810276
0.39978596913980674


In [10]:
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[21  1]
 [ 5 18]]
0.8666666666666667


In [11]:
### Using K-Folds CV

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
nb = BernoulliNB()
results = cross_val_score(nb, X, y, scoring='roc_auc', cv=kfold)
print(results.mean())

0.8783531746031746


Cancer Dataset

In [12]:
os.chdir(r"C:\CDAC\6_Practical_Machine_Learning\Cases\Cancer")

In [13]:
cancer = pd.read_csv("Cancer.csv")
cancer

Unnamed: 0,subjid,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,1,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,2,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,3,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,4,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,5,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
...,...,...,...,...,...,...,...,...,...,...,...
281,282,50-59,ge40,30-34,6-8,yes,2,left,left_low,no,no-recurrence-events
282,283,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes,no-recurrence-events
283,284,30-39,premeno,30-34,6-8,yes,2,right,right_up,no,no-recurrence-events
284,285,50-59,premeno,15-19,0-2,no,2,right,left_low,no,no-recurrence-events


In [14]:
dum_can = pd.get_dummies(cancer, drop_first=True)
dum_can

Unnamed: 0,subjid,deg-malig,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_lt40,menopause_premeno,tumor-size_10-14,...,inv-nodes_6-8,inv-nodes_9-11,node-caps_yes,breast_right,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_yes,Class_recurrence-events
0,1,3,0,1,0,0,0,0,1,0,...,0,0,1,1,0,1,0,0,0,1
1,2,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,2,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,4,3,0,1,0,0,0,0,1,0,...,0,0,1,1,1,0,0,0,1,0
4,5,2,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,282,2,0,0,1,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
282,283,2,0,0,1,0,0,0,1,0,...,0,0,1,0,1,0,0,0,1,0
283,284,2,1,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,1,0,0
284,285,2,0,0,1,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0


In [15]:
X = dum_can.drop(['Class_recurrence-events','subjid'], axis=1)
y = dum_can['Class_recurrence-events']

In [22]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
nb = BernoulliNB()
results_roc_auc = cross_val_score(nb, X, y, scoring='roc_auc', cv=kfold)
results_log_loss = cross_val_score(nb, X, y, scoring='neg_log_loss', cv=kfold)
print(results_roc_auc.mean())
print(results_log_loss.mean())

0.655286944045911
-0.6431022033206913


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2023, test_size=0.3)

In [19]:
nb.fit(X_train, y_train)
y_pred_prob = nb.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))
print(log_loss(y_test, y_pred_prob))

0.7141025641025641
0.6167130598005681


## Gaussian NB

In [23]:
os.chdir(r"C:\CDAC\6_Practical_Machine_Learning\Cases\Bankruptcy")

Bankruptcy Dataset

In [24]:
brupt = pd.read_csv("Bankruptcy.csv")
brupt

Unnamed: 0,NO,D,YR,R1,R2,R3,R4,R5,R6,R7,...,R15,R16,R17,R18,R19,R20,R21,R22,R23,R24
0,1,0,78,0.23,0.08,0.02,0.03,0.46,0.12,0.19,...,0.05,0.57,0.15,0.23,3.56,0.26,1.55,0.43,0.11,0.17
1,2,0,77,0.19,0.07,0.09,0.12,0.02,0.02,0.03,...,0.09,0.12,0.16,0.22,3.78,1.29,1.40,0.06,0.07,0.10
2,3,0,72,0.07,0.02,0.03,0.05,0.06,0.10,0.14,...,-0.03,0.02,0.02,0.04,13.29,1.61,1.43,0.03,0.05,0.07
3,4,0,80,0.07,0.03,0.04,0.04,0.04,0.06,0.06,...,-0.02,0.01,0.02,0.02,5.36,1.30,1.12,-0.06,-0.08,-0.09
4,5,0,81,0.09,0.02,0.03,0.04,0.06,0.08,0.11,...,0.02,0.07,0.10,0.14,7.74,1.48,1.41,0.03,0.04,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,128,1,77,0.27,0.03,0.07,0.10,0.09,0.18,0.26,...,0.11,0.06,0.12,0.17,7.56,2.07,1.45,0.06,0.13,0.19
128,129,1,77,0.32,0.03,0.03,0.09,0.05,0.06,0.16,...,0.17,0.07,0.09,0.26,5.99,1.27,2.74,0.06,0.08,0.21
129,130,1,78,0.08,0.01,0.02,0.05,0.04,0.07,0.15,...,0.19,0.07,0.12,0.26,7.14,1.89,2.10,0.07,0.12,0.26
130,131,1,78,0.14,0.01,0.05,0.07,0.02,0.09,0.14,...,0.07,0.02,0.10,0.15,170.96,4.55,1.45,0.02,0.10,0.14


In [25]:
X = brupt.drop(['NO', 'D', 'YR'], axis=1)
y = brupt['D']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2023, test_size=0.3)

In [27]:
nb = GaussianNB()
nb.fit(X_train, y_train) # Aprior Probs get calculated
y_pred_prob = nb.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))
print(log_loss(y_test, y_pred_prob))

0.875
1.9537944932352929


In [28]:
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[16  4]
 [ 3 17]]
0.825


Using K-Folds CV

In [31]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
nb = GaussianNB()
results = cross_val_score(nb, X, y, scoring='roc_auc', cv=kfold)
print(results.mean())

0.849788672865596
