In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# load .csv file and set X and y(class). and check the information

df = pd.read_csv("pima-indians-diabetes.csv")
df.columns = ["pregnancies", "glucose", "blood_pressure", "skin_thickness","insulin","bmi","Diabetes_Pedigree_Function","age","class"]
X = df.loc[:,'pregnancies':'age']
y = df['class']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pregnancies                 767 non-null    int64  
 1   glucose                     767 non-null    int64  
 2   blood_pressure              767 non-null    int64  
 3   skin_thickness              767 non-null    int64  
 4   insulin                     767 non-null    int64  
 5   bmi                         767 non-null    float64
 6   Diabetes_Pedigree_Function  767 non-null    float64
 7   age                         767 non-null    int64  
 8   class                       767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
# 0 mean a missing value, so it is changed to a Nan value and then changed to an average value. 
# (The number of pregnancies can actually be zero, so it was excluded from the process of changing to the Nan value.)

temp = df.loc[:,'glucose':'age']
temp = temp.replace(0, np.NaN)
temp = temp.fillna(df.mean())
X = pd.concat([df.loc[:,'pregnancies':'pregnancies'], temp],axis=1)
df = pd.concat([X, y],axis=1)
df.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,Diabetes_Pedigree_Function,age,class
0,1,85.0,66.0,29.0,79.90352,26.6,0.351,31,0
1,8,183.0,64.0,20.517601,79.90352,23.3,0.672,32,1
2,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
3,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
4,5,116.0,74.0,20.517601,79.90352,25.6,0.201,30,0


In [4]:
# preprocessing dataset to small value
def pre(X, split_range):
    new_df = []
    n = len(split_range)
    for i in range(n): # 0~7
        cnt = np.arange(0,split_range[i])
        temp = pd.cut(X.iloc[:,i], split_range[i], labels=cnt)
        new_df.append(temp)
    
    new_df = np.array(new_df).T
    new_df = pd.DataFrame(new_df)
    new_df.columns = ["pregnancies", "glucose", "blood_pressure", "skin_thickness","insulin","bmi","Diabetes_Pedigree_Function","age"]
    return new_df


In [5]:
split_range = [4,5,4,3,2,5,4,5]
X_pred = pre(X, split_range)

In [6]:
df = pd.concat([X_pred, y],axis=1)
df.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,Diabetes_Pedigree_Function,age,class
0,0,1,1,0,0,0,0,0,0
1,1,4,1,0,0,0,1,0,1
2,0,1,1,0,0,1,0,0,0
3,0,2,0,0,0,2,3,0,1
4,1,2,2,0,0,0,0,0,0


In [7]:
# divide train set, test set 80% and 20%
X_train, X_test, y_train, y_test = train_test_split(X_pred, y, test_size=0.2, shuffle=True, stratify=y, random_state=0)

In [8]:
# counter number of data val
def cnt_num(x_data, y_data, spr):
    row = spr
    cnt = np.zeros((row,2))
    n = len(y_data)
    for i in range(n):
        
        if int(y_data.iloc[i]) == 1:
            cnt[x_data.iloc[i]][1] = cnt[x_data.iloc[i]][1] + 1
        else:
            cnt[x_data.iloc[i]][0] = cnt[x_data.iloc[i]][0] + 1
    return cnt

In [9]:
# calculate probability one and zero
def get_prob(x_data):
    
    zero_pv = 1
    one_pv = 1
    n = len(x_data)
    for i in range(n):
        one_pv *= ( temp_res[i][x_data[i]][1] / sum(y_train == 1) )
        zero_pv *= ( temp_res[i][x_data[i]][0] / sum(y_train == 0) )
    
        
    return zero_pv, one_pv

In [10]:
# naive bays classifier for train and predict the dataset
class NaiveBaysClassifier:
    def train(train_data, y_data):
        temp_res = []
        for i, columns in enumerate(train_data):
            temp_res.append(cnt_num(train_data[columns], y_data, split_range[i])) 
    
        return temp_res
    
    def predict(X_test, y_test):
        n = len(y_test)
        object_val = [0]*n
        for i in range(n):
            if get_prob(X_test.iloc[i])[0] < get_prob(X_test.iloc[i])[1]:
                object_val[i] = 1
            else:
                object_val[i] = 0
                
        return object_val

In [11]:
# using NaiveBaysClassifier, train datatset and get confusion matrix and score.
temp_res = NaiveBaysClassifier.train(X_train, y_train)
y_pred = NaiveBaysClassifier.predict(X_train, y_train)
cf_mat = confusion_matrix(y_train, y_pred)
print(cf_mat)

acs = (cf_mat[0][0]+cf_mat[1][1])/(cf_mat[0][0]+cf_mat[0][1]+cf_mat[1][0]+cf_mat[1][1])
ps = cf_mat[1][1]/(cf_mat[0][1]+cf_mat[1][1])
rs = cf_mat[1][1]/(cf_mat[1][0]+cf_mat[1][1])
f1s = 2*ps*rs / (ps+rs)
print('accuracy score : ', acs)
print('preision score : ', ps)
print('recall score : ', rs)
print('f1 score : ', f1s)

[[301  99]
 [ 56 157]]
accuracy score :  0.7471451876019576
preision score :  0.61328125
recall score :  0.7370892018779343
f1 score :  0.6695095948827292


In [12]:
# using NaiveBaysClassifier, test datatset and get confusion matrix and score.
y_pred = NaiveBaysClassifier.predict(X_test, y_test)
cf_mat = confusion_matrix(y_test, y_pred)
print(cf_mat)

acs = (cf_mat[0][0]+cf_mat[1][1])/(cf_mat[0][0]+cf_mat[0][1]+cf_mat[1][0]+cf_mat[1][1])
ps = cf_mat[1][1]/(cf_mat[0][1]+cf_mat[1][1])
rs = cf_mat[1][1]/(cf_mat[1][0]+cf_mat[1][1])
f1s = 2*ps*rs / (ps+rs)

print('accuracy score : ', acs)
print('preision score : ', ps)
print('recall score : ', rs)
print('f1 score : ', f1s)

[[80 20]
 [10 44]]
accuracy score :  0.8051948051948052
preision score :  0.6875
recall score :  0.8148148148148148
f1 score :  0.7457627118644067


In [13]:
# using GaussianNB, test datatset. and get confusion matrix and score.
X_train, X_test, y_train, y_test = train_test_split(X_pred, y, test_size=0.2, shuffle=True, stratify=y, random_state=0)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
cf_mat = confusion_matrix(y_test, y_pred)
print(cf_mat)

acs = (cf_mat[0][0]+cf_mat[1][1])/(cf_mat[0][0]+cf_mat[0][1]+cf_mat[1][0]+cf_mat[1][1])
ps = cf_mat[1][1]/(cf_mat[0][1]+cf_mat[1][1])
rs = cf_mat[1][1]/(cf_mat[1][0]+cf_mat[1][1])
f1s = 2*ps*rs / (ps+rs)

print('accuracy score : ', acs)
print('preision score : ', ps)
print('recall score : ', rs)
print('f1 score : ', f1s)

[[92  8]
 [30 24]]
accuracy score :  0.7532467532467533
preision score :  0.75
recall score :  0.4444444444444444
f1 score :  0.5581395348837209


The data were called up and the missing values were filled in on average. It also divided the range and reduced the values that were too large to be small. I didn't use the Laplace smoothing technique. I created a cnt num function that counts numbers, and a getprob function that calculates probabilities.  A NaiveBaysClassifier was created to predict the value by selecting a higher probability. In fact, we compared it to the most widely used Gauss NB. For each result, accuracy, preference, recall, and f1 were obtained using the confusion matrix. Accuracy and preference were similar, but recall and f1 were slightly different. NaiveBase is used well for spam filtering, abnormality detection, and document classification and so on.