In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


In [2]:
# #data loading
# data = pd.read_csv("pima-indians-diabetes.csv", names = ["preg","plas","pres","skin","test","mass","pedi","age","class_"])
# y = data["class_"]
# X = data.drop("class_",axis = 1)
# data.head()
# # df = pd.read_csv("pima-indians-diabetes.csv", names = ["pregnancies", "glucose", "blood_pressure", "skin_thickness","insulin","bmi","Diabetes_Pedigree_Function","age","class"])
# # df.head()

In [3]:
df = pd.read_csv("pima-indians-diabetes.csv")
df.columns = ["pregnancies", "glucose", "blood_pressure", "skin_thickness","insulin","bmi","Diabetes_Pedigree_Function","age","class"]
X = df.loc[:,'pregnancies':'age']
y = df['class']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pregnancies                 767 non-null    int64  
 1   glucose                     767 non-null    int64  
 2   blood_pressure              767 non-null    int64  
 3   skin_thickness              767 non-null    int64  
 4   insulin                     767 non-null    int64  
 5   bmi                         767 non-null    float64
 6   Diabetes_Pedigree_Function  767 non-null    float64
 7   age                         767 non-null    int64  
 8   class                       767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
def estimate_target_prob(df, target, label):
    return len(df[df[target]==label])/len(df)


In [10]:
def category_prob(df, column, category, target, label, l=1):
    numerator = len(df[(df[column]==category)&(df[target]==label)]) + l
    denominator = len(df[df[target]==label])+l*len(df[column].unique())
    return numerator/denominator

In [11]:
def estimate_mean_std(df, column, target, label):
    temp = df[df[target]==label]
    mu = temp[column].mean()
    std = temp[column].std()
    return mu, std

In [12]:
class NaiveBaysClassifier:
    def __init__(self,unique_label,target,categorical_column=[]):
        self.__reference_dict = None
        self.__prior_probs = None
        self.__categorical_column = categorical_column
        self.__unique_label = unique_label
        self.__target = target
        
    def train(self, train_df):
        reference_dict = dict()
        for col in train_df.columns[:-1]: ## target column은 마지막열에 있다고 가정.
            temp_res = []
            for label in self.__unique_label:
                if col in self.__categorical_column:
                    unique_value = train_df[col].unique()
                    temp_res1 = []
                    for u in unique_value:
                        temp_res1.append(category_prob(train_df, col, u, self.__target, label, l=1))
                    temp_res.append(temp_res1)
                else:
                    temp_res.append(estimate_mean_std(train_df, col, self.__target, label))
 
            reference_dict[col] = temp_res
        self.__reference_dict = reference_dict
        
        prior_probs = []
        for label in self.__unique_label:
            prior_prob = estimate_target_prob(train_df, self.__target, label)
            prior_probs.append(prior_prob)
        self.__prior_probs = prior_probs
        
        
    def get_reference_dict(self):
        return self.__reference_dict
 
    def predict(self, new_data):
        object_value = [0]*len(self.__unique_label)
        for idx in new_data.index[:-1]:
            value = new_data[idx] 
            reference_value = self.__reference_dict[idx]
 
            if idx in self.__categorical_column:
                for i, r in enumerate(reference_value):
                    ## if category, reference value is the value of probability
                    value = int(value) ## 표준화를 하는 과정에서 다른 열의 정수 값이 float로 바뀌기 다시 정수로 바꿔줌
                    object_value[i] += np.log(r[value])
            else:
                for i, r in enumerate(reference_value):
                    ## if continuous, reference value contains (mean, std)
                    object_value[i] += np.log(norm.pdf(value, r[0], r[1]))
 
        
        for i in self.__unique_label:
            object_value[i] += np.log(self.__prior_probs[i])
        max_object_value = max(object_value)
        max_idx = object_value.index(max_object_value)
        return max_idx

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 0)

In [31]:
y_test

661    1
122    0
113    1
14     1
529    0
      ..
476    0
482    0
230    1
527    0
380    0
Name: class, Length: 154, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[89 12]
 [18 35]]
0.8051948051948052


In [14]:
def split_test_train_data(df, split_ratio):
    num_test_df = int(len(df)*split_ratio)
    idx_test_df = random.sample(df.index.tolist(),num_test_df)
    
    test_df = df.loc[idx_test_df].reset_index(drop=True)
    train_df = df.drop(idx_test_df).reset_index(drop=True)
    return train_df, test_df

In [17]:
random.seed(110)
split_ratio = 0.2
 
train_df, test_df = split_test_train_data(df,split_ratio) ## split train, test dataset
 

for col in train_df.columns[:-1]:
    if col not in categorical_column:
        mean = train_df[col].mean()
        std = train_df[col].std()
        train_df[col] = train_df[col].map(lambda x: (x-mean)/std)
        test_df[col] = test_df[col].map(lambda x: (x-mean)/std)
 
target = 'class'
unique_label = df[target].unique()
 
NBC = NaiveBaysClassifier(unique_label,target,categorical_column)

In [18]:
NBC.train(train_df) ## 나이브 베이즈 분류기 학습

In [22]:
import pandas as pd
import numpy as np
import random
import re
import warnings
warnings.filterwarnings('ignore')
 
from collections import defaultdict
from tqdm import tqdm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


In [23]:
## train accuracy
total_sum = 0
for i, new_data in tqdm(train_df.iterrows(), total=len(train_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum += 1        
        
accuracy = total_sum/len(train_df)

 69%|██████████████████████████▉            | 424/614 [00:00<00:00, 9796.16it/s]


IndexError: list index out of range

In [24]:
## test accuracy
total_sum = 0
for i, new_data in tqdm(test_df.iterrows(), total=len(test_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum += 1        
        
accuracy = total_sum/len(test_df)

 14%|█████▍                                  | 21/153 [00:00<00:00, 5210.93it/s]


IndexError: list index out of range

In [28]:
split_range = [4,5,4,3,2,5,4,5]
count = np.zeros((4,2))
count

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])