In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import naive_bayes
from scipy.stats import shapiro

#### Ali Shiraee - 400422112

In [2]:
data = pd.read_csv('heart.csv')

In [3]:
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
print(f'We have {data.isna().sum().sum()} null values in total.')

We have 0 null values in total.


In [6]:
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [7]:
X = data[['chol', 'trestbps', 'thalach']]
y = data[['target']]

In [8]:
stat, p = shapiro(X)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
	print('Sample looks Gaussian (fail to reject H0)')
else:
	print('Sample does not look Gaussian (reject H0)')

Statistics=0.885, p=0.000
Sample does not look Gaussian (reject H0)


In [9]:
X

Unnamed: 0,chol,trestbps,thalach
0,233,145,150
1,250,130,187
2,204,130,172
3,236,120,178
4,354,120,163
...,...,...,...
298,241,140,123
299,264,110,132
300,193,144,141
301,131,130,115


In [10]:
X = X.values
y = y.values
y = y.reshape(-1,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
class GNB:

    def separate_classes(self, X, y):
        separated_classes = {}
        for i in range(len(X)):
            feature_values = X[i]
            class_name = y[i]
            if class_name not in separated_classes:
                separated_classes[class_name] = []
            separated_classes[class_name].append(feature_values)
        return separated_classes
    
    def summarize(self, X):
        for feature in zip(*X):
            yield {
                'stdev' : np.std(feature),
                'mean' : np.mean(feature)
            }

    def gauss_distribution_function(self, x, mean, stdev):
        exponent = np.exp(-((x-mean)**2 / (2*stdev**2)))
        return exponent / (np.sqrt(2*np.pi)*stdev)

    def fit(self, X, y):
        print('training...')
        separated_classes = self.separate_classes(X, y)
        self.class_summary = {}
        for class_name, feature_values in separated_classes.items():
            self.class_summary[class_name] = {
                'prior_proba': len(feature_values)/len(X),
                'summary': [i for i in self.summarize(feature_values)],
            }     
        return self.class_summary
    
    def predict(self, X):
        print('predicting...')
        MAPs = []
        for row in X:
            joint_proba = {}
            for class_name, features in self.class_summary.items():
                total_features = len(features['summary'])
                likelihood = 1
                for idx in range(total_features):
                    feature = row[idx]
                    mean = features['summary'][idx]['mean']
                    stdev = features['summary'][idx]['stdev']
                    normal_proba = self.gauss_distribution_function(feature, mean, stdev)
                    likelihood *= normal_proba
                prior_proba = features['prior_proba']
                joint_proba[class_name] = prior_proba * likelihood
            MAP = max(joint_proba, key=joint_proba.get)
            MAPs.append(MAP)
        return MAPs

In [13]:
my_gnb = GNB()
my_gnb.fit(X_train, y_train)
my_y_pred = my_gnb.predict(X_test)
print(classification_report(y_test, my_y_pred, digits=4))

training...
predicting...
              precision    recall  f1-score   support

           0     0.6154    0.7619    0.6809        21
           1     0.8571    0.7500    0.8000        40

    accuracy                         0.7541        61
   macro avg     0.7363    0.7560    0.7404        61
weighted avg     0.7739    0.7541    0.7590        61



In [14]:
sklearn_gnb = naive_bayes.GaussianNB()
sklearn_gnb.fit(X_train, y_train)
y_pred = sklearn_gnb.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6154    0.7619    0.6809        21
           1     0.8571    0.7500    0.8000        40

    accuracy                         0.7541        61
   macro avg     0.7363    0.7560    0.7404        61
weighted avg     0.7739    0.7541    0.7590        61

