In [None]:
# import the required libraries

from scipy.stats import norm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the dataset

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
df = pd.read_csv('processed.cleveland.data', header=None)
df

--2022-05-16 15:17:00--  https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18461 (18K) [application/x-httpd-php]
Saving to: ‘processed.cleveland.data.1’


2022-05-16 15:17:00 (352 KB/s) - ‘processed.cleveland.data.1’ saved [18461/18461]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [None]:
# Give the 14 positional columns descriptive names

names = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

df.columns = names
df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [None]:
# Show the dtype pandas inferred for every column; an 'object' dtype
# means the column was not parsed as purely numeric

df.dtypes

age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
target       int64
dtype: object

In [None]:
# List the distinct values stored in 'ca' to see what keeps it from being numeric

df['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [None]:
# List the distinct values stored in 'thal' to see what keeps it from being numeric

df['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [None]:
# Turn every '?' placeholder into a proper missing value (NaN)

df.replace('?', np.nan, inplace=True)

In [None]:
# Report how many NaN entries each column contains

for col, missing in df.isnull().sum().items():
    print(f'{missing} missing values in {col}')

0 missing values in age
0 missing values in sex
0 missing values in cp
0 missing values in restbp
0 missing values in chol
0 missing values in fbs
0 missing values in restecg
0 missing values in thalach
0 missing values in exang
0 missing values in oldpeak
0 missing values in slope
4 missing values in ca
2 missing values in thal
0 missing values in target


In [None]:
# Discard every row that has a missing value, then renumber the index from 0

df = df.dropna(axis=0).reset_index(drop=True)
df

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
293,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
294,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
295,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


In [None]:
# Cast the two object-typed columns, 'ca' and 'thal', to float

df = df.astype({'ca': float, 'thal': float})

In [None]:
# Re-check the dtypes to confirm that 'ca' and 'thal' are now float64

df.dtypes

age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca         float64
thal       float64
target       int64
dtype: object

# 2) Custom Gaussian Naive Bayes implementation

In [None]:
# implement Naive Bayes

class NaiveBayesClassifier():
    """Gaussian Naive Bayes classifier with user-supplied class priors.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and standard deviation are estimated in
    ``fit``. Class labels are expected to be the integers
    ``0 .. n_classes - 1``.
    """

    # Per-class, per-feature Gaussian parameters; arrays of shape
    # (n_classes, n_features) once fit() has run.
    mu = None
    sigma = None
    # Number of classes, set by fit() to max(y) + 1.
    n_classes = None

    # Floor used for features that are constant within a class: a standard
    # deviation of exactly 0 is not a valid scale for the normal density.
    _MIN_SIGMA = 1e-9

    def __init__(self, priors):
        """Store the prior probability of each class.

        Parameters
        ----------
        priors : sequence of float
            ``priors[i]`` is the prior probability of class ``i``.
        """
        self.priors = priors

    def pred(self, x):
        """Return the posterior probability of every class for one sample.

        Parameters
        ----------
        x : sequence of float
            Feature values of a single sample, in the column order used
            when calling ``fit``.

        Returns
        -------
        list of float
            One posterior probability per class, summing to 1. Classes
            that had no training samples get probability 0. If no class
            has a non-zero probability, every entry is NaN.

        Raises
        ------
        IndexError
            If fewer priors were supplied than there are classes.
        """
        mu = np.asarray(self.mu, dtype=float)
        sigma = np.asarray(self.sigma, dtype=float)

        priors = np.asarray(self.priors, dtype=float)
        if len(priors) < self.n_classes:
            raise IndexError(
                f'{len(priors)} priors supplied for {self.n_classes} classes')
        priors = priors[:self.n_classes]

        # Only the fitted features are used; extra trailing values are ignored.
        x = np.asarray(x, dtype=float)[:mu.shape[1]]

        # Work in log space: multiplying raw densities underflows to 0 for
        # samples far from every class mean (or with many features), which
        # would make the normalisation below a 0/0 division.
        with np.errstate(divide='ignore'):  # log(0) -> -inf for zero priors
            log_joint = np.log(priors) + norm.logpdf(
                x, loc=mu, scale=sigma).sum(axis=1)

        # Classes without training samples carry NaN parameters; give them
        # zero probability instead of letting NaN spread to every class.
        log_joint[np.isnan(log_joint)] = -np.inf

        peak = log_joint.max()
        if not np.isfinite(peak):
            # No class has non-zero probability: the posterior is undefined.
            return [float('nan')] * self.n_classes

        # Subtracting the maximum before exponentiating keeps at least one
        # weight equal to 1, so the sum can never be 0.
        weights = np.exp(log_joint - peak)
        return (weights / weights.sum()).tolist()

    def fit(self, X, y):
        """Estimate the per-class mean and standard deviation of each feature.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Integer class labels; the number of classes is ``max(y) + 1``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.n_classes = int(np.max(y)) + 1
        n_features = X.shape[1]

        # NaN marks "no estimate" for labels that never occur in y.
        mu = np.full((self.n_classes, n_features), np.nan)
        sigma = np.full((self.n_classes, n_features), np.nan)

        for i in range(self.n_classes):
            Xc = X[y == i]
            if Xc.shape[0] == 0:
                continue
            mu[i] = Xc.mean(axis=0)
            sigma[i] = Xc.std(axis=0)

        # A feature that is constant within a class has sigma == 0.
        sigma[sigma == 0] = self._MIN_SIGMA

        self.mu = mu
        self.sigma = sigma

# 3) Train and evaluate the custom classifier

In [None]:
# Build the feature matrix X (three predictors) and the label vector y

X = df.loc[:, ['chol', 'restbp', 'thalach']]
y = df.loc[:, 'target']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable

split = train_test_split(X, y, test_size=0.2, random_state=2022)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit the hand-written classifier with uniform priors and score it on the test set

n_labels = y.nunique()
priors_ML = [1/n_labels] * n_labels
print(f'Max Likelihood Priors: {priors_ML}\n')

nb_model_custom = NaiveBayesClassifier(priors=priors_ML)
nb_model_custom.fit(np.array(X_train), y_train)

# Convert the test frame once, then take the most probable class for each row
X_test_arr = np.array(X_test)
y_pred = [np.argmax(nb_model_custom.pred(row)) for row in X_test_arr]

print(f'Testing Accuracy: {round(100*accuracy_score(y_test, y_pred), 2)}%\n')
print(f'{classification_report(y_test, y_pred)}')

Max Likelihood Priors: [0.2, 0.2, 0.2, 0.2, 0.2]

Testing Accuracy: 43.33%

              precision    recall  f1-score   support

           0       0.78      0.56      0.65        32
           1       0.40      0.29      0.33        14
           2       0.17      0.40      0.24         5
           3       0.11      0.17      0.13         6
           4       0.17      0.33      0.22         3

    accuracy                           0.43        60
   macro avg       0.33      0.35      0.32        60
weighted avg       0.54      0.43      0.47        60



# 4) Comparison with scikit-learn's GaussianNB

In [None]:
# Fit scikit-learn's GaussianNB on the same split and score it the same way

nb_model_sklearn = GaussianNB().fit(X_train, y_train)
y_pred = nb_model_sklearn.predict(X_test)

print(f'Testing Accuracy: {round(100*accuracy_score(y_test, y_pred), 2)}%\n')
print(f'{classification_report(y_test, y_pred)}')

Testing Accuracy: 53.33%

              precision    recall  f1-score   support

           0       0.62      0.94      0.75        32
           1       0.00      0.00      0.00        14
           2       0.33      0.20      0.25         5
           3       0.11      0.17      0.13         6
           4       0.00      0.00      0.00         3

    accuracy                           0.53        60
   macro avg       0.21      0.26      0.23        60
weighted avg       0.37      0.53      0.43        60

