In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pandas import read_csv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import random

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = read_csv('/content/drive/MyDrive/HW-2ML/heart_disease_uci.csv')

In [6]:
df.head(5)


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [7]:
# Checking Dtypes of features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [9]:
# Chaning the fourth column name
df.rename(columns={'dataset': 'Location'}, inplace=True)

Null Value Treatment

In [10]:
# Finding Null values in terms of percentage
total_null_values = df.isnull().sum()
na_columns=[]
for column, na_count in total_null_values.items():
    print(f'{column}: {round(na_count*100/920,2)}%')
    if (na_count*100/920)>30:
        na_columns.append(column)
print(f'columns with high Na values are:{na_columns}')

id: 0.0%
age: 0.0%
sex: 0.0%
Location: 0.0%
cp: 0.0%
trestbps: 6.41%
chol: 3.26%
fbs: 9.78%
restecg: 0.22%
thalch: 5.98%
exang: 5.98%
oldpeak: 6.74%
slope: 33.59%
ca: 66.41%
thal: 52.83%
num: 0.0%
columns with high Na values are:['slope', 'ca', 'thal']


Approach to Null Value Treatment

i) For Categorical feature having very less na values we will replace it by backfill

ii) For Numerical feature having very less na values we will replace it by mean of the feature

ii) For features having very high na values,will drop the feature after doing corelation analysis with target feature

In [11]:
# For Categorical feature having very less na values we will replace it by backfill
df['restecg'].fillna(method='bfill', inplace=True)
df['fbs'].fillna(method='bfill', inplace=True)
df['exang'].fillna(method='bfill', inplace=True)

In [12]:
# For Numerical feature having very less na values  we will replace it by mean of the feature
df['thalch'] = df['thalch'].fillna(df['thalch'].mean())
df['oldpeak'] = df['oldpeak'].fillna(df['oldpeak'].mean())
df['trestbps'] = df['trestbps'].fillna(df['trestbps'].mean())
df['chol'] = df['chol'].fillna(df['trestbps'].mean())

In [13]:
df.isnull().sum()

id            0
age           0
sex           0
Location      0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalch        0
exang         0
oldpeak       0
slope       309
ca          611
thal        486
num           0
dtype: int64

features such as slope,ca and thal contains high percentage of NA values,finding the importance or correlation of these parametres with target feature

In [14]:
## Analyzing these columns ['slope', 'ca', 'thal'] beacuse they have vigh high amount of Null Values
## due to lack of domain knowledge about this dataset,finding the importance of these features through correlation

# finding correlation between tgt feature and categorical feature mentioned above

from scipy.stats import chi2_contingency
for i in na_columns:
    contingency_table = pd.crosstab(df[i], df['num'])
    chi2 =  chi2_contingency(contingency_table)[0]
    cramers_v = np.sqrt(chi2 / (df.shape[0] * (min(2, 2) - 1)))

# Print Cramer's V
    print(f'correlation between {i} and num if {round(cramers_v*100,2)}%')

correlation between slope and num if 33.64%
correlation between ca and num if 34.93%
correlation between thal and num if 35.21%


After finding the correlation of these features with target feature the correlation values appeared to be in range of 30 to 40 percent which is positive mild correlation hence decided to drop the features

In [15]:
df = df.drop(['slope', 'ca', 'thal'], axis=1)

In [16]:
df

Unnamed: 0,id,age,sex,Location,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,num
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,0
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,2
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,1
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,0
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,1
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,0
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,2
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,True,0.878788,0


Label encoding for categorical data.

In [17]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# df['species']= label_encoder.fit_transform(df['species'])
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])
df['fbs'] = label_encoder.fit_transform(df['fbs'])
df['exang'] = label_encoder.fit_transform(df['exang'])


In [18]:
# perform one-hot encoding for these columns ={sex, location,cp,fbs, restecg, exang}

df = pd.get_dummies(df, columns=['sex', 'Location', 'cp', 'fbs', 'restecg', 'exang'])


In [19]:
df.dtypes

id              int64
age             int64
trestbps      float64
chol          float64
thalch        float64
oldpeak       float64
num             int64
sex_0           uint8
sex_1           uint8
Location_0      uint8
Location_1      uint8
Location_2      uint8
Location_3      uint8
cp_0            uint8
cp_1            uint8
cp_2            uint8
cp_3            uint8
fbs_0           uint8
fbs_1           uint8
restecg_0       uint8
restecg_1       uint8
restecg_2       uint8
exang_0         uint8
exang_1         uint8
dtype: object

In [20]:
#  drop id column

df.drop('id', axis=1, inplace=True)


In [21]:
#  normalizing column {age, trestbps, chol, thalch, oldpeak}

# Normalizing the numerical features
scaler = StandardScaler()
df[['age', 'trestbps', 'chol', 'thalch', 'oldpeak']] = scaler.fit_transform(df[['age', 'trestbps', 'chol', 'thalch', 'oldpeak']])
df


Unnamed: 0,age,trestbps,chol,thalch,oldpeak,num,sex_0,sex_1,Location_0,Location_1,...,cp_1,cp_2,cp_3,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exang_0,exang_1
0,1.007386,0.698041,0.329124,0.495698,1.349421,0,0,1,1,0,...,0,0,1,0,1,1,0,0,1,0
1,1.432034,1.511761,0.812936,-1.175955,0.589832,2,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
2,1.432034,-0.658158,0.292609,-0.340128,1.634267,1,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
3,-1.752828,-0.115679,0.484309,1.968345,2.488805,0,0,1,1,0,...,0,1,0,1,0,0,1,0,1,0
4,-1.328180,-0.115679,0.064396,1.371326,0.494884,0,1,0,1,0,...,1,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.051927,-0.278423,1.241977,0.654903,-0.834397,1,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
916,0.901224,0.000000,-0.528959,0.000000,0.000000,0,0,1,0,0,...,0,0,1,1,0,0,0,1,1,0
917,0.158089,-0.549662,0.237838,-1.494365,-0.834397,2,0,1,0,0,...,0,0,0,0,1,0,0,1,1,0
918,0.476575,0.000000,1.716661,0.000000,0.000000,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1


In [22]:
#initial split of 8:2 for Naive Bayes
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
print(train.shape)
print(test.shape)

(736, 23)
(184, 23)


In [24]:
X_train = train.drop(['num'], axis = 1)
Y_train = train['num']

X_test = test.drop(['num'], axis = 1)
Y_test = test['num']

In [25]:
print(X_train.shape)
print(Y_test.shape)

(736, 22)
(184,)


#Naive Bayes Classifier

In [26]:
#considering the prior is same and calculating only on the basis of mean and variance

means = train.groupby(["num"]).mean()
var = train.groupby(["num"]).var()
classes = np.unique(train["num"].tolist())

In [27]:
def Normal(data, mu, var):
    sd = np.sqrt(var)
    pdf = (np.e ** (-0.5 * ((data - mu)/sd) ** 2)) / (sd * np.sqrt(2 * np.pi))

    return pdf

In [28]:
def Predict(X):
    Predictions = []

    for i in X.index:

        ClassLikelihood = []
        instance = X.loc[i]

        for cls in classes:

            FeatureLikelihoods = []
            for col in X.columns:
                data = instance[col]
                mean = means[col].loc[cls]
                variance = var[col].loc[cls]

                Likelihood = Normal(data, mean, variance)

                if Likelihood != 0:
                    Likelihood = np.log(Likelihood) #log-likelihood evaluated at x
                else:
                    Likelihood = 1/len(train)

                FeatureLikelihoods.append(Likelihood)

            TotalLikelihood = sum(FeatureLikelihoods) #Calculating posterior
            ClassLikelihood.append(TotalLikelihood)

        MaxIndex = ClassLikelihood.index(max(ClassLikelihood)) # Finding largest posterior
        Prediction = classes[MaxIndex]
        Predictions.append(Prediction)

    return Predictions

In [29]:
PredictTrain = Predict(X_train)
PredictTest = Predict(X_test)

  pdf = (np.e ** (-0.5 * ((data - mu)/sd) ** 2)) / (sd * np.sqrt(2 * np.pi))
  pdf = (np.e ** (-0.5 * ((data - mu)/sd) ** 2)) / (sd * np.sqrt(2 * np.pi))


In [30]:
print("Training set accuracy", round(accuracy_score(Y_train, PredictTrain), 4)*100,"%")
print("Testing set accuracy", round(accuracy_score(Y_test, PredictTest), 4)*100,"%")

#we can see that both are close to equal

Training set accuracy 58.15 %
Testing set accuracy 52.72 %


In [35]:
def confusion_matrix(y_true, y_pred):
    """
    Compute confusion matrix.

    Args:
        y_true (ndarray): ground truth labels.
        y_pred (ndarray): predicted labels.

    Returns:
        ndarray: confusion matrix.
    """

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    num_classes = np.unique(y_true).size

    confusion_matrix = np.zeros((num_classes, num_classes))
    for i in range(num_classes):
        for j in range(num_classes):
            confusion_matrix[i, j] = np.sum((y_true == i) & (y_pred == j))

    return confusion_matrix

In [36]:
print("Confusion Matrix for Training Set:")
print(confusion_matrix(Y_train, PredictTrain))

print("Confusion Matrix for Testing Set:")
print(confusion_matrix(Y_test, PredictTest))


Confusion Matrix for Training Set:
[[268.  68.   0.   0.   0.]
 [ 51. 160.   0.   0.   0.]
 [ 12.  72.   0.   0.   0.]
 [  8.  73.   0.   0.   0.]
 [  3.  21.   0.   0.   0.]]
Confusion Matrix for Testing Set:
[[62. 13.  0.  0.  0.]
 [19. 35.  0.  0.  0.]
 [ 1. 24.  0.  0.  0.]
 [ 6. 20.  0.  0.  0.]
 [ 0.  4.  0.  0.  0.]]


In [37]:
def f1_score(y, y_hat):
    tp = 0
    tn = 0
    fp =0
    fn = 0

    for i, j in zip(y, y_hat):
        if i ==1 and j==1:
            tp = tp + 1
        elif i==0 and j==0:
            tn = tn + 1
        elif i==1 and j==0:
            fp = fp+1
        elif i==0 and j==1:
            fn = fn+1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1score = (tp + tn)/ (tp + tn + fp + fn)
    return round(precision, 4), round(recall, 4), round(f1score, 4)

In [38]:
print("Precision, Recall and F1 score of the Naive Bayes Algorithm: ",f1_score(Y_test, PredictTest))


Precision, Recall and F1 score of the Naive Bayes Algorithm:  (0.6481, 0.7292, 0.7519)


###KNN


In [43]:
from collections import Counter

In [44]:
def L2_distance(a, b):
    dim = len(a)
    distance = 0

    for d in range(dim):
        distance += np.square(abs(a[d] - b[d]))

    distance = np.sqrt(distance)

    return distance

In [45]:
#normalizing using F(X) = (X −mean)/std

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [46]:
def knn_predict(X_train, Y_train, X_test, Y_test, k):
    y_hat_test = []

    for test_point in X_test:
        distances = []

        for train_point in X_train:
            distance = L2_distance(test_point, train_point)
            distances.append(distance)

        df_dists = pd.DataFrame(data=distances, columns=['dist'],
                                index=Y_train.index)

        df_nn = df_dists.sort_values(by=['dist'], axis=0)[:k]

        counter = Counter(Y_train[df_nn.index])

        prediction = counter.most_common()[0][0]

        y_hat_test.append(prediction)

    return y_hat_test

In [47]:
from sklearn.model_selection import StratifiedKFold

#implementing a 4-fold to test different values of k

strtfdKFold = StratifiedKFold(n_splits=4)
kfold = strtfdKFold.split(X_train, Y_train)

In [48]:
k_values = [1, 7, 18, int(np.sqrt(len(X_train)))]

#we know from theory that sqrt of the length of samples will provide the best value of k, so testing this theory
scores = []
for k, (t, v) in enumerate(kfold):
    y_hat_test = knn_predict(X_train[t, :], Y_train.iloc[t], X_train[v, :], Y_train.iloc[v],  k_values[k])
    score = accuracy_score(Y_train.iloc[v], y_hat_test)
    scores.append(score)
    print('Fold: %2d, value of k: %2d, Training/validation Split Distribution: %s, Validation Accuracy: %.3f' % (k+1, k_values[k], np.bincount(Y_train.iloc[t]), score*100))


print("Average validation accuracy: ", np.mean(scores))

Fold:  1, value of k:  1, Training/validation Split Distribution: [252 159  63  60  18], Validation Accuracy: 54.348
Fold:  2, value of k:  7, Training/validation Split Distribution: [252 158  63  61  18], Validation Accuracy: 52.174
Fold:  3, value of k: 18, Training/validation Split Distribution: [252 158  63  61  18], Validation Accuracy: 54.348
Fold:  4, value of k: 27, Training/validation Split Distribution: [252 158  63  61  18], Validation Accuracy: 59.783
Average validation accuracy:  0.5516304347826086


In [49]:
#Based on the k value that resulted the best accuracy in the validation set we will now be testing on the test set as well

y_hat_test = knn_predict(X_train, Y_train, X_test, Y_test, k_values[scores.index(max(scores))])

In [50]:
print("Testing accuracy based on the best value of k (", k_values[scores.index(max(scores))],"): ", round(accuracy_score(Y_test,y_hat_test)*100,4),"%")
print("Precision, Recall and F1 score of the Naive Bayes Algorithm: ",f1_score(Y_test, PredictTest))

Testing accuracy based on the best value of k ( 27 ):  57.0652 %
Precision, Recall and F1 score of the Naive Bayes Algorithm:  (0.6481, 0.7292, 0.7519)


In [52]:
# confusion matrix based on KNN resut
from sklearn.metrics import confusion_matrix
print("Confusion Matrix for KNN:")
print(confusion_matrix(Y_test, y_hat_test))


Confusion Matrix for KNN:
[[69  4  1  1  0]
 [18 31  3  2  0]
 [ 3 15  3  4  0]
 [ 7 15  2  2  0]
 [ 0  1  1  2  0]]


In [53]:
print("Precision, Recall and F1 score of the KNN Algorithm: ",f1_score(Y_test, y_hat_test))

Precision, Recall and F1 score of the KNN Algorithm:  (0.6327, 0.8857, 0.8197)


##Which classifier performs better?
## Why?

Given the precision, recall, and F1-score values for both models on the UCI Heart Disease dataset:

Naive Bayes:

Precision: 0.6481
Recall: 0.7292
F1-score: 0.7519
K-Nearest Neighbors (KNN):

Precision: 0.6327
Recall: 0.8857
F1-score: 0.8197

In this case, we can see that the performance metrics differ between the two models. Here's a comparison:

- Precision: Naive Bayes has a slightly higher precision (0.6481) compared to
KNN (0.6327), indicating that Naive Bayes is better at making positive predictions with higher precision.

- Recall: KNN has a significantly higher recall (0.8857) compared to Naive Bayes (0.7292), indicating that KNN is better at correctly identifying all actual positive instances.

- F1-score: The F1-score of KNN (0.8197) is also quite high, but it's slightly lower than the F1-score of Naive Bayes (0.7519).

The choice between Naive Bayes and KNN should consider the specific goals and requirements of your application:

- If precision is more critical (minimizing false positives) and we can accept a moderate recall rate, Naive Bayes might be preferred due to its higher precision. But in our case this is not crucial

- If recall is crucial (minimizing false negatives) and we can accept a slightly lower precision, KNN might be a better choice because it has a significantly higher recall and it fits with our case.

- Considering a balance between precision and recall, we can consider the F1-score. In this case, KNN has a slightly higher F1-score, indicating a better balance between precision and recall.