In [358]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from math import sqrt, exp, pi
from sklearn.preprocessing import RobustScaler

In [None]:
# Approach used (naive Bayes over mixed feature types):
# P(posterior) is proportional to P(prior) * P(likelihood of numerical features) * P(likelihood of categorical features)

In [359]:
#calculating likelihood for categorical columns
# separating the rows of the categorical dataset by class
def separate_by_class_cat(dataset):
    """Group the rows of a categorical DataFrame by their class label.

    The class label is read from the LAST column of every row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to group; the final column holds the class label.

    Returns
    -------
    dict
        Maps each class label to the list of row Series carrying that label,
        in their original order.
    """
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset.iloc[i]
        # Positional access must go through .iloc: plain vector[-1] on a
        # label-indexed Series relies on a deprecated integer fallback and is
        # a label lookup (KeyError) when the column labels are integers.
        class_value = vector.iloc[-1]
        separated.setdefault(class_value, []).append(vector)
    return separated

# separating the keys of each feature (e.g. for feature Sex: number of M and F in class 0, and number of M and F in class 1)
def seperate_x(dataset,k):
    """Group the values found at column position ``k`` by value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose column is inspected.
    k : int
        Zero-based POSITION of the column (not its label).

    Returns
    -------
    dict
        Maps each distinct value to a list containing every occurrence of that
        value, with keys in order of first appearance. Empty for an empty frame.
    """
    separated = dict()
    if len(dataset) == 0:
        return separated
    # Read the column once by position. Going through row Series and vector[k]
    # relied on the deprecated integer-position fallback of Series.__getitem__.
    for value in dataset.iloc[:, k]:
        separated.setdefault(value, []).append(value)
    return separated

# counting the occurrences of each key in each class
def keys_number(cat_columns,cat0,cat1):
    """Count how often each category value occurs within each class.

    Parameters
    ----------
    cat_columns : list
        Column names of the categorical frame with the label column last.
        Only its length is used: the last column is skipped.
    cat0, cat1 : pandas.DataFrame
        Rows belonging to class 0 and class 1, columns in ``cat_columns`` order.

    Returns
    -------
    tuple(dict, dict)
        ``(probs0, probs1)``: each maps a category value to the number of rows
        of that class carrying it.

    Notes
    -----
    All features share one flat dict per class, so a value that appears in two
    different columns keeps only the count of the later column.
    """
    n_features = len(cat_columns) - 1
    return (_count_category_values(cat0, n_features),
            _count_category_values(cat1, n_features))


def _count_category_values(frame, n_features):
    """Return {value: occurrences} over the first ``n_features`` columns of ``frame``."""
    counts = dict()
    if len(frame) == 0:
        return counts
    for k in range(n_features):
        column_counts = dict()
        for value in frame.iloc[:, k]:
            column_counts[value] = column_counts.get(value, 0) + 1
        # A later column overwrites the count of a value already seen in an
        # earlier column (flat-dict semantics the callers rely on).
        counts.update(column_counts)
    return counts

#calculating probability of each key in class 0 or class 1 (i.e, Likelihood of each key)
def probs(xcat,xcat_columns):
    """Compute, for every category value, its share of class 0 and of class 1.

    Parameters
    ----------
    xcat : pandas.DataFrame
        Categorical training features with the class label (0 or 1) as the
        last column.
    xcat_columns : list
        Column names of ``xcat`` (label last).

    Returns
    -------
    tuple(dict, dict)
        ``(p0, p1)`` where ``p0[value]`` is count_in_class_0 / (count_in_class_0
        + count_in_class_1) and ``p1[value]`` is the complementary fraction.

    Raises
    ------
    KeyError
        If ``xcat`` contains no row of class 0 or no row of class 1.

    Notes
    -----
    NOTE(review): these ratios are the share of each class among the rows that
    carry the value, not the share of the value within a class; confirm this is
    the intended "likelihood" before relying on the posterior scale.
    """
    model_cat = separate_by_class_cat(xcat)
    cat0=pd.DataFrame(model_cat[0])
    cat1=pd.DataFrame(model_cat[1])
    probs0,probs1 = keys_number(xcat_columns,cat0,cat1)
    p0=dict()
    p1=dict()
    # Walk the values of BOTH classes: a value seen in only one class used to
    # raise KeyError (class-0 only) or be dropped silently (class-1 only).
    all_values = list(probs0) + [cv for cv in probs1 if cv not in probs0]
    for cv in all_values:
        count0 = probs0.get(cv, 0)
        count1 = probs1.get(cv, 0)
        total = count0 + count1  # >= 1: cv was counted in at least one class
        p0[cv] = count0 / total
        p1[cv] = count1 / total
    return p0, p1

#predicting the likelihood for the test set
def test_prob(X_test,p0,p1):
    xp0=dict()
    for i in range(len(X_test)):
        l=1
        for cv in X_test:
            for k in p0:
                if X_test[cv].iloc[i]==k:
                    l*=p0[k]
        xp0[i]=l
    xp1=dict()
    for i in range(len(X_test)):
        l=1
        for cv in X_test:
            for k in p1:
                if X_test[cv].iloc[i]==k:
                    l*=p1[k]
        xp1[i]=l
    return xp0,xp1

In [360]:
#calculating likelihood for numerical columns
# the rows of the numerical dataset are separated by class (0 and 1)
def separate_by_class(dataset):
    """Bucket the rows of ``dataset`` by the label stored in their last position.

    ``dataset`` is indexed by integer position; each row must support ``row[-1]``.
    Returns a dict mapping every label to the list of its rows, in input order.
    """
    groups = {}
    for row_index in range(len(dataset)):
        row = dataset[row_index]
        groups.setdefault(row[-1], []).append(row)
    return groups

# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
 
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``."""
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    """Summarise every feature column of ``dataset``.

    Parameters
    ----------
    dataset : sequence of rows
        Each row lists the feature values followed by the class label.

    Returns
    -------
    list of tuple
        One ``(mean, stdev, count)`` tuple per feature column, in column order.
        The final (label) column is excluded. Empty input yields ``[]``.
    """
    columns = list(zip(*dataset))
    # Skip the label column up front instead of computing its statistics and
    # deleting them afterwards: it is wasted work and fails on non-numeric labels.
    feature_columns = columns[:-1]
    return [(mean(column), stdev(column), len(column)) for column in feature_columns]
 
# Split dataset by class, then calculate statistics (mean, standard deviation, count) for each column
def summarize_by_class(dataset):
    """Return {class label: per-column summaries} for the labelled ``dataset``.

    Rows are grouped with ``separate_by_class`` and every group is summarised
    with ``summarize_dataset``.
    """
    return {
        label: summarize_dataset(class_rows)
        for label, class_rows in separate_by_class(dataset).items()
    }

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian density with the given ``mean`` and ``stdev`` at ``x``."""
    squared_deviation = (x - mean) ** 2
    decay = exp(-(squared_deviation / (2 * stdev ** 2)))
    normaliser = 1 / (sqrt(2 * pi) * stdev)
    return normaliser * decay
 
# Calculate the prior and likelihood of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class: prior times the Gaussian density of each feature.

    ``summaries`` maps a class label to its list of ``(mean, stdev, count)``
    tuples; ``row[i]`` is matched with the i-th tuple. Returns a dict mapping
    each class label to its (unnormalised) score.
    """
    # The count stored with the first feature is the number of rows of the class.
    total_rows = sum(feature_stats[0][2] for feature_stats in summaries.values())
    scores = {}
    for label, feature_stats in summaries.items():
        score = feature_stats[0][2] / float(total_rows)  # class prior
        for position, (mu, sigma, _count) in enumerate(feature_stats):
            score *= calculate_probability(row[position], mu, sigma)
        scores[label] = score
    return scores
 

In [361]:
# picking the best probabilities between 0 and 1
def predict(probabilitiesx):
    """Return the label with the highest score in ``probabilitiesx``.

    Ties keep the label encountered first; an empty mapping yields ``None``.
    """
    winner, top_score = None, -1
    for label in probabilitiesx:
        score = probabilitiesx[label]
        if winner is not None and not (score > top_score):
            continue
        winner, top_score = label, score
    return winner

In [362]:
# calculating Accuracy score
def accuracy_rate(test, predictions):
    """Return the percentage of positions where ``predictions`` equals ``test``."""
    n_samples = len(test)
    hits = sum(1 for idx in range(n_samples) if test[idx] == predictions[idx])
    return (hits / float(n_samples)) * 100.0

In [363]:
#calculating precision, recall, f1
def confusion_matrix(actual_value,predicted_value):
    """Compute precision, recall and F1 for binary labels, with 1 as the positive class.

    Despite its name, the function returns the derived scores, not the matrix.

    Parameters
    ----------
    actual_value : sequence
        True labels, indexable by position.
    predicted_value : sequence
        Predicted labels (0 or 1); any other value is ignored.

    Returns
    -------
    tuple(float, float, float)
        ``(precision, recall, f1)``. A score whose denominator is zero (no
        positive predictions, no positive actuals, or precision + recall == 0)
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    tp = tn = fp = fn = 0
    for i in range(len(predicted_value)):
        predicted = predicted_value[i]
        is_correct = predicted == actual_value[i]
        if predicted == 0:
            if is_correct:
                tn += 1
            else:
                fn += 1
        elif predicted == 1:
            if is_correct:
                tp += 1
            else:
                fp += 1
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * (prec * recall) / (prec + recall) if (prec + recall) else 0.0
    return prec, recall, f1
           

In [364]:
# Read the dataset, drop rows with missing values, and keep the target column separately
O = pd.read_csv('heart.csv', header=0).dropna()
Y = O['HeartDisease']

In [365]:
# Split into train (data / Y_train) and test (X_test / Y_test) parts, 80% / 20%.
# O is passed whole, so both data and X_test still contain the HeartDisease column.
# NOTE(review): with shuffle=False the rows keep their file order and random_state
# presumably has no effect - confirm an unshuffled split is intended.
data, X_test,Y_train, Y_test = train_test_split(O,Y, test_size=0.2, random_state=42, shuffle= False)

In [366]:
# Partition the column names by dtype: object columns are categorical, the rest numerical
cat_columns = list(data.select_dtypes(include='object'))
num_columns = list(data.select_dtypes(exclude='object'))

print('The categorical columns are', cat_columns)
print('The numerical columns are', num_columns)

The categorical columns are ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
The numerical columns are ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'HeartDisease']


In [367]:
# FastingBS is stored as 0/1, so it is treated as categorical rather than numerical.
# HeartDisease is the target and is removed from the numerical feature list.
# The membership guards keep this cell idempotent: re-running it neither appends
# 'FastingBS' twice nor raises ValueError on a name that was already removed.
if 'FastingBS' not in cat_columns:
    cat_columns.append('FastingBS')
for column in ('FastingBS', 'HeartDisease'):
    if column in num_columns:
        num_columns.remove(column)
print('The categorical columns are', cat_columns)
print('The numerical columns are', num_columns)

The categorical columns are ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'FastingBS']
The numerical columns are ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']


In [368]:
# Numerical feature columns of the training and test splits
X_train_num = data.loc[:, num_columns]
X_test_num = X_test.loc[:, num_columns]

In [369]:
# Scale the numerical features with RobustScaler: the scaler is fitted on the
# training split only and then applied unchanged to the test split.
# NOTE(review): this cell rebinds X_train_num / X_test_num to its own outputs, so
# re-running it on its own feeds already-transformed arrays back in - re-run the
# previous cell first.
scaler = RobustScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)
# Append the labels as the last column: separate_by_class() reads the class from row[-1].
X_train_num=np.column_stack((X_train_num,Y_train))

In [370]:
# Fit the numerical part of the model: for every class, one
# (mean, stdev, row count) tuple per numerical column.
model = summarize_by_class(X_train_num)
print(model)

{0.0: [(-0.2953648915187375, 0.7353144662561769, 312), (0.0009615384615384517, 0.8274135727560343, 312), (0.03550691125841427, 0.6492434567841144, 312), (0.3632478632478631, 0.6965480553098424, 312), (-0.06645299145299181, 0.4557754785142438, 312)], 1.0: [(0.12504557054320092, 0.677323723290734, 422), (0.21623222748815166, 1.010375285092694, 422), (-0.45199403546429523, 1.058159977029336, 422), (-0.20600315955766177, 0.6702880862527503, 422), (0.48925750394944634, 0.727030008271457, 422)]}


In [371]:
# Compute and print, for each key, its share of class 0 (p0) and of class 1 (p1).
# The label goes in as the LAST column because the class is read from the last
# position of every row. The guard keeps the cell idempotent: an unconditional
# append would duplicate the 'HeartDisease' column on a re-run.
if 'HeartDisease' not in cat_columns:
    cat_columns.append('HeartDisease')
cat=(data[cat_columns])
p0,p1 = probs(cat,cat_columns)
print(p0)
print(p1)

{'M': 0.35067114093959734, 'F': 0.7463768115942029, 'ATA': 0.8689655172413793, 'NAP': 0.5862068965517241, 'TA': 0.53125, 'ASY': 0.20388349514563106, 'Normal': 0.462882096069869, 'ST': 0.3465909090909091, 'LVH': 0.39, 'N': 0.6288416075650118, 'Y': 0.14790996784565916, 'Up': 0.8141025641025641, 'Flat': 0.13368983957219252, 'Down': 0.16666666666666666, 0: 0.5118829981718465, 1: 0.1711229946524064}
{'M': 0.6493288590604027, 'F': 0.2536231884057971, 'ATA': 0.1310344827586207, 'NAP': 0.41379310344827586, 'TA': 0.46875, 'ASY': 0.7961165048543689, 'Normal': 0.537117903930131, 'ST': 0.6534090909090909, 'LVH': 0.61, 'N': 0.37115839243498816, 'Y': 0.8520900321543409, 'Up': 0.1858974358974359, 'Flat': 0.8663101604278075, 'Down': 0.8333333333333334, 0: 0.48811700182815354, 1: 0.8288770053475936}


In [372]:
# Keep only the categorical feature columns of the test split (label excluded) and
# compute the per-row categorical products for class 0 (xp0) and class 1 (xp1).
# Selecting the feature columns directly, instead of selecting the label and then
# dropping it, keeps the cell re-runnable after X_test has lost 'HeartDisease'.
# NOTE: X_test is overwritten here, so the numerical test features must already
# have been extracted from it.
cat_feature_columns = [column for column in cat_columns if column != 'HeartDisease']
X_test = X_test[cat_feature_columns]
xp0,xp1 = test_prob(X_test,p0,p1)


In [376]:
# Posterior per test row: the numerical score of each class (prior x Gaussian
# densities) multiplied by the categorical product of the same row, then the
# best-scoring class is kept as the prediction.
test = X_test_num
predictions = [calculate_class_probabilities(model, test[i]) for i in range(len(test))]
posterior = []
for j, class_scores in enumerate(predictions):
    class_scores[0] *= xp0[j]
    class_scores[1] *= xp1[j]
    posterior.append(predict(class_scores))


In [377]:
# Accuracy (in percent) on the test split; Y_test becomes a plain array so it can be indexed by position
Y_test = np.array(Y_test)
accuracy = accuracy_rate(Y_test, posterior)
print("Accuracy of your model is: ", accuracy)

Accuracy of your model is:  77.71739130434783


In [378]:
# Precision, recall and F1 score on the test split (label 1 is the positive class)
precision,recall,f1_score =confusion_matrix(Y_test,posterior)
print(precision,recall,f1_score)

0.7319587628865979 0.8255813953488372 0.7759562841530054
