In [1]:
import numpy as np
import pandas as pd

In [2]:
heart_df = pd.read_csv("./heart.csv")
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [334]:
# One Hot Encoding
Sex = pd.get_dummies(heart_df.Sex,prefix="Sex")
ExerciseAngina = pd.get_dummies(heart_df.ExerciseAngina,prefix="ExerciseAngina")
ChestPainType = pd.get_dummies(heart_df.ChestPainType,prefix="ChestPainType")
RestingECG = pd.get_dummies(heart_df.RestingECG,prefix="RestingECG")
STSlope = pd.get_dummies(heart_df.ST_Slope,prefix="STSlope")

In [335]:
heart_df_encoded = pd.DataFrame()
heart_df_encoded["HeartDisease"] = heart_df["HeartDisease"]
heart_df_encoded = heart_df_encoded.join(Sex)
heart_df_encoded = heart_df_encoded.join(ChestPainType)
heart_df_encoded = heart_df_encoded.join(RestingECG)
heart_df_encoded = heart_df_encoded.join(ExerciseAngina)
heart_df_encoded = heart_df_encoded.join(STSlope)
heart_df_encoded.head()

Unnamed: 0,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,STSlope_Down,STSlope_Flat,STSlope_Up
0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [336]:
from sklearn.preprocessing import MinMaxScaler
def normalize(df):
    scaler = MinMaxScaler()
    scaler.fit(df)
    scaled = scaler.fit_transform(df)
    return pd.DataFrame(scaled)

In [337]:
# Normalize Continous Values
heart_df_encoded["RestingBP"] = normalize(heart_df["RestingBP"].values.reshape(-1, 1))
heart_df_encoded["Cholesterol"] = normalize(heart_df["Cholesterol"].values.reshape(-1, 1))
heart_df_encoded["FastingBS"] = normalize(heart_df["FastingBS"].values.reshape(-1, 1))
heart_df_encoded["MaxHR"] = normalize(heart_df["MaxHR"].values.reshape(-1, 1))
heart_df_encoded["Oldpeak"] = normalize(heart_df["Oldpeak"].values.reshape(-1, 1))
heart_df_encoded["Age"] = normalize(heart_df["Age"].values.reshape(-1, 1))
heart_df_encoded.head()

Unnamed: 0,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,...,ExerciseAngina_Y,STSlope_Down,STSlope_Flat,STSlope_Up,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Age
0,0,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0.7,0.47927,0.0,0.788732,0.295455,0.244898
1,1,1,0,0,0,1,0,0,1,0,...,0,0,1,0,0.8,0.298507,0.0,0.676056,0.409091,0.428571
2,0,0,1,0,1,0,0,0,0,1,...,0,0,0,1,0.65,0.46932,0.0,0.267606,0.295455,0.183673
3,1,1,0,1,0,0,0,0,1,0,...,1,0,1,0,0.69,0.354892,0.0,0.338028,0.465909,0.408163
4,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0.75,0.323383,0.0,0.43662,0.295455,0.530612


In [338]:
heart_df_encoded_class_0 = heart_df_encoded[heart_df_encoded['HeartDisease'] == 0]
heart_df_encoded_class_0_y = heart_df_encoded_class_0["HeartDisease"]
heart_df_encoded_class_0_x = heart_df_encoded_class_0.drop(["HeartDisease"], axis=1)
heart_df_encoded_class_1 = heart_df_encoded[heart_df_encoded['HeartDisease'] == 1]
heart_df_encoded_class_1_y = heart_df_encoded_class_1["HeartDisease"]
heart_df_encoded_class_1_x = heart_df_encoded_class_1.drop(["HeartDisease"], axis=1)


In [339]:
from sklearn.model_selection  import train_test_split
x_heart_train_df_1, x_heart_test_df_1, y_heart_train_df_1, y_heart_test_df_1 = train_test_split(heart_df_encoded_class_1_x, heart_df_encoded_class_1_y, test_size=.2)
x_heart_train_df_0, x_heart_test_df_0, y_heart_train_df_0, y_heart_test_df_0 = train_test_split(heart_df_encoded_class_0_x, heart_df_encoded_class_0_y, test_size=.2)

In [340]:
def get_inital_guess_P(df1,df2):
    return len(df1)/(len(df1)+len(df2))

In [341]:
def categorical_column_probability(col):
    return (sum(col)+1)/len(col)
    

In [342]:
def gaussian_column_stats(col):
    return {"Standard Deviation":np.std(col), "Mean":np.std(col)}

In [343]:
def gaussian_probability(val, sd, mean):
    return 1/(np.sqrt(2*np.pi*(sd**2)))*np.exp((-((val-mean)**2))/(2*(sd**2)))

In [344]:
def get_stat_dict(df1,df2):
    prob_dict = pd.DataFrame({'inital': get_inital_guess_P(df1,df2)}, index=[0])
    
    for col in df1.columns:
        if(str(df1[col].dtype) == 'uint8'):
            prob_dict[col] = categorical_column_probability(df1[col])
        elif(str(df1[col].dtype) == 'float64'):
            temp = gaussian_column_stats(df1[col])
            prob_dict[col+'_mean'] = temp['Mean']
            prob_dict[col+'_sd'] = temp['Standard Deviation']
    return prob_dict

In [353]:
df_1_stats = get_stat_dict(x_heart_train_df_1,x_heart_train_df_0)
df_0_stats = get_stat_dict(x_heart_train_df_0,x_heart_train_df_1)
df_0_stats

Unnamed: 0,inital,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,...,Cholesterol_mean,Cholesterol_sd,FastingBS_mean,FastingBS_sd,MaxHR_mean,MaxHR_sd,Oldpeak_mean,Oldpeak_sd,Age_mean,Age_sd
0,0.446866,0.353659,0.652439,0.262195,0.353659,0.335366,0.060976,0.207317,0.64939,0.152439,...,0.128299,0.128299,0.300811,0.300811,0.161257,0.161257,0.079911,0.079911,0.195764,0.195764


In [354]:
df_1_stats

Unnamed: 0,inital,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,...,Cholesterol_mean,Cholesterol_sd,FastingBS_mean,FastingBS_sd,MaxHR_mean,MaxHR_sd,Oldpeak_mean,Oldpeak_sd,Age_mean,Age_sd
0,0.553134,0.100985,0.903941,0.785714,0.046798,0.140394,0.036946,0.221675,0.564039,0.221675,...,0.210497,0.210497,0.477661,0.477661,0.164134,0.164134,0.132149,0.132149,0.178665,0.178665


In [355]:
def baysian_prob(model_prob, actual_value):
    if(actual_value == 0):
        return 1
    else:
        return model_prob

In [356]:
def model(model_df, x_df):
    pred = []
    x_array = x_df.to_numpy()
    for i in range(len(x_df)):
        l = []
        l.append(model_df.loc[0,'inital'])
        l.append(baysian_prob(model_df.loc[0,'Sex_F'],x_array[i,0]))
        l.append(baysian_prob(model_df.loc[0,'Sex_M'],x_array[i,1]))
        l.append(baysian_prob(model_df.loc[0,'ChestPainType_ASY'],x_array[i,2]))
        l.append(baysian_prob(model_df.loc[0,'ChestPainType_ATA'],x_array[i,3]))
        l.append(baysian_prob(model_df.loc[0,'ChestPainType_NAP'],x_array[i,4]))
        l.append(baysian_prob(model_df.loc[0,'ChestPainType_TA'],x_array[i,5]))
        l.append(baysian_prob(model_df.loc[0,'RestingECG_LVH'],x_array[i,6]))
        l.append(baysian_prob(model_df.loc[0,'RestingECG_Normal'],x_array[i,7]))
        l.append(baysian_prob(model_df.loc[0,'RestingECG_ST'],x_array[i,8]))
        l.append(baysian_prob(model_df.loc[0,'ExerciseAngina_N'],x_array[i,9]))
        l.append(baysian_prob(model_df.loc[0,'ExerciseAngina_Y'],x_array[i,10]))
        l.append(baysian_prob(model_df.loc[0,'STSlope_Down'],x_array[i,11]))
        l.append(baysian_prob(model_df.loc[0,'STSlope_Flat'],x_array[i,12]))
        l.append(baysian_prob(model_df.loc[0,'STSlope_Up'],x_array[i,13]))
        l.append(gaussian_probability(x_array[i,14],model_df.loc[0,'RestingBP_sd'],model_df.loc[0,'RestingBP_mean']))
        l.append(gaussian_probability(x_array[i,15],model_df.loc[0,'Cholesterol_sd'],model_df.loc[0,'Cholesterol_mean']))
        l.append(gaussian_probability(x_array[i,16],model_df.loc[0,'FastingBS_sd'],model_df.loc[0,'FastingBS_mean']))
        l.append(gaussian_probability(x_array[i,17],model_df.loc[0,'MaxHR_sd'],model_df.loc[0,'MaxHR_mean']))
        l.append(gaussian_probability(x_array[i,18],model_df.loc[0,'Oldpeak_sd'],model_df.loc[0,'Oldpeak_mean']))
        l.append(gaussian_probability(x_array[i,19],model_df.loc[0,'Age_sd'],model_df.loc[0,'Age_mean']))
        pred.append(np.sum(np.log(l)))
    return pred

In [351]:
def predict(class_0_stats, class_1_stats, x):
    predicted_0 = model(class_0_stats,x)
    predicted_1 = model(class_1_stats,x)
    l = []
    for i in range(len(predicted_0)):
        if(predicted_0[i] > predicted_1[i]):
            l.append(0)
        else:
            l.append(1)
    return l

In [357]:
from sklearn import metrics
model_pred = predict(df_0_stats,df_1_stats,x_heart_test_df_0)
actual = y_heart_test_df_0

accuracy = metrics.accuracy_score(actual, model_pred)
print(f"Accuracy: {accuracy}")
precision = metrics.precision_score(actual, model_pred)
print(f"Precision: {precision}")
recall = metrics.recall_score(actual, model_pred)
print(f"Recall: {recall}")
f1_score = metrics.f1_score(actual, model_pred)
print(f"F1 Score: {f1_score}")
print(metrics.confusion_matrix(actual, model_pred))

Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
[[ 0 82]
 [ 0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
