In [None]:
import sys
!{sys.executable} -m pip3 install pandas
!{sys.executable} -m pip3 install numpy

In [8]:
import pandas
import math
import numpy as np

###### Read and show data

In [9]:
data = pandas.read_csv('heart.csv')
print(data.shape)
data.head()

(1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


###### Spliting data to train and test,  define discrete and continus dimentions

In [10]:
data = data.sample(frac=1).reset_index(drop=True)  # shuffeling data
train , test = data[:int(len(data)*0.8)], data[int(len(data)*0.8):]  # spliting data
discrete_dimensions = ['slope', 'ca', 'thal', 'sex', 'cp', 'fbs', 'restecg', 'exang']
continuous_dimensions = [col for col in data.columns if col not in discrete_dimensions and col != 'target']

#### Part A: Discrete dimensions are independent and continuous dimensions have Gaussian distribution

In [11]:
def get_gaussian_params(data, features):
    params = {}  # gaussian parameters for any class
    classes_data = data.groupby('target')
    for name, class_data in classes_data:
        params[name] = {'cov': class_data[features].cov(), 'mean': class_data[features].mean()}
    return params

def get_bayes_params(data, features):
    params = {}  # bayes parameters for any class: count of any value in any feature
    classes_data = data.groupby('target')
    for name, class_data in classes_data:
        params[name] = {}
        for col in features:
            params[name][col] = dict(class_data[col].value_counts())
        params[name]['class_size'] = class_data.shape[0]
    params['data_size'] = data.shape[0]
    return params

def calculate_gaussian_prob(data, params):
    """ This function gets a data and prometers of normal distribution and calculates probability """
    cov = params['cov']
    mean = params['mean']
    cov_det = np.linalg.det(cov)
    first_term = 1/((2*math.pi)*(cov_det**0.5))  ## first term of normal distribution formula
    data_diff_mean = data - mean
    cov_inv = np.linalg.inv(cov)
    second_term = np.exp(-0.5*data_diff_mean@cov_inv@data_diff_mean.T)  ## second term of normal distribution formula
    prob = first_term*second_term
    return prob 

def calculate_nive_prob(data, params):
    """ This function gets some value in adiscrete features and returns probability of that values in this features """
    prob = 1
    feature_number = len(data.keys())
    for col in data.keys():
        if data[col] in params[col]:
            prob *= ((params[col][data[col]]+1)/ (params['class_size']+feature_number))
        else:
            prob *= (1/ (params['class_size']+feature_number))
    return prob

def calculate_posterior_prob(data, class_name, gaussian_params, bayes_params):
    """ This function gets a data and class and returns probability of that class given that data: P(y|X) """
    continuous_part = data[continuous_dimensions]
    discrete_part = data[discrete_dimensions] 
    discrete_part_prob = calculate_nive_prob(discrete_part, bayes_params[class_name])
    continuous_part_prob = calculate_gaussian_prob(continuous_part, gaussian_params[class_name])
    prob_data_in_class =  discrete_part_prob*continuous_part_prob  # P(X|y)
    prob_class = bayes_params[class_name]['class_size']/bayes_params['data_size']
    return  prob_data_in_class*prob_class

##### Model evaluation on test and training data

In [12]:
def evaluate_model(data):
    """ This function gest train or test data and predicts class of them and then evaluates results """
    class_labels = data['target'].unique()
    confusion_matrix = {'FP':0, 'FN':0, 'TP':0, 'TN':0}
    i = 0
    for index, data_sample in data.iterrows():
        class_probes = {}
        for label in class_labels:
            prob = calculate_posterior_prob(data_sample, label, gaussian_params, bayes_params)
            class_probes[label] = prob
        predicted_label = max(class_probes, key=lambda x:class_probes[x])
        if data_sample['target'] == 1:
            if predicted_label == 1:
                confusion_matrix['TP'] += 1
            else:
                confusion_matrix['FN'] += 1
        else:
            if predicted_label == 1:
                confusion_matrix['FP'] += 1
            else:
                confusion_matrix['TN'] += 1
    print("confusion matrix: " + str(confusion_matrix))
    print('\t precision: ' + str(confusion_matrix['TP']/(confusion_matrix['TP']+confusion_matrix['FP'])))
    print('\t recall: ' + str(confusion_matrix['TP']/(confusion_matrix['TP']+confusion_matrix['FN'])))

gaussian_params = get_gaussian_params(train, continuous_dimensions)
bayes_params = get_bayes_params(train, discrete_dimensions)

print("### train ###")
evaluate_model(train)
print("### test ###")
evaluate_model(test)

### train ###
confusion matrix: {'FP': 65, 'FN': 50, 'TP': 366, 'TN': 339}
	 precision: 0.8491879350348028
	 recall: 0.8798076923076923
### test ###
confusion matrix: {'FP': 15, 'FN': 12, 'TP': 98, 'TN': 80}
	 precision: 0.8672566371681416
	 recall: 0.8909090909090909


#### Part B: All dimensions are independent

In [13]:
def calculate_posterior_prob2(data, class_name, bayes_params):
    """ This function gets a data and calculates posterior probability P(Y|X) """
    prior_prob = calculate_nive_prob(data, bayes_params[class_name]) # P(X|y)
    class_prob = bayes_params[class_name]['class_size']/bayes_params['data_size']
    return  prior_prob*class_prob

##### Model evaluation on test and training data

In [14]:
def evaluate_model2(data):
    """ This function gest train or test data and predicts class of them and then evaluates results """
    class_labels = data['target'].unique()
    confusion_matrix = {'FP':0, 'FN':0, 'TP':0, 'TN':0}
    for index, data_sample in data.iterrows():
        class_probes = {}
        for label in class_labels:
            prob = calculate_posterior_prob2(data_sample, label, bayes_params)
            class_probes[label] = prob
        predicted_label = max(class_probes, key=lambda x:class_probes[x])
        if data_sample['target'] == 1:
            if predicted_label == 1:
                confusion_matrix['TP'] += 1
            else:
                confusion_matrix['FN'] += 1
        else:
            if predicted_label == 1:
                confusion_matrix['FP'] += 1
            else:
                confusion_matrix['TN'] += 1
    print("confusion matrix: " + str(confusion_matrix))
    print('\t precision: ' + str(confusion_matrix['TP']/(confusion_matrix['TP']+confusion_matrix['FP'])))
    print('\t recall: ' + str(confusion_matrix['TP']/(confusion_matrix['TP']+confusion_matrix['FN'])))

bayes_params = get_bayes_params(train, train.columns)
print("### train ###")
evaluate_model2(train)
print("### test ###")
evaluate_model2(test)

### train ###
confusion matrix: {'FP': 0, 'FN': 0, 'TP': 416, 'TN': 404}
	 precision: 1.0
	 recall: 1.0
### test ###
confusion matrix: {'FP': 0, 'FN': 0, 'TP': 110, 'TN': 95}
	 precision: 1.0
	 recall: 1.0


#### Part C: Remove 'chol' and 'oldpeak'  features and repeat Part B

###### Remove 'chol' feature

In [15]:
train1, test1 = train.drop(columns='chol'), test.drop(columns='chol')
bayes_params = get_bayes_params(train1, train1.columns)
print("### train ###")
evaluate_model2(train1)
print("### test ###")
evaluate_model2(test1)

### train ###
confusion matrix: {'FP': 4, 'FN': 0, 'TP': 416, 'TN': 400}
	 precision: 0.9904761904761905
	 recall: 1.0
### test ###
confusion matrix: {'FP': 0, 'FN': 0, 'TP': 110, 'TN': 95}
	 precision: 1.0
	 recall: 1.0


###### Remove 'oldpeak' feature

In [16]:
train2, test2 = train.drop(columns='oldpeak'), test.drop(columns='oldpeak')
bayes_params = get_bayes_params(train2, train2.columns)
print("### train ###")
evaluate_model2(train2)
print("### test ###")
evaluate_model2(test2)

### train ###
confusion matrix: {'FP': 0, 'FN': 0, 'TP': 416, 'TN': 404}
	 precision: 1.0
	 recall: 1.0
### test ###
confusion matrix: {'FP': 0, 'FN': 0, 'TP': 110, 'TN': 95}
	 precision: 1.0
	 recall: 1.0
