#### - Sobhan Moradian Daghigh
#### - 12/15/2021
#### - ML - EX02 - Q1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy.stats import multivariate_normal

In [2]:
dataset = pd.read_csv('./HeartDisease/heart.csv')
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
# Independent features: 
indeps = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


#### Splite the dataset into 80% of train and 20% of test.

In [5]:
x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=0.2)
train = pd.DataFrame(x_train)
train['target'] = y_train
test = pd.DataFrame(x_test)
test['target'] = y_test

### Bayesian Classifier

In [6]:
def cal_prior_probabilities(dataset):
    return np.log(dataset.groupby(by = 'target').apply(lambda x: np.divide(len(x), dataset.shape[0]))).values

In [246]:
# Continues with Noraml Distribution
def cal_probability_density(mean, cov, x):    
    var = multivariate_normal(mean=mean, cov=cov)
    prob = var.pdf(x)
    return prob

In [247]:
# Discrete
def cal_probability(dataset, col, x, class_i):
    frequent = dataset.groupby(by='target').apply(lambda a: a.iloc[:, col].value_counts()[x])[class_i]
    return frequent / dataset.groupby(by='target').size()[class_i]

In [248]:
def cal_likelihood_probabilities(df_row, n_unique_labels, dataset, cov, mean_continues):
    
    df_discrete = df_row[indeps]
    df_continues = df_row.drop(indeps)
    
    likelihood_probabilities = []
    for i in range(n_unique_labels):
        likelihood = 0
    
        # For Discretes
        for j in range(dataset.shape[1]):
            if dataset.columns[j] in indeps:
                likelihood += np.log(cal_probability(dataset, j, df_discrete[dataset.columns[j]], i))
        
        # For Continueses
        likelihood += np.log(cal_probability_density(mean_continues[i], cov[i], df_continues))
        
        likelihood_probabilities.append(likelihood)

    return likelihood_probabilities

In [249]:
def NBC_fit(dataset):
    
    unique_labels = dataset['target'].unique()
    n_unique_labels = len(unique_labels)
    
    dataset_continues = dataset.drop(indeps, axis=1)
    dataset_continues['target'] = dataset['target']
    dataset_continues0 = dataset_continues[dataset_continues['target'] == 0]
    dataset_continues1 = dataset_continues[dataset_continues['target'] == 1]
    dataset_continues0 = dataset_continues0.drop('target', axis=1)
    dataset_continues1 = dataset_continues1.drop('target', axis=1)
        
    cov0, mean0 = dataset_continues0.cov(), dataset_continues0.mean()
    cov1, mean1 = dataset_continues1.cov(), dataset_continues1.mean()
    cov, mean_continues = [cov0, cov1], [mean0, mean1]
    
    prior_probabilities = cal_prior_probabilities(dataset)

    return {
      'unique_labels': unique_labels,
      'n_unique_labels': n_unique_labels,
      'prior_probabilities': prior_probabilities,
      'dataset': dataset,
      'cov': cov,
      'mean_continues': mean_continues
    }

In [250]:
def predict(test_dataset, nbc):
    predictions = []
    for i in range(test_dataset.shape[0]):
        prior = nbc['prior_probabilities']
        likelihood = cal_likelihood_probabilities(test_dataset.iloc[i, :-1], nbc['n_unique_labels'], 
                                                  nbc['dataset'], nbc['cov'], nbc['mean_continues'])
        # log(a*b) = loga + logb
        probabilities = prior + likelihood
        mx_idx = np.argmax(probabilities)
        predictions.append(nbc['unique_labels'][mx_idx])

    return predictions

In [251]:
nbc = NBC_fit(train)
predictions = predict(test, nbc)
accuracy = accuracy_score(test.iloc[:, -1], predictions)
print('accuracy: {:.3f}'.format(accuracy))

accuracy: 0.844
