## Naive Bayesian Classifier
This is a program to build a naive bayesian classifier that calculates conditional probabilities.

In [2]:
# Preparation
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Plotting libraries 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [3]:
dta = pd.read_csv('data/clean_data.csv')

In [6]:
dta.head()

Unnamed: 0,Participant_ID,Category,Age_yrs,Age_category,Education_yrs,Education_category,Sex,Marital_status,Income,Occupation_class,...,Vegetable,Vegetable_category,Fruit,Fruit_category,Salted_fish,Salted_fish_category,Instant_noodle,Instant_noodle_category,Tempe,Tempe_category
0,80001,Dementia,70.0,60-74,3.0,0-6 years,Female,Married,No Income,housewife,...,2 times/day,Frequent,Sometimes,Non Frequent,Sometimes,Non-Frequent,Sometimes,Non-Frequent,Few times/week,Non-Frequent
1,80002,Normal,63.0,60-74,0.0,0-6 years,Female,Single,< monthly minimum wage,profesional,...,3-4 times/week,Non Frequent,Sometimes,Non Frequent,Sometimes,Non-Frequent,Sometimes,Non-Frequent,Few times/week,Non-Frequent
2,80003,Normal,64.0,60-74,6.0,0-6 years,Female,Married,No Income,housewife,...,3-4 times/week,Non Frequent,Sometimes,Non Frequent,Sometimes,Non-Frequent,,,,
3,80004,Normal,62.0,60-74,6.0,0-6 years,Female,Married,≥ monthly minimum wage,housewife,...,2 times/day,Frequent,Sometimes,Non Frequent,Sometimes,Non-Frequent,Few times/week,Frequent,Few times/week,Non-Frequent
4,80005,Dementia,78.0,>75,3.0,0-6 years,Female,Married,No Income,housewife,...,2 times/day,Frequent,Sometimes,Non Frequent,Sometimes,Non-Frequent,Sometimes,Non-Frequent,Sometimes,Non-Frequent


In [5]:
dta.columns

Index(['Participant_ID', 'Category', 'Age_yrs', 'Age_category',
       'Education_yrs', 'Education_category', 'Sex', 'Marital_status',
       'Income', 'Occupation_class', 'Living_area', 'BP_sistol', 'BP_diastol',
       'BMI', 'GDS', 'Glucose', 'Triglyceride', 'HDL', 'Hypertension',
       'BMI_category', 'Diabetes', 'Tri_200', 'HDL_40', 'Smoking_status',
       'Stroke', 'Depression', 'INA_AD8', 'ADL', 'AMT', 'Intellectual_1',
       'Intellectual_2', 'Intellectual_3', 'Intellectual_4', 'Intellectual_5',
       'Intellectual_6', 'Intellectual_7', 'Intellectual_8', 'Social_1',
       'Social_2', 'Social_3', 'Social_4', 'Social_5', 'Social_6', 'Social_7',
       'Social_8', 'Social_9', 'Recreational_1', 'Recreational_2',
       'Recreational_3', 'Recreational_4', 'Recreational_5', 'Recreational_6',
       'Recreational_7', 'Recreational_8', 'Physical_1', 'Physical_2',
       'Physical_3', 'Physical_4', 'Physical_5', 'Intellectually_active',
       'Socially_active', 'Recreationally_act

In [7]:
# Create dummy variable for "dementia = 1" and "normal = 0"
dta_dummy = pd.get_dummies(dta, columns=['Category', 'Age_category', 'Education_category', 'Sex', 'Marital_status', 
                                         'Income', 'Occupation_class', 'Living_area', 
                                        'Hypertension', 'BMI_category', 'Diabetes', 'Tri_200', 'HDL_40', 
                                        'Smoking_status', 'Stroke', 'Depression', 'Intellectually_active', 
                                        'Socially_active', 'Recreationally_active', 'Physically_active',
                                        'Total_active', 'Carbo_category', 'Protein_category', 'Vegetable_category',
                                        'Fruit_category', 'Salted_fish_category', 'Instant_noodle_category',
                                        'Tempe_category'])

In [8]:
# Verify addition of dummy variables
dta_dummy

Unnamed: 0,Participant_ID,Age_yrs,Education_yrs,BP_sistol,BP_diastol,BMI,GDS,Glucose,Triglyceride,HDL,...,Vegetable_category_Frequent,Vegetable_category_Non Frequent,Fruit_category_Frequent,Fruit_category_Non Frequent,Salted_fish_category_Frequent,Salted_fish_category_Non-Frequent,Instant_noodle_category_Frequent,Instant_noodle_category_Non-Frequent,Tempe_category_Frequent,Tempe_category_Non-Frequent
0,80001,70.0,3.0,150.0,60.0,19.95,3.0,81.0,90.0,67.0,...,1,0,0,1,0,1,0,1,0,1
1,80002,63.0,0.0,140.0,90.0,22.04,1.0,82.0,61.0,70.0,...,0,1,0,1,0,1,0,1,0,1
2,80003,64.0,6.0,150.0,100.0,22.02,0.0,,,,...,0,1,0,1,0,1,0,0,0,0
3,80004,62.0,6.0,120.0,80.0,28.77,0.0,86.0,183.0,42.0,...,1,0,0,1,0,1,1,0,0,1
4,80005,78.0,3.0,210.0,100.0,24.79,6.0,95.0,229.0,60.0,...,1,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,80734,61.0,8.0,130.0,90.0,25.09,0.0,103.0,374.0,39.0,...,1,0,1,0,0,1,0,1,0,1
682,80735,66.0,6.0,160.0,100.0,24.99,0.0,170.0,233.0,37.0,...,1,0,1,0,1,0,0,1,0,1
683,80736,60.0,6.0,120.0,80.0,19.27,1.0,,,,...,1,0,0,0,0,1,0,1,0,1
684,80737,60.0,6.0,120.0,90.0,24.65,7.0,,,,...,1,0,1,0,1,0,0,1,0,1


In [9]:
# Set seed
np.random.seed(1234)

# Train-Test split (just using Pandas)
train = dta_dummy.sample(frac=.8).reset_index(drop=True)
test = dta_dummy.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 549 
Test Data: 137


Unnamed: 0,Participant_ID,Age_yrs,Education_yrs,BP_sistol,BP_diastol,BMI,GDS,Glucose,Triglyceride,HDL,...,Vegetable_category_Frequent,Vegetable_category_Non Frequent,Fruit_category_Frequent,Fruit_category_Non Frequent,Salted_fish_category_Frequent,Salted_fish_category_Non-Frequent,Instant_noodle_category_Frequent,Instant_noodle_category_Non-Frequent,Tempe_category_Frequent,Tempe_category_Non-Frequent
0,80218,68.0,3.0,170.0,100.0,15.86,3.0,,,,...,0,1,0,1,0,1,1,0,0,0
1,80734,61.0,8.0,130.0,90.0,25.09,0.0,103.0,374.0,39.0,...,1,0,1,0,0,1,0,1,0,1
2,80712,,4.0,200.0,110.0,15.64,7.0,,,,...,0,0,0,0,0,0,0,0,0,0
3,80439,70.0,2.0,160.0,80.0,14.5,0.0,95.0,71.0,68.0,...,0,1,0,1,1,0,1,0,1,0
4,80328,63.0,6.0,150.0,100.0,22.49,7.0,,,,...,0,1,1,0,0,0,0,0,0,0


In [10]:
dta_dummy.to_csv(r'data/dummy_data.csv', index = False)

In [17]:
train_temp = train[['Category_Dementia', 'Sex_Female', 'Marital_status_Married', 'Occupation_class_housewife',
           'Living_area_Rural', 'Hypertension_Hypertension', 'Diabetes_Diabetes', 'Smoking_status_Not Smoking',
           'Stroke_Normal', 'Depression_Depression', 'Intellectually_active_active', 'Socially_active_active',
           'Recreationally_active_active', 'Physically_active_active', 'Total_active_active', 'Carbo_category_Frequent',
           'Protein_category_Frequent', 'Vegetable_category_Frequent', 'Fruit_category_Frequent', 
           'Salted_fish_category_Frequent', 'Instant_noodle_category_Frequent', 'Tempe_category_Frequent']].copy()
test_temp = test[['Category_Dementia', 'Sex_Female', 'Marital_status_Married', 'Occupation_class_housewife',
           'Living_area_Rural', 'Hypertension_Hypertension', 'Diabetes_Diabetes', 'Smoking_status_Not Smoking',
           'Stroke_Normal', 'Depression_Depression', 'Intellectually_active_active', 'Socially_active_active',
           'Recreationally_active_active', 'Physically_active_active', 'Total_active_active', 'Carbo_category_Frequent',
           'Protein_category_Frequent', 'Vegetable_category_Frequent', 'Fruit_category_Frequent', 
           'Salted_fish_category_Frequent', 'Instant_noodle_category_Frequent', 'Tempe_category_Frequent']].copy()

### Calculate Class Probabilities
* Category_Dementia = 1: individual has dementia
* Category_Dementia = 0: individual does not have dementia

In [8]:
N = train.shape[0]

# Subset the data by class
d1 = train.query("Category_Dementia == 1")
d0 = train.query("Category_Dementia == 0")

# Calculate the probability for each class
pr_d1 = d1.shape[0]/N
pr_d0 = d0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(Category_Dementia = 1): {pr_d1}
Pr(Category_Dementia = 0): {pr_d0}
""")


Pr(Category_Dementia = 1): 0.2987249544626594
Pr(Category_Dementia = 0): 0.7012750455373407



### Calculate Conditional Probabilities

In [9]:
def calc_probs(data,outcome_var=""):
    '''
    This function calculates class and conditional probabilities for binary data.
    
    Arguments
    ---------
    data: dataset to be used for calculations
    var: the outcome variable that's being conditioned on
    
    Return
    ------
    dictionary (with tuple keys): (variable, it's value, and the outcome that's being conditioned on)
    '''
    # Generate empty dictionary containers
    class_probs = {};cond_probs = {}
    
    # Locate all variables that are not the outcome
    #vars = [v for v in data.columns if v != outcome_var]
    vars = ['Category_Dementia', 'Sex_Female', 'Marital_status_Married', 'Occupation_class_housewife',
           'Living_area_Rural', 'Hypertension_Hypertension', 'Diabetes_Diabetes', 'Smoking_status_Not Smoking',
           'Stroke_Normal', 'Depression_Depression', 'Intellectually_active_active', 'Socially_active_active',
           'Recreationally_active_active', 'Physically_active_active', 'Total_active_active', 'Carbo_category_Frequent',
           'Protein_category_Frequent', 'Vegetable_category_Frequent', 'Fruit_category_Frequent', 
           'Salted_fish_category_Frequent', 'Instant_noodle_category_Frequent', 'Tempe_category_Frequent']
    
    # Iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        
        # Calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        
        # Calculate the conditional probabilities for each variable given the class
        for v in vars:
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,0,y)] = round(pr,4)
            cond_probs[(v,1,y)] = round(1 - pr,4)
            
    # Return calculated values
    return class_probs, cond_probs


# Run the function on the training dataset
#class_probs, cond_probs = calc_probs(train,outcome_var="Category_Dementia")
class_probs, cond_probs = calc_probs(train_temp,outcome_var="Category_Dementia")

# Print results
print("class probabilities -- reported as {class value: class probability}",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities -- reported as {(variable, its value, conditional class value): probability}",end="\n\n")
pp.pprint(cond_probs)

class probabilities -- reported as {class value: class probability}

{0: 0.7012750455373407, 1: 0.2987249544626594}


conditional probabilities -- reported as {(variable, its value, conditional class value): probability}

{('Carbo_category_Frequent', 0, 0): 0.4571,
 ('Carbo_category_Frequent', 0, 1): 0.3659,
 ('Carbo_category_Frequent', 1, 0): 0.5429,
 ('Carbo_category_Frequent', 1, 1): 0.6341,
 ('Category_Dementia', 0, 0): 0.0,
 ('Category_Dementia', 0, 1): 1.0,
 ('Category_Dementia', 1, 0): 1.0,
 ('Category_Dementia', 1, 1): 0.0,
 ('Depression_Depression', 0, 0): 0.039,
 ('Depression_Depression', 0, 1): 0.1098,
 ('Depression_Depression', 1, 0): 0.961,
 ('Depression_Depression', 1, 1): 0.8902,
 ('Diabetes_Diabetes', 0, 0): 0.0597,
 ('Diabetes_Diabetes', 0, 1): 0.0854,
 ('Diabetes_Diabetes', 1, 0): 0.9403,
 ('Diabetes_Diabetes', 1, 1): 0.9146,
 ('Fruit_category_Frequent', 0, 0): 0.2364,
 ('Fruit_category_Frequent', 0, 1): 0.122,
 ('Fruit_category_Frequent', 1, 0): 0.7636,
 ('Fruit_cate

### Making Predictions

In [11]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train_temp, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,4.648576e-12,1.445157e-12,0
1,1.383106e-11,7.47284e-11,1
2,3.328801e-10,7.961641e-11,0
3,6.589915e-12,1.217673e-12,0
4,1.264813e-11,6.467615e-11,1


#### Predictive Accuracy:

In [15]:
accuracy = sum(train.Category_Dementia == preds.pred)/train.shape[0]
accuracy

0.3970856102003643

Only obtaining a predictive accuracy of 39.7% on the training data. This is because of our limited number of observations - we should aim for more than 549 observations in a training dataset.

In [19]:
test_preds = predict(test_temp, class_probs, cond_probs)
test_accuracy = sum(test.Category_Dementia == test_preds.pred)/test.shape[0]
test_accuracy

0.40875912408759124

On the test data, there's a slightly higher predictive accuracy of 40.9%, but still not ideal.

### Continuous Predictors