In [1]:
#Initialize all the packages.
import os
import json
import numpy as np
import pandas as pd
from io import StringIO
import random
from sklearn.metrics import f1_score

In [2]:
#Read the data
with open("/home/lsuman/projects/naive_bayes_scratch/data/haberman.data", 'r') as f:
    data_dump = f.read()
data_ = pd.read_csv(StringIO(data_dump), sep=',', names = ['age', 'year_o_op', 'pos_nodes', 'survival_state'])

In [3]:
#Define classes and features.
unique_classes = data_['survival_state'].unique()
features = ['age', 'year_o_op', 'pos_nodes']

In [4]:
#Split the dataset into training and testing (70, 30%).
data_ = data_.sample(frac=1)
test_df = data_.sample(frac = 0.3)
train_df = data_.drop(test_df.index)

In [5]:
#Create probability map for feature values according to class values.
def create_prob_map(classes, train_df)->list:
    class_prob = dict()
    class_ind_prob = dict()
    for class_val in classes:
        feature_prob = dict()
        data_df = train_df[train_df['survival_state'] == class_val]
        total_count = len(data_df)
        for feature_val in features:
            feature_val_prob = dict()
            for val, count in data_df[feature_val].value_counts().iteritems():
    #             print("feature is", feature_val, "Value is", val)
                prob = float(count/total_count)
                feature_val_prob[val] = prob
            feature_prob[feature_val] = feature_val_prob
        class_ind_prob[class_val] = feature_prob
        class_prob[class_val] = total_count/len(train_df)
    return [class_prob, class_ind_prob]

#Calculate the probability for given new feature.
def calculate_prob(new_feature, dict_struct, class_prob):
    if not isinstance(new_feature, pd.Series):
        raise IOError("Argument should be in pandas series")
    out_prob = dict()
    for class_val in list(class_prob.keys()):
        class_prob_ = class_prob[class_val]
        for feature, val in new_feature.iteritems():
            try:
                class_prob_ *= dict_struct[class_val][feature][val]
            except KeyError:
                class_prob_ = 0
        out_prob[class_val] = class_prob_
    return out_prob
                
#Claasify the data according to given probability distribution.
def classify(new_feature, dict_struct, class_prob):
    predicted_prob = calculate_prob(new_feature, dict_struct, class_prob)
    mx = 0
    predicted_class = ''
    for class_, prob in predicted_prob.items():
        if prob>mx:
            mx = prob
            predicted_class = class_
    return predicted_class
    
#Calculate the simple accuracy.
def cal_accuracy(data_df, features, prob_map, class_prob):
    correct_predicted_ops = []
    actual_ops = []
    for i in data_df.index:
        op = classify(data_df.loc[i, features], prob_map, class_prob)
        if op == data_df.loc[i, 'survival_state']:   
            correct_predicted_ops.append(op)
        actual_ops.append(data_df.loc[i, 'survival_state'])
    accuracy = len(correct_predicted_ops)/len(actual_ops)
    return accuracy

In [155]:
#Accuracy for test data.
class_prob_, class_ind_prob_ = create_prob_map(unique_classes, train_df)    
test_accuracy = cal_accuracy(test_df, features, prob_map=class_ind_prob_, class_prob=class_prob_)
print("Test Acuracy is: ", test_accuracy*100, '%')

Test Acuracy is:  67.3913043478261 %
