In [74]:
import pandas as pd
import math
import numpy as np
import random
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import confusion_matrix, accuracy_score


def input_filters(input_data, label_column, among_which_labels_list, based_list):
    """Load the training data and keep only the requested features and classes.

    Parameters
    ----------
    input_data : str, path-like, file-like or pandas.DataFrame
        CSV source handed to ``pd.read_csv``. An already-loaded DataFrame is
        also accepted and is used as-is (it is never modified).
    label_column : str
        Name of the column holding the class labels.
    among_which_labels_list : list
        Class labels to keep; rows carrying any other label are dropped.
    based_list : list of str
        Feature columns to keep.

    Returns
    -------
    pandas.DataFrame
        The columns of ``based_list`` followed by ``label_column`` (not
        duplicated if it is already listed), restricted to rows whose label
        is in ``among_which_labels_list``. The original row index is kept.
    """
    if isinstance(input_data, pd.DataFrame):
        raw = input_data
    else:
        raw = pd.read_csv(input_data)

    # Pick features and label in one selection instead of assigning the label
    # onto a column-subset slice, which triggers SettingWithCopyWarning.
    columns = list(based_list)
    if label_column not in columns:
        columns.append(label_column)

    selected = raw.loc[:, columns]
    keep = selected[label_column].isin(among_which_labels_list)

    # Explicit copy so callers can freely add columns to the result.
    return selected[keep].copy()

In [75]:
def class_prob(input_data, label_column):
    """Estimate class probabilities from the label frequencies.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing the label column.
    label_column : str
        Name of the column holding the class labels.

    Returns
    -------
    tuple (dict, list)
        ``{label: count / total_count}`` and the list of labels, both in the
        order produced by ``value_counts()``.
    """
    counts = input_data[label_column].value_counts()
    total = counts.sum()

    # Relative frequency of each label among all counted rows.
    probabilities = {
        label: count / total for label, count in counts.to_dict().items()
    }

    return probabilities, list(probabilities)


In [76]:
def test_to_row_dict(test_data):
    
    numeric_cols = [col for col in test_data.columns if test_data[col].dtype in ['int64', 'float64']]
    test_data_numeric=test_data.loc[:,numeric_cols].to_dict()
    max_length = max(len(v) for v in test_data_numeric.values())

    output_dict = {}

    for i in range(max_length):
        new_dict = {}
        for key, value in test_data_numeric.items():
            if i < len(value):
                new_dict[key] = value[i]
            else:
                new_dict[key] = None
        output_dict[i+1] = new_dict

    return output_dict


In [77]:
def expform(x, mean, var, min_var=1e-9):
    """Evaluate the Gaussian density with the given mean and variance at ``x``.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which to evaluate the density.
    mean : float
        Mean of the distribution.
    var : float
        Variance of the distribution.
    min_var : float, optional
        Lower bound applied to ``var`` before it is used. Prevents the
        division by zero that a zero variance (all samples identical) would
        otherwise cause. Variances at or above this bound are used unchanged.

    Returns
    -------
    float or numpy.ndarray
        ``exp(-(x - mean)**2 / (2 * var)) / sqrt(2 * pi * var)``.
        A NaN variance still yields NaN.
    """
    # Floor the variance; np.maximum keeps scalar and array inputs working.
    var = np.maximum(var, min_var)

    expo = -(x - mean) ** 2 / (2 * var)
    prob = (1 / np.sqrt(2 * np.pi * var)) * np.exp(expo)

    return prob


In [78]:
def mean_var(input_data, label_column):
    """Compute the per-class mean and variance of every numeric feature.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Training frame containing the features and the label column.
    label_column : str
        Name of the column holding the class labels; it is never treated as
        a feature.

    Returns
    -------
    dict
        ``{column: {label: [mean, variance]}}`` for each numeric column other
        than ``label_column`` and each distinct label. The variance comes
        from ``Series.var()`` with its default arguments.
    """
    class_labels = input_data[label_column].unique()

    # Decide once per column whether it is a usable numeric feature.
    feature_columns = [
        name
        for name in list(input_data.columns.values)
        if name != label_column and is_numeric_dtype(input_data[name])
    ]

    stats = {}
    for feature in feature_columns:
        for label in class_labels:
            class_rows = input_data[input_data[label_column] == label]
            values = class_rows.loc[:, feature]
            stats.setdefault(feature, {})[label] = [values.mean(), values.var()]

    return stats


In [79]:
def prior(input_data, label_column, labels_list, test_data_in_dict):
    """Evaluate the per-feature Gaussian density of every test row per class.

    Despite the name, the values returned are the class-conditional densities
    produced by ``expform`` (one per feature and class), not class priors.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Training frame; per-class means and variances come from ``mean_var``.
    label_column : str
        Name of the label column in ``input_data``.
    labels_list : list
        Class labels to evaluate.
    test_data_in_dict : dict
        ``{row_key: {column: value}}`` as built by ``test_to_row_dict``.

    Returns
    -------
    dict
        ``{row_key: {(column, label): density}}``. Empty input gives ``{}``.
        Test columns with no training statistics are skipped.
    """
    stats = mean_var(input_data, label_column)
    prior_dict = {}

    # Nothing to score: avoid indexing the first row of an empty mapping.
    if not test_data_in_dict:
        return prior_dict

    # Column set is taken from the first row. Columns the model has no
    # statistics for (e.g. an id column in the test file) are ignored rather
    # than raising KeyError.
    first_row = next(iter(test_data_in_dict.values()))
    col_keys = [col_key for col_key in first_row.keys() if col_key in stats]

    for row_key, row in test_data_in_dict.items():
        row_densities = {}

        for col_key in col_keys:
            col_val = row[col_key]

            for label in labels_list:
                mean, var = stats[col_key][label]
                row_densities[col_key, label] = expform(col_val, mean, var)

        prior_dict[row_key] = row_densities

    return prior_dict


In [80]:
def posterior(input_data, label_column, labels_list, test_data_in_dict):
    """Compute the normalized class posterior of every test row.

    For each row and class, the per-feature densities from ``prior`` are
    multiplied together and by the class probability from ``class_prob``;
    the results are then divided by their sum over ``labels_list``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Training frame.
    label_column : str
        Name of the label column in ``input_data``.
    labels_list : list
        Class labels to score.
    test_data_in_dict : dict
        ``{row_key: {column: value}}`` as built by ``test_to_row_dict``.

    Returns
    -------
    dict
        ``{row_key: {label: posterior}}`` with the same row keys as the
        likelihood table returned by ``prior``.
    """
    prior_dict = prior(input_data, label_column, labels_list, test_data_in_dict)
    prob_of_class = class_prob(input_data, label_column)[0]

    # Features are multiplied in the column order reported by mean_var.
    columns = list(mean_var(input_data, label_column))

    posterior_dict = {}
    # Iterate over the actual row keys instead of assuming they are 1..n.
    for row_key, row_likelihoods in prior_dict.items():
        row_posterior = {}
        denom = 0
        for labels in labels_list:
            p = 1
            for column in columns:
                p = p * row_likelihoods[(column, labels)]
            num = p * prob_of_class[labels]
            denom += num
            row_posterior[labels] = num

        if denom == 0:
            # Every joint probability is zero (e.g. the densities underflowed),
            # so normalizing would divide by zero. Fall back to the class
            # probabilities renormalized over the requested labels.
            prior_total = sum(prob_of_class[labels] for labels in labels_list)
            row_posterior = {
                labels: prob_of_class[labels] / prior_total for labels in labels_list
            }
        else:
            for labels in labels_list:
                row_posterior[labels] /= denom

        posterior_dict[row_key] = row_posterior
    return posterior_dict



In [81]:
# Call Bayes_class to run the full pipeline and get the predictions for the test file.

def Bayes_class(input_data, test_data, label_column, among_which_labels_list, based_list):
    """Train on ``input_data`` and predict a class for every row of ``test_data``.

    Parameters
    ----------
    input_data : str or path-like
        Training CSV, forwarded to ``input_filters``.
    test_data : str, path-like or file-like
        Test CSV, read once with ``pd.read_csv``.
    label_column : str
        Name of the label column in the training data.
    among_which_labels_list : list
        Class labels to keep in the training data.
    based_list : list of str
        Feature columns to train on.

    Returns
    -------
    pandas.DataFrame
        The test frame with two added columns: ``"predicted"`` (the label with
        the highest posterior per row) and ``"Input"`` (the training labels,
        assigned by index alignment).
    """
    input_filtered = input_filters(input_data, label_column, among_which_labels_list, based_list)
    labels = class_prob(input_filtered, label_column)[1]

    # Read the test file once and reuse the frame for both the row
    # dictionaries and the returned result.
    test_df = pd.read_csv(test_data)
    test_row_dict = test_to_row_dict(test_df)

    posterior_dict = posterior(input_filtered, label_column, labels, test_row_dict)

    # Most probable label for each row, in row order.
    predictions = [
        max(row_posterior, key=row_posterior.get)
        for row_posterior in posterior_dict.values()
    ]

    test_df["predicted"] = predictions
    # Use the caller-supplied label column rather than a hard-coded "De".
    # NOTE(review): this aligns training labels to test rows by index, so it
    # is only meaningful when the test rows correspond to the training rows.
    test_df["Input"] = input_filtered[label_column]
    return test_df
    
# Train on the admissions file and classify every row of the test file,
# using GPA and GMAT as features and "De" as the label column.
result = Bayes_class(
    input_data="./admission-1.csv",
    test_data="./admission-1 test.csv",
    label_column="De",
    among_which_labels_list=["admit", "notadmit", "border"],
    based_list=["GPA", "GMAT"],
)
result.head(84)


Unnamed: 0,GPA,GMAT,predicted,Input
0,2.96,596,admit,admit
1,3.14,473,border,admit
2,3.22,482,border,admit
3,3.29,527,admit,admit
4,3.69,505,admit,admit
...,...,...,...,...
79,3.03,438,border,border
80,3.05,399,border,border
81,2.85,483,border,border
82,3.01,453,border,border


In [82]:
def confusion_matrix_and_accuracy(true_labels, predicted_labels):
    """Return the confusion matrix and accuracy for a set of predictions.

    Parameters
    ----------
    true_labels : array-like
        Reference labels.
    predicted_labels : array-like
        Labels produced by the classifier, in the same order.

    Returns
    -------
    tuple
        ``(confusion_matrix(true_labels, predicted_labels),
        accuracy_score(true_labels, predicted_labels))``.
    """
    return (
        confusion_matrix(true_labels, predicted_labels),
        accuracy_score(true_labels, predicted_labels),
    )

# Score the predictions against the labels stored in the "Input" column.
confusion, accuracy = confusion_matrix_and_accuracy(
    true_labels=result['Input'],
    predicted_labels=result['predicted'],
)
print("Confusion:\n", confusion)
print("\nAccuracy:", accuracy)


Confusion:
 [[28  3  0]
 [ 1 23  2]
 [ 0  2 26]]

Accuracy: 0.9058823529411765
