# Naive Bayes Algorithm

In [29]:
import pandas as pd
from NaiveBayes import NaiveBayes
from sklearn.model_selection import train_test_split
import numpy as np
import random 

In [30]:
df = pd.read_csv("adult.data", header=None)
df.columns = ["age", "work-class", "fnlwgt", "education", "education-num", "maritial-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

## Preprocessing the data

In [31]:
df = df.replace(" ?", np.nan)

In [32]:
df.isna().sum().sum()

4262

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   work-class       30725 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   education-num    32561 non-null  int64 
 5   maritial-status  32561 non-null  object
 6   occupation       30718 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   31978 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [34]:
def impute(df):
    for column in df.columns:
        if df[column].dtype == "object":
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].mean())

In [35]:
impute(df)

In [36]:
df.isna().sum().sum()

0

## Training and testing the data

In [37]:
train_splits = []

In [38]:
for i in range(10):
    train, test = train_test_split(df, test_size=0.33, random_state=random.randint(0,100))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    train_splits.append((train,test))

In [39]:
results_list = []

In [40]:
for i,(train,test) in enumerate(train_splits):
    accuracies_dict = {}

    integer_columns = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
    ranges: dict[str, np.array] = dict()
    def convert_from_integer_to_range(df, columns, n_bins=100, _test=False):
        for column in columns:
            if not _test:
                ranges[column] = pd.cut(df[column], bins=n_bins, labels=False, retbins=True)[1]
            df[column] = pd.cut(df[column], bins=ranges[column], labels=False)
    convert_from_integer_to_range(train, integer_columns)
    naive = NaiveBayes(train, "income")
    convert_from_integer_to_range(test, integer_columns, _test=True)
    tp,tn,fp,fn = naive.accuracy(test)
    print("Accuracy for split {0} : {1}".format(i+1,(tp+tn)/(tp+tn+fp+fn)))
    accuracies_dict["index"] = f"Accuracy {i+1}"
    accuracies_dict["accuracy"]= (tp+tn)/(tp+tn+fp+fn)
    accuracies_dict['precision'] = (tp)/(tp+fp)
    accuracies_dict['recall'] = (tp)/(tp+fn)
    accuracies_dict['f1 score'] = (2*accuracies_dict['recall']*accuracies_dict['precision'])/(accuracies_dict['precision'] + accuracies_dict['recall'])
    results_list.append(accuracies_dict)


Accuracy for split 1 : 0.8355667225013959
Accuracy for split 2 : 0.8332402754513307
Accuracy for split 3 : 0.8306346547552578
Accuracy for split 4 : 0.8317513493392891
Accuracy for split 5 : 0.8342639121533594
Accuracy for split 6 : 0.8351944909733855
Accuracy for split 7 : 0.8258887027731249
Accuracy for split 8 : 0.8309138284012656
Accuracy for split 9 : 0.8337055648613437
Accuracy for split 10 : 0.837893169551461


In [41]:
results = pd.DataFrame.from_records(results_list)
results

Unnamed: 0,index,accuracy,precision,recall,f1 score
0,Accuracy 1,0.835567,0.791004,0.624235,0.697794
1,Accuracy 2,0.83324,0.793483,0.614846,0.692835
2,Accuracy 3,0.830635,0.782609,0.61232,0.68707
3,Accuracy 4,0.831751,0.847882,0.92388,0.884251
4,Accuracy 5,0.834264,0.849219,0.925603,0.885767
5,Accuracy 6,0.835194,0.848191,0.928456,0.886511
6,Accuracy 7,0.825889,0.841584,0.920038,0.879064
7,Accuracy 8,0.830914,0.845051,0.925916,0.883638
8,Accuracy 9,0.833706,0.790898,0.616514,0.692903
9,Accuracy 10,0.837893,0.852948,0.929007,0.889355


In [42]:
print(f"Mean for 10 splits is {results['accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['accuracy'].std()}")
print(f"Average precision for 10 splits is {results['precision'].mean()}")
print(f"Average recall for 10 splits is {results['recall'].mean()}")

Mean for 10 splits is 0.8329052670761212
Standard deviation for 10 splits is 0.003324770582014348
Average precision for 10 splits is 0.824287029594134
Average recall for 10 splits is 0.8020815574020963
