# Naive Bayes Algorithm

In [1]:
import pandas as pd
from NaiveBayes import NaiveBayes
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,f1_score,precision_score
import numpy as np
from sklearn.linear_model import LogisticRegression
import random 

In [2]:
df = pd.read_csv("adult.data", header=None)
df.columns = ["age", "work-class", "fnlwgt", "education", "education-num", "maritial-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

## Preprocessing the data

In [3]:
df = df.replace(" ?", np.nan)

In [4]:
df.isna().sum().sum()

4262

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   work-class       30725 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   education-num    32561 non-null  int64 
 5   maritial-status  32561 non-null  object
 6   occupation       30718 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   31978 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
def impute(df):
    for column in df.columns:
        if df[column].dtype == "object":
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].mean())

In [7]:
impute(df)

In [8]:
df.isna().sum().sum()

0

## Training and testing the data

In [9]:
train_splits = []

In [10]:
for i in range(10):
    train, test = train_test_split(df, test_size=0.33, random_state=random.randint(0,100))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    train_splits.append((train,test))

In [11]:
results_list = []

In [12]:
for i,(train,test) in enumerate(train_splits):
    accuracies_dict = {}

    integer_columns = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
    ranges: dict[str, np.array] = dict()
    def convert_from_integer_to_range(df, columns, n_bins=100, _test=False):
        for column in columns:
            if not _test:
                ranges[column] = pd.cut(df[column], bins=n_bins, labels=False, retbins=True)[1]
            df[column] = pd.cut(df[column], bins=ranges[column], labels=False)
    convert_from_integer_to_range(train, integer_columns)
    naive = NaiveBayes(train, "income")
    convert_from_integer_to_range(test, integer_columns, _test=True)
    tp,tn,fp,fn = naive.accuracy(test)
    print("Accuracy for split {0} : {1}".format(i+1,(tp+tn)/(tp+tn+fp+fn)))
    accuracies_dict["index"] = f"Accuracy {i+1}"
    accuracies_dict["accuracy"]= (tp+tn)/(tp+tn+fp+fn)
    accuracies_dict['precision'] = (tp)/(tp+fp)
    accuracies_dict['recall'] = (tp)/(tp+fn)
    accuracies_dict['f1 score'] = (2*accuracies_dict['recall']*accuracies_dict['precision'])/(accuracies_dict['precision'] + accuracies_dict['recall'])
    results_list.append(accuracies_dict)


Accuracy for split 1 : 0.8334263912153359
Accuracy for split 2 : 0.8296110180532291
Accuracy for split 3 : 0.8320305229852969
Accuracy for split 4 : 0.8334263912153359
Accuracy for split 5 : 0.8351944909733855
Accuracy for split 6 : 0.837893169551461
Accuracy for split 7 : 0.8334263912153359
Accuracy for split 8 : 0.8329611018053229
Accuracy for split 9 : 0.8315652335752838
Accuracy for split 10 : 0.8355667225013959


## KNN Model for the given data

In [13]:
categorical_columns = list(set(df.columns)-set(integer_columns))
categorical_columns.remove('income')
df = pd.get_dummies(df,columns=categorical_columns)
df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,maritial-status_ Divorced,maritial-status_ Married-AF-spouse,maritial-status_ Married-civ-spouse,...,work-class_ Self-emp-inc,work-class_ Self-emp-not-inc,work-class_ State-gov,work-class_ Without-pay,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,50,83311,13,0,0,13,<=50K,0,0,1,...,0,1,0,0,1,0,0,0,0,0
2,38,215646,9,0,0,40,<=50K,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,53,234721,7,0,0,40,<=50K,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,28,338409,13,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,1
32557,40,154374,9,0,0,40,>50K,0,0,1,...,0,0,0,0,1,0,0,0,0,0
32558,58,151910,9,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
32559,22,201490,9,0,0,20,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
train_splits = []

In [15]:
for i in range(10):
    train, test = train_test_split(df, test_size=0.33, random_state=random.randint(0,100))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    train_splits.append((train,test))

In [16]:
for i,(train,test) in enumerate(train_splits):
    X_train, y_train = train.drop(columns=["income"]).to_numpy(), train["income"].to_numpy()
    X_test, y_test = test.drop(columns=["income"]).to_numpy(), test["income"].to_numpy()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # print(X_test)
    knn = KNeighborsClassifier(n_neighbors=100)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print("Accuracy for split {0} : {1}".format(i+1,accuracy_score(y_test, y_pred)))
    results_list[i]['knn accuracy'] = accuracy_score(y_test, y_pred)
    results_list[i]['knn recall'] = recall_score(y_test,y_pred, average="binary", pos_label=' <=50K')
    results_list[i]['knn f1 score'] = f1_score(y_test,y_pred, average="binary", pos_label=' <=50K')
    results_list[i]['knn precision'] = precision_score(y_test,y_pred, average="binary", pos_label=' <=50K')

Accuracy for split 1 : 0.8273776288851665
Accuracy for split 2 : 0.8270984552391587
Accuracy for split 3 : 0.8324958123953099
Accuracy for split 4 : 0.8316582914572864
Accuracy for split 5 : 0.8255164712451145
Accuracy for split 6 : 0.8255164712451145
Accuracy for split 7 : 0.829890191699237
Accuracy for split 8 : 0.8278429182951796
Accuracy for split 9 : 0.8285873813512005
Accuracy for split 10 : 0.8321235808672994


## Logistic Regression for the Given Model

In [17]:
model = LogisticRegression()

In [18]:
for i,(train,test) in enumerate(train_splits):
    X_train, y_train = train.drop(columns=["income"]).to_numpy(), train["income"].to_numpy()
    X_test, y_test = test.drop(columns=["income"]).to_numpy(), test["income"].to_numpy()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print("Accuracy for split {0} : {1}".format(i+1,accuracy_score(y_test, y_pred)))
    results_list[i]['logreg accuracy'] = accuracy_score(y_test, y_pred)
    results_list[i]['logreg recall'] = recall_score(y_test,y_pred,average='binary',pos_label=' <=50K')
    results_list[i]['logreg f1 score'] = f1_score(y_test,y_pred,pos_label=' <=50K')
    results_list[i]['logreg precision'] = precision_score(y_test,y_pred,pos_label=' <=50K')


Accuracy for split 1 : 0.8465475525777033
Accuracy for split 2 : 0.8505490415038154
Accuracy for split 3 : 0.8535268937278988
Accuracy for split 4 : 0.8519449097338545
Accuracy for split 5 : 0.8441280476456355
Accuracy for split 6 : 0.8441280476456355
Accuracy for split 7 : 0.8493392890377816
Accuracy for split 8 : 0.8498045784477946
Accuracy for split 9 : 0.8510143309138284
Accuracy for split 10 : 0.8500837520938024


In [19]:
results = pd.DataFrame.from_records(results_list)
results

Unnamed: 0,index,accuracy,precision,recall,f1 score,knn accuracy,knn recall,knn f1 score,knn precision,logreg accuracy,logreg recall,logreg f1 score,logreg precision
0,Accuracy 1,0.833426,0.800391,0.615038,0.695578,0.827378,0.91719,0.88934,0.863131,0.846548,0.92568,0.901228,0.878035
1,Accuracy 2,0.829611,0.844173,0.923709,0.882152,0.827098,0.916646,0.889207,0.863363,0.850549,0.925744,0.903636,0.88256
2,Accuracy 3,0.832031,0.843546,0.931408,0.885302,0.832496,0.929664,0.893289,0.859653,0.853527,0.940276,0.906388,0.874856
3,Accuracy 4,0.833426,0.800391,0.615038,0.695578,0.831658,0.924098,0.893457,0.864782,0.851945,0.933967,0.905986,0.879633
4,Accuracy 5,0.835194,0.849188,0.928438,0.887046,0.825516,0.925368,0.888744,0.854909,0.844128,0.928704,0.899743,0.872533
5,Accuracy 6,0.837893,0.852948,0.929007,0.889355,0.825516,0.925368,0.888744,0.854909,0.844128,0.928704,0.899743,0.872533
6,Accuracy 7,0.833426,0.800391,0.615038,0.695578,0.82989,0.930039,0.892698,0.858239,0.849339,0.932118,0.90398,0.87749
7,Accuracy 8,0.832961,0.847853,0.926904,0.885618,0.827843,0.923954,0.890636,0.859637,0.849805,0.931436,0.90394,0.878021
8,Accuracy 9,0.831565,0.795687,0.623481,0.699136,0.828587,0.927515,0.890941,0.857143,0.851014,0.935897,0.904617,0.87536
9,Accuracy 10,0.835567,0.791004,0.624235,0.697794,0.832124,0.924416,0.893845,0.865231,0.850084,0.927702,0.90442,0.882278


## NAIVE BAYES RESULTS

In [20]:
print(f"Mean for 10 splits is {results['accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['accuracy'].std()}")
print(f"Average precision for 10 splits is {results['precision'].mean()}")
print(f"Average recall for 10 splits is {results['recall'].mean()}")

Mean for 10 splits is 0.8335101433091383
Standard deviation for 10 splits is 0.0023039840329878627
Average precision for 10 splits is 0.8225573136732635
Average recall for 10 splits is 0.7732295149387498


## KNN RESULTS

In [21]:
print(f"Mean for 10 splits is {results['knn accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['knn accuracy'].std()}")
print(f"Average precision for 10 splits is {results['knn precision'].mean()}")
print(f"Average recall for 10 splits is {results['knn recall'].mean()}")

Mean for 10 splits is 0.8288107202680066
Standard deviation for 10 splits is 0.0026141738913198088
Average precision for 10 splits is 0.860099639732819
Average recall for 10 splits is 0.924425787705777


## LOGISTIC REGRESSION RESULTS

In [22]:
print(f"Mean for 10 splits is {results['logreg accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['logreg accuracy'].std()}")
print(f"Average precision for 10 splits is {results['logreg precision'].mean()}")
print(f"Average recall for 10 splits is {results['knn recall'].mean()}")

Mean for 10 splits is 0.8491066443327749
Standard deviation for 10 splits is 0.003178607462644565
Average precision for 10 splits is 0.8773298657874193
Average recall for 10 splits is 0.924425787705777
