# Naive Bayes Implementation

## Data Preprocessing

In [54]:
import numpy as np 
import pandas as pd

Train data import

In [55]:
full_dataset_train = pd.read_csv('adult.csv', header=0)
full_dataset_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Test data import

In [56]:
full_dataset_test = pd.read_csv('adult_test.csv', header=0)
full_dataset_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


Value counts for train and test data

In [57]:
print(full_dataset_train.workclass.value_counts())
print('-'*20)
print(full_dataset_test.workclass.value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
--------------------
 Private             11210
 Self-emp-not-inc     1321
 Local-gov            1043
 ?                     963
 State-gov             683
 Self-emp-inc          579
 Federal-gov           472
 Without-pay             7
 Never-worked            3
Name: workclass, dtype: int64


Cleaning the data to remove spaces and '?' values

In [58]:
def clean_data(df):
    filtered_df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    for index, row in filtered_df.iterrows():
        # check if '?' is in any of the columns
        #print(row.values)
        if '?' in row.values:
            # drop the row
            filtered_df.drop(index, inplace=True)
    return filtered_df

filtered_train, filtered_test = clean_data(full_dataset_train), clean_data(full_dataset_test)
        

Dropping the 'education-num' column and printing value counts of the output

In [59]:
filtered_train = filtered_train.drop(['education-num'], axis=1)
filtered_test = filtered_test.drop(['education-num'], axis=1)
#filtered_df = filtered_df.drop(['fnlwgt'], axis=1)
#filtered_df = filtered_df.drop(['capital-gain'], axis=1)
#filtered_df = filtered_df.drop(['capital-loss'], axis=1)
def printBA(df1, df2):
    print("Before Filtering the Data: ")
    print(df1.shape)
    print(df1.y.value_counts())
    print("-"*10)
    print("After Filtering the Data: ")
    print(df2.shape)
    print(df2.y.value_counts())

print("Train Data")
printBA(full_dataset_train,filtered_train)
print('='*20)
print()
print("Test Data")
printBA(full_dataset_test,filtered_test)


Train Data
Before Filtering the Data: 
(32561, 15)
 <=50K    24720
 >50K      7841
Name: y, dtype: int64
----------
After Filtering the Data: 
(30162, 14)
<=50K    22654
>50K      7508
Name: y, dtype: int64

Test Data
Before Filtering the Data: 
(16281, 15)
 <=50K    12435
 >50K      3846
Name: y, dtype: int64
----------
After Filtering the Data: 
(15060, 14)
<=50K    11360
>50K      3700
Name: y, dtype: int64


Label Encoding(catagorecial data)

In [60]:
from sklearn.preprocessing import LabelEncoder
for col in filtered_train.columns:
    if filtered_train[col].dtypes == 'object':
        encoder = LabelEncoder()
        filtered_train[col] = encoder.fit_transform(filtered_train[col])
        filtered_test[col] = encoder.transform(filtered_test[col])

In [61]:
filtered_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,39,5,77516,9,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,2,9,5,2,0,0,0,40,4,0


In [62]:
filtered_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,25,2,226802,1,4,6,3,2,1,0,0,40,38,0
1,38,2,89814,11,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,2,10,0,4,1,0,0,40,38,1
3,44,2,160323,15,2,6,0,2,1,7688,0,40,38,1
5,34,2,198693,0,4,7,1,4,1,0,0,30,38,0


## Model Creation

In [63]:
class NB():
    def __init__(self,train_df,categorical,continous):
        self.train_df = train_df
        self.out_col = list(self.train_df.columns)[-1:][0] #name of the output column
        self.categorical = categorical
        self.continous = continous
        self.cat_dict = dict()
        self.cont_dict = dict()

    def fit(self):
        #Calculating Conditional Probabilities P(X=x|Y=y) for all the Categorical Features
        for feature in self.categorical:
            try:
                n = len(self.train_df[feature].unique())
                self.cat_dict[feature] = dict()
                self.cat_dict[feature][0] = self.cat_dict[feature][1] = [1]*n
                for i in range(2):
                    for j in range(n):
                        self.cat_dict[feature][i][j] = self.calc_categorical(feature,i,j,self.out_col)
                #print("Training cat", feature," . size ",n)
            except:
                continue
        #print("done calc cat values")
        #Calculating the Mean and Standard Deviation for all the Continous Features
        for feature in self.continous:
            try:
                check = len(self.train_df[feature])
                self.cont_dict[feature] = dict()
                self.cont_dict[feature][0] = self.cont_dict[feature][1] = dict()
                for i in range(2):
                    df = self.train_df[self.train_df[self.out_col]==i]
                    self.cont_dict[feature][i] = {"mean":df[feature].mean(),"std":df[feature].std()}
                #print(self.cont_dict[feature])
            except:
                continue
        #print("done calc mean std values")

    def calc_gaussian(self, feat_name, feat_val, out_class):
        mean, std = self.cont_dict[feat_name][out_class]["mean"], self.cont_dict[feat_name][out_class]["std"]
        p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
        return p_x_given_y

    def calc_categorical(self, feat_name, out_class, feat_val,Y):
        df = self.train_df[self.train_df[Y]==out_class]
        p_x_given_y = (len(df[df[feat_name]==feat_val])+1) / (len(df)+len(self.train_df[feat_name].unique()))
        return p_x_given_y
    
    def calc_ProbY(self, Y):
        #Calculating P(Y=y) for all possible y values
        all_classes = sorted(list(self.train_df[Y].unique()))
        probY = []
        for i in all_classes:
            probY.append(len(self.train_df[self.train_df[Y]==i])/len(self.train_df))
        return probY
    
    def predict(self, X_test):
        prior = self.calc_ProbY(self.out_col)
        all_features = list(self.train_df.columns)[:-1]
        Y_predicted = list()
        # loop over every test data sample to calculate probabilities
        print("Predicting outputs for ",len(X_test), " samples...")
        for currTest in X_test:
            out_classes = sorted(list(self.train_df[self.out_col].unique()))
            likelihood = [1]*len(out_classes)
            for out_class in range(len(out_classes)):
                for i in range(len(all_features)):
                    feature = all_features[i]
                    if all_features[i] in self.continous:
                        likelihood[out_class] *= self.calc_gaussian(feature, currTest[i], out_classes[out_class])
                    else:
                        likelihood[out_class] *= self.cat_dict[feature][out_class][currTest[i]]

            # calculating probability
            post_prob = [1]*len(out_classes)
            for j in range(len(out_classes)):
                post_prob[j] = likelihood[j] * prior[j]

            Y_predicted.append(np.argmax(post_prob))
        print("-"*20)
        print("Done")
        return np.array(Y_predicted)

Declaring categorical and continous features

In [64]:
categorical = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
continous = ['age','fnlwgt','capital-gain','capital-loss','hours-per-week']

## Model Execution and Performance Metrics

Oversampling training data

In [65]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=22)
X = filtered_train.iloc[:,:-1]
Y = filtered_train['y']
ros.fit(X, Y)
X_resampled, Y_resampled = ros.fit_resample(X, Y)
resampled_train = pd.concat([X_resampled, Y_resampled], axis=1, join='inner')
print(resampled_train.y.value_counts())

0    22654
1    22654
Name: y, dtype: int64


Calculating Y_predicted

In [66]:
X_test = filtered_test.iloc[:,:-1].values
Y_test = filtered_test.iloc[:,-1].values
model = NB(resampled_train,categorical,continous)
model.fit()
Y_predicted = model.predict(X_test)

Predicting outputs for  15060  samples...
--------------------
Done


In [67]:
def compute_acc(y_test,y_pred):
    total = len(y_test)
    correct = 0
    for i in range(total):
        if (y_pred[i]==y_test[i]):
            correct += 1
    return "The accuracy is "+str(correct/total*100)+"%"

print(compute_acc(Y_test, Y_predicted))

The accuracy is 78.71181938911023%


In [68]:
from sklearn.metrics import classification_report, confusion_matrix
cm = confusion_matrix(Y_test, Y_predicted)
cm_df = pd.DataFrame(cm.T, index=["<=50K(0)",">50K(1)"], columns=["<=50K(0)",">50K(1)"])
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
print()
print("-"*60)
print()
print(classification_report(Y_test, Y_predicted))

True       <=50K(0)  >50K(1)
Predicted                   
<=50K(0)      10713     2559
>50K(1)         647     1141

------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.81      0.94      0.87     11360
           1       0.64      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.72      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060

