As data is a mix of categorical and continuous attributes, 2 models are used: GaussianNB model for continuous and CategoricalNB for categorical. The 2 classifiers are then combined.

In [39]:
import pandas as pd
import numpy as np
import settings
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
datapath = "../data/"

fullTrainData = pd.read_excel(settings.labelledDatapath)

print(fullTrainData.columns)
print("Class balance:")
fullTrainData["class"].value_counts()

  warn("Workbook contains no default style, apply openpyxl's default")


Index(['RowID', 'age', 'workclass', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')
Class balance:


<=50K    24720
>50K      7841
Name: class, dtype: int64

In [3]:
#removing redundant columns
fullTrainData = fullTrainData.drop(labels=settings.redundantFeatures, axis=1)
fullTrainData.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#making labels numerical - 1 for >50K, 0 for <=50K or unknown
fullTrainData["class"] = fullTrainData["class"].map(lambda x: 1 if(x == ">50K") else 0)

#### Up until now, the start process was the same as for Decision Trees.

In [11]:
#encoding categorical features
fullTrainData[settings.categoricalFeatures] = fullTrainData[settings.categoricalFeatures].apply(LabelEncoder().fit_transform)
fullTrainData.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,6,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,13,2,3,0,4,1,0,0,13,38,0
2,38,3,9,0,5,1,4,1,0,0,40,38,0
3,53,3,7,2,5,0,2,1,0,0,40,38,0
4,28,3,13,2,9,5,2,0,0,0,40,4,0


In [12]:
#splitting features and labels
attributes = fullTrainData.drop(labels=["class"], axis=1)
labels = fullTrainData["class"]

#getting test and train set
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(attributes, labels, test_size=0.2, random_state=5) #random state is like a seed to allow repeatable results


In [13]:
categoricalModel = CategoricalNB()
gaussianModel = GaussianNB()

categoricalModel.fit(trainFeatures[settings.categoricalFeatures], trainLabels)
gaussianModel.fit(trainFeatures[settings.numericalFeatures], trainLabels)

#### Combining results
What we want:
$$P(C|\text{all features}) = \frac{P(\text{all features}|C)P(C)}{P(\text{all features})}$$

How we get it:

Because "Naive" (assumes independence)
$$P(\text{all features}) = P(\text{cat. features}).P(\text{num. features})$$
and
$$P(\text{all features}|C) = P(\text{cat. features}|C).P(\text{num. features}|C)$$

$$P(\text{cat. features}|C) = \frac{P(C|\text{cat. features})P(\text{cat. features})}{P(C)}$$
$$P(\text{num. features}|C) = \frac{P(C|\text{num. features})P(\text{num. features})}{P(C)}$$

so:

$$P(C|\text{all features}) = \frac{\frac{P(C|\text{cat. features})P(\text{cat. features})}{P(C)}.\frac{P(C|\text{num. features})P(\text{num. features})}{P(C)}.P(C)}{P(\text{all features})}$$
$$= \frac{\frac{P(C|\text{cat. features})P(\text{cat. features})P(C|\text{num. features})P(\text{num. features})}{P(C)}}{P(\text{all features})}$$
$$= \frac{P(C|\text{cat. features})P(C|\text{num. features})}{P(C)}$$


In [29]:
#combine using log probabilities to use + and - and avoid precision errors.
#i.e. overall log prob. = catLogProb + numLogProb - log of prior
catLogProb = categoricalModel.predict_log_proba(testFeatures[settings.categoricalFeatures])
print(catLogProb)
numLogProb = gaussianModel.predict_log_proba(testFeatures[settings.numericalFeatures])
print(numLogProb)
print(numLogProb.shape)

[[-1.28283748e+00 -3.24691198e-01]
 [-8.15150102e-03 -4.81362618e+00]
 [-4.22544281e-01 -1.06530489e+00]
 ...
 [-1.96077382e+00 -1.51694739e-01]
 [-1.69781049e-01 -1.85693536e+00]
 [-1.28061493e-03 -6.66105514e+00]]
[[-8.87031874e+00 -1.40507670e-04]
 [-5.48410366e-03 -5.20864241e+00]
 [-1.88011496e-03 -6.27736227e+00]
 ...
 [-8.35946177e-03 -4.78853805e+00]
 [-2.29084345e-02 -3.78768247e+00]
 [-9.86239395e-03 -4.62395349e+00]]
(6513, 2)


In [38]:
print(categoricalModel.class_log_prior_)
overall = (catLogProb + numLogProb) - categoricalModel.class_log_prior_
print(overall)
print(overall.shape)

[-0.27653437 -1.42050309]
[[-9.87662185  1.09567138]
 [ 0.26289876 -8.60176551]
 [-0.14789003 -5.92216407]
 ...
 [-1.69259892 -3.51972971]
 [ 0.08384488 -4.22411475]
 [ 0.26539136 -9.86450555]]
(6513, 2)


In [37]:
(-6.66105514e+00 + -4.62395349e+00) - -1.42050309

-9.86450554

In [45]:
print(np.exp(overall))

[[5.13614922e-05 2.99119023e+00]
 [1.30069503e+00 1.83781041e-04]
 [8.62525962e-01 2.67939550e-03]
 ...
 [1.84040595e-01 2.96074367e-02]
 [1.08746020e+00 1.46382876e-02]
 [1.30394118e+00 5.19875886e-05]]


## Doing without logs

In [46]:
catProb = categoricalModel.predict_proba(testFeatures[settings.categoricalFeatures])
print(catProb)
numProb = gaussianModel.predict_proba(testFeatures[settings.numericalFeatures])
print(numProb)
print(numProb.shape)

[[0.27724949 0.72275051]
 [0.99188163 0.00811837]
 [0.65537723 0.34462277]
 ...
 [0.14074946 0.85925054]
 [0.84384956 0.15615044]
 [0.9987202  0.0012798 ]]
[[1.40497799e-04 9.99859502e-01]
 [9.94530907e-01 5.46909341e-03]
 [9.98121651e-01 1.87834865e-03]
 ...
 [9.91675381e-01 8.32461863e-03]
 [9.77351971e-01 2.26480286e-02]
 [9.90186080e-01 9.81392003e-03]]
(6513, 2)


In [50]:
print(gaussianModel.class_prior_)
overall = (catProb * numProb) - gaussianModel.class_prior_
print(overall)
print(overall.shape)

[0.75840756 0.24159244]
[[-0.7583686   0.48105652]
 [ 0.22804938 -0.24154804]
 [-0.10426135 -0.24094512]
 ...
 [-0.61882978 -0.23443951]
 [ 0.06633047 -0.23805595]
 [ 0.23051129 -0.24157988]]
(6513, 2)
