# Objective 


## To predict whether income exceeds 50k/yr based on census data

In [5]:
# pandas and numpy libraries
import numpy as np
import pandas as pd

In [6]:
# for preprocessing
import sklearn
from sklearn.impute import SimpleImputer


In [7]:
# To split the data set into train and test
from sklearn.model_selection import train_test_split

In [8]:
# To model the bayes Gaussian classifier
from sklearn.naive_bayes import GaussianNB

# To calculate the accuracy of the model
from sklearn.metrics import accuracy_score

In [12]:
# loading data
adult_df = pd.read_csv("adult.data",header = None, delimiter = ",", engine= 'python')

In [13]:
# checking columns in the data set
adult_df.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')

In [14]:
# Add headers to columns
adult_df.columns = ['age','workclass','fnlwgt','education','education_num','marital_status','occupation', 'relationship','race','sex','capital_gain',
                    'capital_loss','hours_per_week','native_country','income']

In [15]:
# Handling missing data- check whether there is any null value in the data set or not
adult_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [19]:
len(adult_df)

32561

The above result shows that there are no null values,but lets try and check whether there are any ? in it or not.So we will check whether any of the categorical variables have values of ? in them.

In [28]:
for value in ('workclass','education','marital_status','occupation','relationship','race','sex','native_country','income'):
    print(value, ":", sum(adult_df[value] == " ?"))


workclass : 1836
education : 0
marital_status : 0
occupation : 1843
relationship : 0
race : 0
sex : 0
native_country : 583
income : 0


The output above shows that there are 1836 missing values in workclass, 1843 in occupation and 583 in native_country.

## Data preprocessing

In [29]:
# we wil make a duplicate copy first. make a Deep copy of adult_df
adult_df_rev = adult_df.copy(deep=True)


We will firs get some summary statistics of our data frame.For this we will use describe()

In [30]:
adult_df_rev.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [31]:
adult_df_rev.describe(include = 'all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


### Data imputation

We replace the missing values in the categorical values i.e ? with the top row values from the describe method.For example in workplace we replace ? with 'private' value.

In [32]:
for value in ['workclass','education','marital_status','occupation','relationship','race','sex','native_country','income']:
    replacevalue = adult_df_rev.describe(include ='all')[value][2]
    adult_df_rev[value][adult_df_rev[value]=='?'] = replacevalue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adult_df_rev[value][adult_df_rev[value]=='?'] = replacevalue


for naive bayes we need to convert all the data values in one format.We are going to encode all the values between 0 and 1.Here it will be 0 and 1.We shall use scikit learns LabelEncoder.

In [43]:
from sklearn import preprocessing 

len = preprocessing.LabelEncoder()
workclass_cat = len.fit_transform(adult_df.workclass)
education_cat = len.fit_transform(adult_df.education)
marital_cat = len.fit_transform(adult_df.marital_status)
occupation_cat = len.fit_transform(adult_df.occupation)
relationship_cat = len.fit_transform(adult_df.relationship)
race_cat = len.fit_transform(adult_df.race)
sex_cat = len.fit_transform(adult_df.sex)
native_country_cat = len.fit_transform(adult_df.native_country)


In [44]:
adult_df_rev['workclass_cat']=workclass_cat
adult_df_rev['education_cat']=education_cat
adult_df_rev['marital_cat']= marital_cat
adult_df_rev['occupation_cat']=occupation_cat
adult_df_rev['relationship_cat']=relationship_cat
adult_df_rev['race_cat']=race_cat
adult_df_rev['sex_cat']=sex_cat
adult_df_rev['native_country_cat']=native_country_cat


In [40]:
adult_df_rev.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,...,hours_per_week,native_country,income,workclass_cat,education_cat,occupation_cat,relationship_cat,race_cat,sex_cat,native_country_cat
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,40,United-States,<=50K,7,9,1,1,4,1,39
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,13,United-States,<=50K,6,9,4,0,4,1,39
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,40,United-States,<=50K,4,11,6,1,4,1,39
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,40,United-States,<=50K,4,1,6,0,2,1,39
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,40,Cuba,<=50K,4,9,10,5,2,0,5


In [45]:
# Drop the old categories
dummy_fields = ['workclass','education','marital_status','occupation','relationship','race','sex','native_country']
adult_df_rev = adult_df_rev.drop(dummy_fields, axis =1)

Re index the columns properly using column names as parameter and axis = 1 for reindexing the columns.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [47]:
adult_df_rev =adult_df_rev.reindex(['age','workclass_cat','fnlwgt','education_cat','education_num','marital_cat','occupation_cat',
                                        'relationship_cat','race_cat','sex_cat','capital_gain','capital_loss','hours_per_week','native_country_cat','income'],axis =1)
adult_df_rev.head()

Unnamed: 0,age,workclass_cat,fnlwgt,education_cat,education_num,marital_cat,occupation_cat,relationship_cat,race_cat,sex_cat,capital_gain,capital_loss,hours_per_week,native_country_cat,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K


## Data slicing

In [54]:
# Arranging data into dependent and independent variables
X = adult_df_rev.values[:,:14] ## Features
Y = adult_df_rev.values[:,14] ## Target

In [55]:
# splitting the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 10)

implement Gaussian Naive Bayes

In [56]:
# Build the Gaussian classifier and use fit method to train it using the training data
clf = GaussianNB()
clf.fit(X_train, Y_train)

In [57]:
# we can then use predict to make predictions on the test features
Y_pred = clf.predict(X_test)

Checking the accuracy of the Gaussian model

In [58]:
accuracy_score(Y_test,Y_pred, normalize = True)

0.7925069096120381