In [1]:
# Determine whether a person makes over 50k a year
# Which factors are important
# Which algorithm is best for this dataset

In [2]:
# Load modules

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load data

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Name Columns

df.columns=["age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","income"]
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Print summary of Dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Print number of missing values in each column in descending order

missing=df.isnull().sum().sort_values(ascending=False)
missing

income            0
native_country    0
hours_per_week    0
capital_loss      0
capital_gain      0
sex               0
race              0
relationship      0
occupation        0
marital_status    0
education_num     0
education         0
fnlwgt            0
workclass         0
age               0
dtype: int64

In [7]:
# Print unique values of each non-numerical column

cols=["workclass","education","marital_status","occupation","relationship","race","sex","native_country","income"]

for c in cols:
    print(len(df[c].unique()))
    print(df[c].unique())

9
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
16
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
7
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
15
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
6
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
5
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
2
[' Male' ' Female']
42
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' 

In [8]:
# Factorize values of the non-numerical columns

for c in cols:
    df[c],class_names = pd.factorize(df[c])

In [9]:
# Check first 5 data after factorization

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,0,77516,0,13,0,0,0,0,0,2174,0,40,0,0
1,50,1,83311,0,13,1,1,1,0,0,0,0,13,0,0
2,38,2,215646,1,9,2,2,0,0,0,0,0,40,0,0
3,53,2,234721,2,7,1,2,1,1,0,0,0,40,0,0
4,28,2,338409,0,13,1,3,2,1,1,0,0,40,1,0


In [10]:
# Assign predictor and target values

X=df.drop("income",axis=1).values
y=df["income"].values

In [11]:
# Split the dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [12]:
# Logistic Regression

In [13]:
# Fit the first classifier model based on X-train and y_train

from sklearn.linear_model import LogisticRegression

classifier1 = LogisticRegression()
classifier1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
# Predict y_pred1 values for X_test using the built classifier model

y_pred1 = classifier1.predict(X_test)
y_pred1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
# Print confusion matrix for y_test and y_pred1

from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, y_pred1)
cm1

array([[5969,  190],
       [1467,  515]], dtype=int64)

In [16]:
# Print accuracy

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred1))

0.796462351062523


In [17]:
# Decision Tree Classifier

In [18]:
# Fit the second classifier model 

from sklearn.tree import DecisionTreeClassifier

classifier2 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier2.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [19]:
# Predict y_pred2 values for X_test using the built classifier model

y_pred2 = classifier2.predict(X_test)

In [20]:
# Print confusion matrix for y_test and y_pred2

cm2 = confusion_matrix(y_test, y_pred2)
cm2

array([[5387,  772],
       [ 751, 1231]], dtype=int64)

In [21]:
# Print accuracy

print(metrics.accuracy_score(y_test, y_pred2))

0.8129222454243951


In [22]:
# Random Forest Classifier

In [23]:
# Fit the third classifier model

from sklearn.ensemble import RandomForestClassifier

classifier3 = RandomForestClassifier(random_state=1)
classifier3.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [24]:
# Predict y_pred3 values for X_test using the built classifier model

y_pred3 = classifier3.predict(X_test)

In [25]:
# Print confusion matrix for y_test and y_pred3

cm3 = confusion_matrix(y_test, y_pred3)
cm3

array([[5738,  421],
       [ 817, 1165]], dtype=int64)

In [26]:
# Print accuracy

print(metrics.accuracy_score(y_test, y_pred3))

0.8479302297015109


In [27]:
# XG Boost Classifier

In [28]:
# Fit the fourth classifier model 

from xgboost.sklearn import XGBClassifier

params = {
    'objective': 'binary:logistic',
    'max_depth': 2,
    'learning_rate': 1.0,
    'silent': 1,
    'n_estimators': 5
}
classifier4 = XGBClassifier(**params).fit(X_train, y_train)

In [29]:
# Predict y_pred4 values for X_test using the built classifier model

y_pred4 = classifier4.predict(X_test)

In [30]:
# Print confusion matrix for y_test and y_pred4

cm4 = confusion_matrix(y_test, y_pred4)
cm4

array([[5824,  335],
       [ 936, 1046]], dtype=int64)

In [31]:
# Print accuracy

print(metrics.accuracy_score(y_test, y_pred4))

0.8438766736273186


In [32]:
#Based on accuracy scores, Random Forest Classifier has given highest accuracy. So this model is the better among the four.