# Adult Data Set
## Source: http://archive.ics.uci.edu/ml/datasets/Adult
## Written by Abiola Obembe
### Date : 14-06-2020
### Objective: Predict whether income exceeds 50K per year based on census data. Also known as the "Census Income" dataset.

In [58]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
print("Dependencies installed successfully!")

Dependencies installed successfully!


In [59]:
# Load the census records from the Excel workbook and preview the first five rows
DATASET_FILE = 'Dataset.xlsx'
df = pd.read_excel(DATASET_FILE)
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


## Data Cleaning and Pre-processing

In [60]:
# Report the dimensions of the freshly loaded dataframe
n_rows, n_cols = df.shape
print(f'The dataframe has {n_rows} rows and {n_cols} columns/attributes')

The dataframe has 48843 rows and 15 columns/attributes


In [61]:
# Collect the column labels into a plain Python list and display it
Columns = list(df.columns)
Columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [62]:
# Quick statistical summary (count, mean, std, quartiles, min/max) of the continuous variables
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [63]:
# Class balance of the target: number of records in each income bracket
income_counts = df['income'].value_counts()
income_counts

 <=50K    37155
 >50K     11687
Name: income, dtype: int64

In [64]:
# Count the missing cells per column, then add them up for a dataframe-wide total
missing_per_column = df.isnull().sum()
missing_values = missing_per_column.sum()
print("The number of missing values on dataframe is:", missing_values)

The number of missing values on dataframe is: 15


In [65]:
# Break the missing-value count down by column to see where the gaps are
missing_by_column = df.isnull().sum()
missing_by_column

age               1
workclass         1
fnlwgt            1
education         1
education-num     1
marital-status    1
occupation        1
relationship      1
race              1
sex               1
capital-gain      1
capital-loss      1
hours-per-week    1
native-country    1
income            1
dtype: int64

In [66]:
# Inspect the dtype pandas inferred for each column
column_dtypes = df.dtypes
column_dtypes

age               float64
workclass          object
fnlwgt            float64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
income             object
dtype: object

In [67]:
# Drop every row that contains at least one missing value
# (this is broader than "income is missing": any NaN in the row removes it).
df = df.dropna(axis=0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [68]:
# Check again for missing values by column
print("The number of missing values is ", df.isnull().sum().sum())

The number of missing values is  0


In [69]:
# For convenience we drop most of the categorical features.
# 'occupation' and 'race' are deliberately kept: they are the two columns
# that get one-hot encoded later in the notebook.
unused_categoricals = ['workclass', 'education', 'marital-status',
                       'relationship', 'sex', 'native-country']
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op instead of
# a KeyError for columns that are already gone.
df = df.drop(columns=unused_categoricals, errors='ignore')
df.head()

Unnamed: 0,age,fnlwgt,education-num,occupation,race,capital-gain,capital-loss,hours-per-week,income
0,39.0,77516.0,13.0,Adm-clerical,White,2174.0,0.0,40.0,<=50K
1,50.0,83311.0,13.0,Exec-managerial,White,0.0,0.0,13.0,<=50K
2,38.0,215646.0,9.0,Handlers-cleaners,White,0.0,0.0,40.0,<=50K
3,53.0,234721.0,7.0,Handlers-cleaners,Black,0.0,0.0,40.0,<=50K
4,28.0,338409.0,13.0,Prof-specialty,Black,0.0,0.0,40.0,<=50K


In [70]:
# Separate the predictors (every column except the last) from the target (the last column)
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X = features.values
y = target.values

In [71]:
# Peek at the first five feature rows
print(X[:5])

[[39.0 77516.0 13.0 ' Adm-clerical' ' White' 2174.0 0.0 40.0]
 [50.0 83311.0 13.0 ' Exec-managerial' ' White' 0.0 0.0 13.0]
 [38.0 215646.0 9.0 ' Handlers-cleaners' ' White' 0.0 0.0 40.0]
 [53.0 234721.0 7.0 ' Handlers-cleaners' ' Black' 0.0 0.0 40.0]
 [28.0 338409.0 13.0 ' Prof-specialty' ' Black' 0.0 0.0 40.0]]


In [72]:
# Peek at the first five target labels
print(y[:5])

[' <=50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K']


In [73]:
# Encoding categorical data
# Encoding the Independent Variable
# One-hot encode the two remaining categorical predictors; all other columns
# are passed through unchanged (remainder='passthrough').
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['occupation', 'race']
feature_frame = df.iloc[:, :-1]  # all predictors; the target is the last column
# Look the positions up by name instead of hard-coding [3, 4], so the encoder
# keeps targeting the right columns if the set of predictors changes.
categorical_idx = [feature_frame.columns.get_loc(name) for name in categorical_features]

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_idx)],
    remainder='passthrough',
)
# Always encode from the dataframe rather than from the current X:
# `X = ct.fit_transform(X)` is not re-runnable, because a second execution
# would one-hot encode columns 3 and 4 of the already-encoded matrix and
# silently change the feature set.
X = ct.fit_transform(feature_frame.values)
print(X)

[[0.0 1.0 0.0 ... 2174.0 0.0 40.0]
 [0.0 0.0 0.0 ... 0.0 0.0 13.0]
 [0.0 0.0 0.0 ... 0.0 0.0 40.0]
 ...
 [0.0 0.0 0.0 ... 0.0 0.0 50.0]
 [0.0 1.0 0.0 ... 5455.0 0.0 40.0]
 [0.0 0.0 0.0 ... 0.0 0.0 60.0]]


In [74]:
# Encoding the Dependent Variable
# Turn the income labels into integer class ids
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_target = le.fit_transform(y)
y = encoded_target
print(y)


[0 0 0 ... 0 0 1]


In [75]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print(f"Train set shape: {X_train.shape} {y_train.shape}")
print(f"Test set shape: {X_test.shape} {y_test.shape}")


Train set shape: (39073, 26) (39073,)
Test set shape: (9769, 26) (9769,)


In [76]:
# Feature Scaling
# The scaler is fitted on the training split only; the test split is merely
# transformed with the statistics learned from the training data.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [46]:
# Applying Kernel PCA
# NOTE: this step is intentionally left disabled. The recorded output of this
# cell is "MemoryError: Unable to allocate 11.4 GiB for an array with shape
# (39073, 39073) and data type float64" -- presumably the kernel matrix, which
# has one row and one column per training sample (39073 of them).
#from sklearn.decomposition import KernelPCA
#kpca = KernelPCA(n_components = 2, kernel = 'rbf')
#X_train = kpca.fit_transform(X_train)
#X_test = kpca.transform(X_test)


MemoryError: Unable to allocate 11.4 GiB for an array with shape (39073, 39073) and data type float64

## Model Training 

In [77]:
# Logistic regression baseline, fitted on the standardized training split
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
# Score the current `classifier` on the held-out test split:
# print the confusion matrix, then display the accuracy as the cell result
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[7051  421]
 [1310  987]]


0.8228068379568021

In [81]:
# K-nearest neighbours: 5 neighbours, Minkowski metric with p = 2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[6808  664]
 [1217 1080]]


0.8074521445388474

In [82]:
# Support vector machine with a linear kernel
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)


[[7308  164]
 [1625  672]]


0.8168696898351929

In [52]:
# Support vector machine with an RBF kernel
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[7223  249]
 [1453  844]]


0.8257754120176067

In [83]:
# Gaussian Naive Bayes with default settings
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = GaussianNB().fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[5188 2284]
 [ 503 1794]]


0.7147097962944007

In [55]:
# Single decision tree using the entropy split criterion
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[6372 1100]
 [1051 1246]]


0.7798136963865289

In [84]:
# Random forest of 10 trees using the entropy split criterion
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0
).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[6908  564]
 [1207 1090]]


0.8187122530453476

In [85]:
# Gradient boosting with scikit-learn's GradientBoostingClassifier
# (note: this is sklearn's implementation, not the XGBoost library)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Evaluate on the test split: confusion matrix first, accuracy as the cell result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[7117  355]
 [1171 1126]]


0.8437915856280069