# Cencus 1994 USA
### Data Explore

In [1]:
# importing necessary modules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os

In [2]:
%matplotlib notebook

In [3]:
# column names and types for reading
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
dtype = { 'age':int, 'workclass':object, 'fnlwgt':int, 'education':object, 'education-num':int, 'marital-status':object, 'occupation':object,
        'relationship':object, 'race':object, 'gender':object, 'capital-gain':int, 'capital-loss':int, 'hours-per-week':int, 
        'native-country':object, 'income':str}

In [4]:
# load datasets
file1 = "adult.data"
file2 = "adult.test"
folder = r'/home/huzyk/Documents/Python/Representing Data/Cencus_1994_USA/data'
path1 = os.path.join(folder, file1)
path2 = os.path.join(folder, file2)

df_train = pd.read_csv(path1, names=names, dtype=dtype)
df_test = pd.read_csv(path2, names=names, dtype=dtype)

df_test = df_test.replace({'income': (r'>.*', r'<.*')}, {'income': ('>50K', '<=50K')}, regex=True)

In [5]:
# exploring the shape of datasets
print("Train set shape: {}".format(df_train.shape))
print("Test set shape: {}".format(df_test.shape))

Train set shape: (32561, 15)
Test set shape: (16281, 15)


In [6]:
dataset = df_train.merge(df_test, how='outer')
print("All dataset shape: {}".format(dataset.shape))
dataset.head(10)

All dataset shape: (48819, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


Dataset consist of:<br>
- 7 - polinomials attributes<br>
- 1 - binomical attributes<br>
- 6 - continuous attributes

In [7]:
# Continuous attributes
df_cont = pd.DataFrame()
cont_attr = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
df_cont["Mean"] = dataset[cont_attr].mean(axis=0)
df_cont["Median"] = dataset[cont_attr].median(axis=0)
df_cont["Std Dev"] = dataset[cont_attr].std(axis=0)
df_cont["Min Val"] = dataset[cont_attr].min(axis=0)
df_cont["Max Val"] = dataset[cont_attr].max(axis=0)
df_cont

Unnamed: 0,Mean,Median,Std Dev,Min Val,Max Val
age,38.64903,37.0,13.710001,17,90
fnlwgt,189665.226572,178142.0,105615.005993,12285,1490400
education-num,10.078207,10.0,2.570763,1,16
capital-gain,1079.576005,0.0,7453.737498,0,99999
capital-loss,87.543538,0.0,403.095,0,4356
hours-per-week,40.423216,40.0,12.393218,1,99


In [8]:
fig, axes = plt.subplots(3, 2)
labels=['>50K', '<=50K']
dataset = dataset.rename(columns={'education-num':'education_num', 'capital-gain':'capital_gain', 'capital-loss':'capital_loss',
                                 'hours-per-week':'hours_per_week'})
axes[0, 0].boxplot([dataset[dataset["income"] == " >50K"].age, dataset[dataset["income"] != " >50K"].age], labels=labels)
axes[0, 0].set_title('Age')
axes[0, 1].boxplot([dataset[dataset["income"] == " >50K"].fnlwgt, dataset[dataset["income"] != " >50K"].fnlwgt], labels=labels)
axes[0, 1].set_title('Final Weight')
axes[1, 0].boxplot([dataset[dataset["income"] == " >50K"].education_num, dataset[dataset["income"] != " >50K"].education_num], labels=labels)
axes[1, 0].set_title('Education Number')
axes[1, 1].boxplot([dataset[dataset["income"] == " >50K"].capital_gain, dataset[dataset["income"] != " >50K"].capital_gain], labels=labels)
axes[1, 1].set_title('Capital Gain')
axes[2, 0].boxplot([dataset[dataset["income"] == " >50K"].capital_loss, dataset[dataset["income"] != " >50K"].capital_loss], labels=labels)
axes[2, 0].set_title('Capital Loss')
axes[2, 1].boxplot([dataset[dataset["income"] == " >50K"].hours_per_week, dataset[dataset["income"] != " >50K"].hours_per_week], labels=labels)
axes[2, 1].set_title('Hours per Week')
dataset = dataset.rename(columns={'education_num':'education-num', 'capital_gain':'capital-gain', 'capital_loss':'capital-loss',
                                 'hours_per_week':'hours-per-week'})


<IPython.core.display.Javascript object>

In [9]:
# Polinomails attributes
arr_index = np.array(['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'])
arr_val = []
arr_subindex = []

for el in arr_index:
    ser = dataset[el].value_counts()
    arr_val.extend(list(ser))
    arr_subindex.append(list(ser.index))

i = 0
arr_index_subindex = []
for index in arr_index:
    for subindex in arr_subindex[i]:
        arr_index_subindex.append([index, subindex])
    i += 1
    
df_index = pd.DataFrame(arr_index_subindex, columns=['Attribute', 'Value'])
df_pol = pd.DataFrame(data = arr_val, index=pd.MultiIndex.from_frame(df_index), columns=['Count'])

np.set_printoptions(suppress=True, precision=1)
df_pol['Percentage'] = np.around((np.array(arr_val)/dataset.shape[0])*100, decimals=2)

df_pol

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Percentage
Attribute,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
workclass,Private,33887,69.41
workclass,Self-emp-not-inc,3862,7.91
workclass,Local-gov,3136,6.42
workclass,?,2795,5.73
workclass,State-gov,1981,4.06
workclass,Self-emp-inc,1695,3.47
workclass,Federal-gov,1432,2.93
workclass,Without-pay,21,0.04
workclass,Never-worked,10,0.02
education,HS-grad,15777,32.32


In [10]:
# Binomical attributes
df_bin = pd.DataFrame()
df_bin["Count"] = dataset.gender.value_counts()
per_arr = np.array([])
for x in df_bin['Count'].iteritems():
    per_arr = np.append(per_arr, x[1]/dataset.shape[0])
df_bin["Percentage"] = per_arr
df_bin

Unnamed: 0,Count,Percentage
Male,32633,0.668449
Female,16186,0.331551


### Data Preparation

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [12]:
dataset_dummies = pd.get_dummies(dataset)

X = dataset_dummies.loc[:, 'age':'native-country_ Yugoslavia']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("X train size: {}".format(X_train.shape))
print("X test size: {}".format(X_test.shape))
print("y train size: {}".format(y_train.shape))
print("y test size: {}".format(y_test.shape))

X train size: (39055, 108)
X test size: (9764, 108)
y train size: (39055,)
y test size: (9764,)


In [13]:
def computation(X_train, X_test, y_train, y_test):
    # Data Frame for results
    results = pd.DataFrame()

    # Logistic Regression
    clf = LogisticRegression(solver='lbfgs', max_iter=600).fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "Logistic Regression"
    results = results.append(s)

    # KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=14).fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "KNeighborsClassifier"
    results = results.append(s)

    # MultinomialNB
    clf = MultinomialNB().fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "MultinomialNB"
    results = results.append(s)

    # GaussianNB
    clf = GaussianNB().fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "GaussianNB"
    results = results.append(s)

    # RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=12).fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "Ranfom Forest Classifier"
    results = results.append(s)

    return results

In [20]:
computation(X_train, X_test, y_train, y_test)

Unnamed: 0,Test,Train
Logistic Regression,0.798546,0.797721
KNeighborsClassifier,0.799058,0.808757
MultinomialNB,0.784412,0.783254
GaussianNB,0.794654,0.795135
Ranfom Forest Classifier,0.857435,0.869748


In [21]:
# removing features 'native-country', 'fnlwgt', 'marital-status'
features = ['age', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours-per-week', 'income']
ch_dataset_1 = dataset[features]
dataset_dummies = pd.get_dummies(ch_dataset_1)
X = dataset_dummies.loc[:, 'age':'gender_ Male']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

computation(X_train, X_test, y_train, y_test)



Unnamed: 0,Test,Train
Logistic Regression,0.850778,0.850416
KNeighborsClassifier,0.855592,0.869569
MultinomialNB,0.787587,0.778517
GaussianNB,0.826403,0.826988
Ranfom Forest Classifier,0.862966,0.871028


In [14]:
dataset_binning = dataset

In [15]:
bins_hours = np.linspace(min(dataset_binning['hours-per-week']), max(dataset_binning['hours-per-week']), 10)
group_names = ["Very Low", "Low", "Above Low", "Bellow Avg", "Avg", "Above Avg", "Bellow High", "High", "Above High"]
dataset_binning["hours_binned"] = pd.cut(dataset_binning['hours-per-week'], bins_hours, labels=group_names)

In [16]:
bins_ages = np.linspace(min(dataset_binning['age']), max(dataset_binning['age']), 8)
group_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']
dataset_binning["ages_binned"] = pd.cut(dataset_binning['age'], bins_ages, labels=group_names)

In [25]:
# binning dataset
features = ['ages_binned', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours_binned', 'income']
dataset_binning = dataset_binning[features]
dataset_dummies = pd.get_dummies(dataset_binning)
X = dataset_dummies.loc[:, 'education-num':'hours_binned_Above High']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
computation(X_train, X_test, y_train, y_test)

Unnamed: 0,Test,Train
Logistic Regression,0.853953,0.855844
KNeighborsClassifier,0.86061,0.877352
MultinomialNB,0.785027,0.779055
GaussianNB,0.816571,0.818256
Ranfom Forest Classifier,0.859586,0.868749


In [26]:
features = ['ages_binned', 'workclass', 'education', 'relationship', 'race',
         'capital-gain', 'capital-loss', 'hours_binned', 'income']
dataset_binning = dataset_binning[features]
dataset_dummies = pd.get_dummies(dataset_binning)
X = dataset_dummies.loc[:, 'capital-gain':'hours_binned_Above High']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
computation(X_train, X_test, y_train, y_test)

Unnamed: 0,Test,Train
Logistic Regression,0.849652,0.84952
KNeighborsClassifier,0.858767,0.865165
MultinomialNB,0.778677,0.780438
GaussianNB,0.828451,0.824734
Ranfom Forest Classifier,0.854977,0.862604


### Modelling

In [28]:
features = ['age', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours-per-week', 'income']
ch_dataset_1 = dataset[features]
dataset_dummies = pd.get_dummies(ch_dataset_1)
X = dataset_dummies.loc[:, 'age':'gender_ Male']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

results = pd.DataFrame()
for i in range(4, 20, 1):
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=i).fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "RanForClassifier max_depth " + str(i)
    results = results.append(s)
results

Unnamed: 0,Test,Train
RanForClassifier max_depth 4,0.835621,0.828345
RanForClassifier max_depth 5,0.848832,0.842914
RanForClassifier max_depth 6,0.8517,0.845321
RanForClassifier max_depth 7,0.856104,0.850826
RanForClassifier max_depth 8,0.858664,0.856075
RanForClassifier max_depth 9,0.859996,0.861017
RanForClassifier max_depth 10,0.861737,0.863782
RanForClassifier max_depth 11,0.862864,0.867571
RanForClassifier max_depth 12,0.863068,0.871617
RanForClassifier max_depth 13,0.863273,0.87515


In [18]:
dataset_binning = dataset

bins_hours = np.linspace(min(dataset_binning['hours-per-week']), max(dataset_binning['hours-per-week']), 10)
group_names = ["Very Low", "Low", "Above Low", "Bellow Avg", "Avg", "Above Avg", "Bellow High", "High", "Above High"]
dataset_binning["hours_binned"] = pd.cut(dataset_binning['hours-per-week'], bins_hours, labels=group_names)

bins_ages = np.linspace(min(dataset_binning['age']), max(dataset_binning['age']), 8)
group_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']
dataset_binning["ages_binned"] = pd.cut(dataset_binning['age'], bins_ages, labels=group_names)

features = ['ages_binned', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours_binned', 'income']
dataset_binning = dataset_binning[features]
dataset_dummies = pd.get_dummies(dataset_binning)
X = dataset_dummies.loc[:, 'education-num':'hours_binned_Above High']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

results = pd.DataFrame()
for i in range(4, 32, 2):
    clf = KNeighborsClassifier(n_neighbors=i, n_jobs=-1).fit(X_train, y_train)
    s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
    s.name = "n_neighbors " + str(i)
    print(s.name + "...")
    results = results.append(s)
results

n_neighbors 4...
n_neighbors 6...
n_neighbors 8...
n_neighbors 10...
n_neighbors 12...
n_neighbors 14...
n_neighbors 16...
n_neighbors 18...
n_neighbors 20...
n_neighbors 22...
n_neighbors 24...
n_neighbors 26...
n_neighbors 28...
n_neighbors 30...


Unnamed: 0,Test,Train
n_neighbors 4,0.856309,0.885956
n_neighbors 6,0.858869,0.882653
n_neighbors 8,0.861942,0.880758
n_neighbors 10,0.863888,0.879913
n_neighbors 12,0.865014,0.878223
n_neighbors 14,0.865014,0.876712
n_neighbors 16,0.860815,0.875663
n_neighbors 18,0.862044,0.875765
n_neighbors 20,0.859893,0.873921
n_neighbors 22,0.863273,0.873614


In [21]:
# Data Frame for results
results = pd.DataFrame()

# Logistic Regression
clf = LogisticRegression(solver='lbfgs', max_iter=600).fit(X_train, y_train)
s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
s.name = "Logistic Regression"
results = results.append(s)

results

Unnamed: 0,Test,Train
Logistic Regression,0.85467,0.855409


In [24]:
features = ['age', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours-per-week', 'income']
ch_dataset_1 = dataset[features]
dataset_dummies = pd.get_dummies(ch_dataset_1)
X = dataset_dummies.loc[:, 'age':'gender_ Male']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

results = pd.DataFrame()

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=16).fit(X_train, y_train)
s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
s.name = "Random Forests"
results = results.append(s)

dataset_binning = dataset

bins_hours = np.linspace(min(dataset_binning['hours-per-week']), max(dataset_binning['hours-per-week']), 10)
group_names = ["Very Low", "Low", "Above Low", "Bellow Avg", "Avg", "Above Avg", "Bellow High", "High", "Above High"]
dataset_binning["hours_binned"] = pd.cut(dataset_binning['hours-per-week'], bins_hours, labels=group_names)

bins_ages = np.linspace(min(dataset_binning['age']), max(dataset_binning['age']), 8)
group_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']
dataset_binning["ages_binned"] = pd.cut(dataset_binning['age'], bins_ages, labels=group_names)

features = ['ages_binned', 'workclass', 'education', 'education-num', 'occupation', 'relationship', 'race', 'gender',
         'capital-gain', 'capital-loss', 'hours_binned', 'income']
dataset_binning = dataset_binning[features]
dataset_dummies = pd.get_dummies(dataset_binning)
X = dataset_dummies.loc[:, 'education-num':'hours_binned_Above High']
y = dataset_dummies['income_ >50K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = KNeighborsClassifier(n_neighbors=18, n_jobs=-1).fit(X_train, y_train)
s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
s.name = "kNN"
results = results.append(s)

clf = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
s = pd.Series([clf.score(X_train, y_train), clf.score(X_test, y_test)], index=['Train', 'Test'])
s.name = "Logistic Regression"
results = results.append(s)

results



Unnamed: 0,Test,Train
Random Forests,0.865834,0.891051
kNN,0.856616,0.875125
Logistic Regression,0.85467,0.855307
