# Income predictor based on Census Data

I am going to use Census Data to predict whether an individual makes over $50k per year or not. 

In [1]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt

Step 1: Analyze the Data

In [2]:
adultDataSet_filename = os.path.join(os.getcwd(), "censusData.csv")
df = pd.read_csv(adultDataSet_filename, header=0)
df.shape
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income_binary
0,39.0,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Non-Female,2174,0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Non-Female,0,0,13.0,United-States,<=50K
2,38.0,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Non-Female,0,0,40.0,United-States,<=50K
3,53.0,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Non-Female,0,0,40.0,United-States,<=50K
4,28.0,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40.0,Cuba,<=50K


In [3]:
columns = df.columns

In [4]:
for i in columns:
    print(i)
    print(df[i].dtype)
    print(df[i].unique())
    print()

age
float64
[39. 50. 38. 53. 28. 37. 49. 52. 31. 42. 30. 23. 32. 40. 34. 25. 43. 54.
 35. 59. 56. 19. 20. 45. 22. 48. 21. 24. 57. 44. 41. 29. nan 18. 47. 46.
 36. 79. 27. 67. 33. 76. 17. 55. 61. 70. 64. 71. 68. 66. 51. 58. 26. 60.
 90. 75. 65. 77. 62. 63. 80. 72. 74. 69. 73. 81. 78. 88. 82. 83. 84. 85.
 86. 87.]

workclass
object
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']

fnlwgt
int64
[ 77516  83311 215646 ...  34066  84661 257302]

education
object
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']

education-num
int64
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]

marital-status
object
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

occupation
object
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty

In [5]:
nan_count = np.sum(df.isnull(), axis = 0)
print(nan_count)
nan_detected = ['age', 'workclass', 'occupation', 'hours-per-week', 'native-country']
print(nan_detected)
df.shape
#I plan on one-hot-encoding workclass and occupation since they don't have many values
#I don't think native-country is relevant enough to use, and it has so many values so i will cut it
#I think i will cut the instances that are nan for age and hours-per-week since they seem like important 
#factors in prediction and not too many are nan

age                162
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex_selfID           0
capital-gain         0
capital-loss         0
hours-per-week     325
native-country     583
income_binary        0
dtype: int64
['age', 'workclass', 'occupation', 'hours-per-week', 'native-country']


(32561, 15)

Step 2: Prepare the data for the model. I will be dropping na instances for all columns as well as removing a few columns that are irrelevant to my problem. I will also perform one-hot-encoding to switch all data to numerical 

In [6]:
#cutting nan instances from age and hours-per-week
df = df.dropna(subset=["age"])
df = df.dropna(subset=["hours-per-week"])
df = df.dropna(subset=['occupation'])
df = df.dropna(subset=['workclass'])
nan_count = np.sum(df.isnull(), axis = 0)
print(nan_count)

age                 0
workclass           0
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation          0
relationship        0
race                0
sex_selfID          0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    550
income_binary       0
dtype: int64


In [7]:
#dropping native-country/fnlwgt
df = df.drop(['native-country'], axis=1)
df = df.drop(['fnlwgt'], axis=1)
df.columns

#also dropping relationship and education columns because it is too similar to marital-status and education-num
df = df.drop(['relationship'], axis=1)
df = df.drop(['education'], axis=1)
#len(df['fnlwgt'].unique().tolist())

In [8]:
#after creating my model I am going to drop some more feature columns
#and comment out their one-hot-encodings
df = df.drop(['marital-status'], axis =1)
df = df.drop(['occupation'], axis =1)
df = df.drop(['workclass'],axis =1)

In [9]:
#one-hot-encoding sex_selfID
df['sex_selfID'].unique()

df_sex_selfID = pd.get_dummies(df['sex_selfID'], prefix='sex_selfID')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_sex_selfID)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'sex_selfID', inplace=True)

In [10]:
#one-hot-encoding race
df['race'].unique()
df_race = pd.get_dummies(df['race'], prefix='race')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_race)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'race', inplace=True)

In [11]:
#one-hot-encoding marital-status
'''
df['marital-status'].unique()
df_marital_status = pd.get_dummies(df['marital-status'], prefix='marital-status')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_marital_status)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'marital-status', inplace=True)
'''

"\ndf['marital-status'].unique()\ndf_marital_status = pd.get_dummies(df['marital-status'], prefix='marital-status')\n# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type\ndf = df.join(df_marital_status)\n# Remove the original 'room_type' column from DataFrame df\ndf.drop(columns = 'marital-status', inplace=True)\n"

In [12]:
#one-hot-encoding workclass
'''
df['workclass'].value_counts()

df_workclass = pd.get_dummies(df['workclass'], prefix='workclass')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_workclass)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'workclass', inplace=True)
'''

"\ndf['workclass'].value_counts()\n\ndf_workclass = pd.get_dummies(df['workclass'], prefix='workclass')\n# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type\ndf = df.join(df_workclass)\n# Remove the original 'room_type' column from DataFrame df\ndf.drop(columns = 'workclass', inplace=True)\n"

In [13]:
#one-hot-encoding occupation
'''
df['occupation'].value_counts()

df_occupation = pd.get_dummies(df['occupation'], prefix='occupation')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_occupation)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'occupation', inplace=True)
'''

"\ndf['occupation'].value_counts()\n\ndf_occupation = pd.get_dummies(df['occupation'], prefix='occupation')\n# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type\ndf = df.join(df_occupation)\n# Remove the original 'room_type' column from DataFrame df\ndf.drop(columns = 'occupation', inplace=True)\n"

In [14]:
df.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income_binary,sex_selfID_Female,sex_selfID_Non-Female,race_Amer-Indian-Inuit,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39.0,13,2174,0,40.0,<=50K,False,True,False,False,False,False,True
1,50.0,13,0,0,13.0,<=50K,False,True,False,False,False,False,True
2,38.0,9,0,0,40.0,<=50K,False,True,False,False,False,False,True
3,53.0,7,0,0,40.0,<=50K,False,True,False,False,True,False,False
4,28.0,13,0,0,40.0,<=50K,True,False,False,False,True,False,False


In [15]:
#fixing label to be 1 or 0

df_income_binary = pd.get_dummies(df['income_binary'], prefix='income_binary')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_income_binary)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'income_binary', inplace=True)


In [16]:
df.head()
df.drop(columns = 'income_binary_<=50K', inplace=True)

In [17]:
#label is income_binary_>50k, rest are feature columns
df.head()
df.columns

Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'sex_selfID_Female', 'sex_selfID_Non-Female',
       'race_Amer-Indian-Inuit', 'race_Asian-Pac-Islander', 'race_Black',
       'race_Other', 'race_White', 'income_binary_>50K'],
      dtype='object')

In [18]:
#changing everything to float
columns = df.columns.tolist()
df[columns] = df[columns].astype(float)

In [19]:
df.head()
df.dtypes
df.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,sex_selfID_Female,sex_selfID_Non-Female,race_Amer-Indian-Inuit,race_Asian-Pac-Islander,race_Black,race_Other,race_White,income_binary_>50K
0,39.0,13.0,2174.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,50.0,13.0,0.0,0.0,13.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,38.0,9.0,0.0,0.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,53.0,7.0,0.0,0.0,40.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,28.0,13.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Step 3: Creating the Model

In [20]:
#importing more packages to build the nueral network

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import time

In [21]:
import tensorflow as tf
import keras
from keras import layers

In [22]:
from keras.layers import BatchNormalization

In [23]:
#Create Labeled Examples from the Data Set
y = df['income_binary_>50K']
X = df.drop(columns = 'income_binary_>50K', axis=1)

In [24]:
#Create Training and Test Data Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [25]:
#over sampling because the model was just picking one option the entire time
%pip install imbalanced-learn 

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

Note: you may need to restart the kernel to use updated packages.


In [26]:
nn_model = keras.Sequential()

input_layer = keras.layers.InputLayer(input_shape=(X_train_resampled.shape[1],))

nn_model.add(input_layer)

hidden_layer_0 = keras.layers.Dense(units=128, activation='relu')
nn_model.add(hidden_layer_0)

hidden_layer_1 = keras.layers.Dense(units=64, activation='relu')
nn_model.add(hidden_layer_1)
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.2))


hidden_layer_2 = keras.layers.Dense(units=32, activation='relu')
nn_model.add(hidden_layer_2)
nn_model.add(keras.layers.BatchNormalization())

hidden_layer_3 = keras.layers.Dense(units=16, activation='relu')
nn_model.add(hidden_layer_3)

'''
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.5))
'''

output_layer = keras.layers.Dense(units=1, activation='sigmoid')
nn_model.add(output_layer)

nn_model.summary()



In [27]:
#defining optimization fuction
sgd_optimizer = keras.optimizers.SGD(learning_rate=0.05)

#defining loss function
#loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0.0,
                                            reduction='sum_over_batch_size',
                                            name='binary_crossentropy')

In [28]:
#compiling the model
nn_model.compile(optimizer=sgd_optimizer, loss=loss_fn, metrics=['accuracy'])

In [29]:
loss, accuracy = nn_model.evaluate(X_test, y_test)

[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step - accuracy: 0.7506 - loss: 59.7218 


In [30]:
probability_predictions = nn_model.predict(X_test)
class_label_predictions=[]
for i in range(0,len(y_test)):
    if probability_predictions[i] >= 0.6:
        class_label_predictions.append(1)
    else:
        class_label_predictions.append(0)

c_m = confusion_matrix(y_test, class_label_predictions, labels=[True, False])
pd.DataFrame(
c_m,
columns=['Predicted: over 50k', 'Predicted: under 50k'],
index=['Actual: over 50k', 'Actual: under 50k']
)

[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 594us/step


Unnamed: 0,Predicted: over 50k,Predicted: under 50k
Actual: over 50k,0,1939
Actual: under 50k,0,5628
