# Income predictor based on Census Data

I am going to use Census Data to predict whether an individual makes over $50k per year or not. 

In [None]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt

Step 1: Analyze the Data

In [None]:
# Load the census CSV from the current working directory; the first row of
# the file supplies the column names.
adultDataSet_filename = os.path.join(os.getcwd(), "censusData.csv")
df = pd.read_csv(adultDataSet_filename, header=0)
# A bare `df.shape` in the middle of a cell is evaluated and discarded (only
# the last expression of a cell is rendered), so print it explicitly.
print(df.shape)
df.head()

In [None]:
# Keep the column index so that every column can be inspected in turn.
columns = df.columns

In [None]:
# For every column show its name, its dtype and the distinct values it holds,
# followed by a blank line to separate the columns.
for col_name in columns:
    col_values = df[col_name]
    print(col_name, col_values.dtype, col_values.unique(), "", sep="\n")

In [None]:
# Number of missing values in each column.
nan_count = df.isnull().sum(axis=0)
print(nan_count)
# Hand-written list of the columns that contain missing values.
nan_detected = ['age', 'workclass', 'occupation', 'hours-per-week', 'native-country']
print(nan_detected)
# Plan for the missing values:
#  - workclass / occupation: one-hot encode, since they have few distinct values
#  - native-country: drop the column (not relevant enough, too many distinct values)
#  - age / hours-per-week: drop the rows with NaN, since both look like important
#    predictors and only a few rows are affected
df.shape

Step 2: Prepare the data for the model. I will drop the rows that have missing values in age, hours-per-week, occupation, or workclass, and remove a few columns that are irrelevant to my problem. I will also one-hot encode the remaining categorical columns so that all of the data is numerical.

In [None]:
# Remove every row that is missing age, hours-per-week, occupation or
# workclass, then recount the NaNs per column to see what is left.
df = df.dropna(subset=["age", "hours-per-week", "occupation", "workclass"])
nan_count = df.isnull().sum(axis=0)
print(nan_count)

In [None]:
# native-country and fnlwgt are not used as features.
df = df.drop(columns=['native-country', 'fnlwgt'])
df.columns

# relationship and education are dropped too: they overlap too much with
# marital-status and education-num respectively.
df = df.drop(columns=['relationship', 'education'])

In [None]:
# Added after the first modelling pass: these categorical features are removed
# as well, and their one-hot-encoding cells are disabled accordingly.
df = df.drop(columns=['marital-status', 'occupation', 'workclass'])

In [None]:
# One-hot encode sex_selfID.
df['sex_selfID'].unique()

# One indicator column per distinct value, named sex_selfID_<value>.
df_sex_selfID = pd.get_dummies(df['sex_selfID'], prefix='sex_selfID')
# Append the indicator columns and remove the original sex_selfID column.
df = df.join(df_sex_selfID).drop(columns='sex_selfID')

In [None]:
# One-hot encode race.
df['race'].unique()
# One indicator column per distinct value, named race_<value>.
df_race = pd.get_dummies(df['race'], prefix='race')
# Append the indicator columns and remove the original race column.
df = df.join(df_race).drop(columns='race')

In [None]:
# One-hot encoding of marital-status -- currently disabled by wrapping the code
# in a string literal, so nothing in it is executed.
# NOTE(review): 'marital-status' is dropped from df in an earlier cell, so this
# code would raise a KeyError if it were re-enabled without restoring the column.
# The 'room_type' wording inside the string appears to be left over from
# another notebook.
'''
df['marital-status'].unique()
df_marital_status = pd.get_dummies(df['marital-status'], prefix='marital-status')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_marital_status)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'marital-status', inplace=True)
'''

In [None]:
# One-hot encoding of workclass -- currently disabled by wrapping the code in a
# string literal, so nothing in it is executed.
# NOTE(review): 'workclass' is dropped from df in an earlier cell, so this code
# would raise a KeyError if it were re-enabled without restoring the column.
# The 'room_type' wording inside the string appears to be left over from
# another notebook.
'''
df['workclass'].value_counts()

df_workclass = pd.get_dummies(df['workclass'], prefix='workclass')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_workclass)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'workclass', inplace=True)
'''

In [None]:
# One-hot encoding of occupation -- currently disabled by wrapping the code in
# a string literal, so nothing in it is executed.
# NOTE(review): 'occupation' is dropped from df in an earlier cell, so this code
# would raise a KeyError if it were re-enabled without restoring the column.
# The 'room_type' wording inside the string appears to be left over from
# another notebook.
'''
df['occupation'].value_counts()

df_occupation = pd.get_dummies(df['occupation'], prefix='occupation')
# Concatenate DataFrame df with the one-hot encoded DataFrame df_room_type
df = df.join(df_occupation)
# Remove the original 'room_type' column from DataFrame df
df.drop(columns = 'occupation', inplace=True)
'''

In [None]:
# Preview the first rows of the frame after the feature encoding steps.
df.head()

In [None]:
# Turn the income_binary label into indicator columns, one per distinct value
# and named income_binary_<value>, so the label can be expressed as 1 or 0.
df_income_binary = pd.get_dummies(df['income_binary'], prefix='income_binary')
# Append the indicator columns and remove the original income_binary column.
df = df.join(df_income_binary).drop(columns='income_binary')


In [None]:
df.head()
# Drop the '<=50K' indicator column; the other indicator column produced from
# income_binary stays in the frame.
df = df.drop(columns=['income_binary_<=50K'])

In [None]:
# The label is the income_binary_>50K column; every other column is a feature.
df.head()
df.columns

In [None]:
# Cast every column (including the one-hot indicator columns) to float.
columns = df.columns.tolist()
df = df.astype(float)

In [None]:
# Inspect the converted frame. Only the final expression of a cell is rendered,
# so the first head() and the dtypes lookup are evaluated but not displayed.
df.head()
df.dtypes
df.head()

Step 3: Creating the Model

In [None]:
#importing more packages to build the neural network

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import time

In [None]:
import tensorflow as tf
import keras
from keras import layers

In [None]:
from keras.layers import BatchNormalization

In [None]:
# Split the frame into the label vector (y) and the feature matrix (X).
y = df['income_binary_>50K']
X = df.drop(columns=['income_binary_>50K'])

In [None]:
# Create training and test data sets: 25% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [None]:
# Oversample the training data with SMOTE because the model was predicting a
# single class for every example.
# Notebook magic: installs imbalanced-learn into the running kernel's environment.
%pip install imbalanced-learn 

from imblearn.over_sampling import SMOTE

# Fixed seed so the resampling is reproducible.
smote = SMOTE(random_state=42)

# Only the training split is resampled; X_test / y_test are left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Feed-forward binary classifier, assembled one layer at a time.
nn_model = keras.Sequential()

# Input width = number of feature columns in the resampled training matrix.
input_layer = keras.layers.InputLayer(input_shape=(X_train_resampled.shape[1],))

nn_model.add(input_layer)

# Hidden stack narrows 128 -> 64 -> 32 -> 16 units, all with ReLU activation.
hidden_layer_0 = keras.layers.Dense(units=128, activation='relu')
nn_model.add(hidden_layer_0)

hidden_layer_1 = keras.layers.Dense(units=64, activation='relu')
nn_model.add(hidden_layer_1)
# Batch normalization followed by 20% dropout after the 64-unit layer.
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.2))


hidden_layer_2 = keras.layers.Dense(units=32, activation='relu')
nn_model.add(hidden_layer_2)
# Batch normalization only (no dropout) after the 32-unit layer.
nn_model.add(keras.layers.BatchNormalization())

hidden_layer_3 = keras.layers.Dense(units=16, activation='relu')
nn_model.add(hidden_layer_3)

# The string literal below is disabled code (an extra normalization/dropout
# pair); it is not executed and adds nothing to the model.
'''
nn_model.add(keras.layers.BatchNormalization())
nn_model.add(keras.layers.Dropout(0.5))
'''

# Single sigmoid unit: one output per example, used downstream as the
# probability of the positive (>50K) class.
output_layer = keras.layers.Dense(units=1, activation='sigmoid')
nn_model.add(output_layer)

# Print the layer-by-layer summary of the assembled model.
nn_model.summary()

In [None]:
# Defining the optimization function: stochastic gradient descent with a
# learning rate of 0.05.
sgd_optimizer = keras.optimizers.SGD(learning_rate=0.05)

# Defining the loss function: binary cross-entropy computed on probabilities
# (from_logits=False), with no label smoothing. The shorter call on the next
# line is the earlier, disabled version.
#loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0.0,
                                            reduction='sum_over_batch_size',
                                            name='binary_crossentropy')

In [None]:
# Compile the model with the SGD optimizer and binary cross-entropy loss
# defined above, and track accuracy as the reported metric.
nn_model.compile(optimizer=sgd_optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Train before scoring. The notebook compiled the model and went straight to
# evaluate(), so the reported loss/accuracy were those of a network that had
# never been fitted, and the SMOTE-resampled training data was never used.
num_epochs = 20  # starting point, not tuned -- adjust after inspecting the loss
t0 = time.time()
history = nn_model.fit(
    X_train_resampled,
    y_train_resampled,
    epochs=num_epochs,
    verbose=2,
)
print('Training time: %.2fs' % (time.time() - t0))

# Score the trained model on the held-out test split (not resampled).
loss, accuracy = nn_model.evaluate(X_test, y_test)

In [None]:
# Predicted probability of the positive class for every test example.
probability_predictions = nn_model.predict(X_test)
# Label an example as over-50k (1) only when its predicted probability is at
# least 0.6; everything else is labelled 0.
class_label_predictions = np.where(
    probability_predictions.ravel() >= 0.6, 1, 0
).tolist()

# labels=[True, False] puts the positive class in the first row and column;
# True/False compare equal to the 1/0 values used for labels and predictions.
c_m = confusion_matrix(y_test, class_label_predictions, labels=[True, False])
row_names = ['Actual: over 50k', 'Actual: under 50k']
col_names = ['Predicted: over 50k', 'Predicted: under 50k']
pd.DataFrame(c_m, columns=col_names, index=row_names)