In [198]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

from sklearn import preprocessing

import tensorflow as tf

from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras.models import Model

from scipy import stats


In [199]:
data = pd.read_csv("data.csv")

In [200]:
data = data.drop('education', axis=1)

In [201]:
data.shape

(22792, 14)

In [202]:
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [203]:
# Replace all cells with the value '?' with null
data = data.replace({'?': np.nan})

In [204]:
for col in data.columns:
    data[col] = data[col].fillna(data[col].mode()[0])
    
# data = data[~(data.T == '?').any()]

In [205]:
data.shape

(22792, 14)

In [206]:
data = data.drop_duplicates()
data.shape

(22777, 14)

In [180]:
data = pd.get_dummies(data, columns=['position'], prefix='position')
data = pd.get_dummies(data, columns=['native-country'], prefix='country')
data = pd.get_dummies(data, columns=['work-class'], prefix='workk')
data = pd.get_dummies(data, columns=['marital-status'], prefix='marital')
data = pd.get_dummies(data, columns=['relationship'], prefix='relationship')
data = pd.get_dummies(data, columns=['race'], prefix='race')
data = pd.get_dummies(data, columns=['sex'], prefix='sex')

# data.insert(63, 'work_Never-worked', 0)

In [181]:
le = preprocessing.LabelEncoder()

data['salary'] = le.fit_transform(data.salary.values)
# data['position'] = le.fit_transform(data.position.values)
data['native-country'] = le.fit_transform(data['native-country'].values)
data['work-class'] = le.fit_transform(data['work-class'].values)
# data['marital-status'] = le.fit_transform(data['marital-status'].values)
# data['relationship'] = le.fit_transform(data['relationship'].values)
# data['race'] = le.fit_transform(data['race'].values)
# data['sex'] = le.fit_transform(data['sex'].values)

data.replace(['<=50K', '>50K'],
             [0, 1], inplace=True)
data.head()

Unnamed: 0,age,work-class,work-fnl,education-num,capital-gain,capital-loss,hours-per-week,native-country,salary,position_Adm-clerical,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
0,39,6,77516,13,2174,0,40,38,0,1,...,0,0,0,0,0,0,0,1,0,1
1,50,5,83311,13,0,0,13,38,0,0,...,0,0,0,0,0,0,0,1,0,1
2,38,3,215646,9,0,0,40,38,0,0,...,0,0,0,0,0,0,0,1,0,1
3,53,3,234721,7,0,0,40,38,0,0,...,0,0,0,0,0,1,0,0,0,1
4,28,3,338409,13,0,0,40,4,0,0,...,0,0,1,0,0,1,0,0,1,0


In [182]:
def remove_outlier_IQR(df):
    num_columns = [x for x in data.columns if data[x].dtype !=object]
    reduced_df=data[num_columns]
    Q1=reduced_df.quantile(0.25)
    Q3=reduced_df.quantile(0.75)
    IQR=Q3-Q1
    df_final_index=reduced_df[~((reduced_df<(Q1-1.5*IQR)) | (reduced_df>(Q3+1.5*IQR)))].index
    return data.loc[df_final_index.values]

In [183]:
data = remove_outlier_IQR(data)

In [184]:
#balance data
# X = data.drop('salary', axis = 1)
# y = data['salary']
# robust = RobustScaler()
# X_scaled = robust.fit_transform(X)

In [185]:
# data = data[(np.abs(stats.zscore(data)) < 3).all(axis=1)]

In [186]:
train_dataset = data.sample(frac=0.8, random_state=0)
test_dataset = data.drop(train_dataset.index)


In [187]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('salary')
test_labels = test_features.pop('salary')


In [188]:
train_dataset[['capital-gain', 'capital-loss', 'work-fnl', 'age']].describe().transpose()[['mean', 'std']]

Unnamed: 0,mean,std
capital-gain,1056.558117,7307.70083
capital-loss,85.288004,398.612525
work-fnl,189728.549555,105068.844997
age,38.595489,13.662685


In [189]:
normalizer = tf.keras.layers.Normalization(axis=-1)


In [190]:
normalizer.adapt(np.array(train_features))

In [192]:
model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    # tf.keras.layers.Dense(512, activation='relu'),
    # tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2)
])


In [193]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [194]:
r = model.fit(train_features, train_labels, validation_data=(test_features, test_labels), epochs=50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [106]:
test = pd.read_csv('test.csv')
test = test.drop('education', axis=1)
test = test.applymap(lambda x: x.strip() if isinstance(x, str) else x)
test = test.replace('?', np.nan)

test = pd.get_dummies(test, columns=['position'], prefix='position')
# test = pd.get_dummies(test, columns=['native-country'], prefix='country')
# test = pd.get_dummies(test, columns=['work-class'], prefix='work')
test = pd.get_dummies(test, columns=['marital-status'], prefix='marital')
test = pd.get_dummies(test, columns=['relationship'], prefix='relationship')
test = pd.get_dummies(test, columns=['race'], prefix='race')
test = pd.get_dummies(test, columns=['sex'], prefix='sex')

# test.insert(34, 'country_Holand-Netherlands', 0)

le = preprocessing.LabelEncoder()

test['native-country'] = le.fit_transform(test['native-country'].values)
test['work-class'] = le.fit_transform(test['work-class'].values)


In [195]:
predictions = model.predict(test)
predictions



array([[ 4.906, -3.699],
       [ 0.704, -0.13 ],
       [ 4.88 , -3.347],
       ...,
       [ 0.485,  0.117],
       [ 3.364, -1.919],
       [ 4.354, -2.802]], dtype=float32)

In [108]:
predictions = np.argmax(predictions, axis=1)

In [109]:
submission = pd.DataFrame({'index': test.index, 'salary': predictions})

submission['salary'].replace([0, 1], ['<=50K', '>50K'], inplace=True)

submission.to_csv('submission.csv', index=False)


In [111]:
df = pd.read_csv('submission.csv')

df['salary'] = df['salary'].apply(lambda x: ' ' + x)

# Save the modified DataFrame to a new CSV file
df.to_csv('modified_file.csv', index=False)