In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import os
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

In [15]:
# Load the training split from the local data folder and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer='./data/adult.data')
df_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Load the held-out test split from the local data folder and preview its first rows.
df_test = pd.read_csv(filepath_or_buffer='./data/adult.test')
df_test.head(n=5)

In [4]:
# Persist the distinct values of every categorical (object-dtype) training
# column to ./categories/<column>.txt, one value per line. The encoding cell
# further down rebuilds its label encoders from these files.
label_encoders = {}  # created here but not populated by this cell

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before the first file is written.
os.makedirs('./categories', exist_ok=True)

for column in df_train.select_dtypes(include=['object']).columns:
    # Skip missing values: they would otherwise be written as the literal
    # text "nan" and later be read back as if they were a real category.
    labels = df_train[column].dropna().unique()
    with open(f'./categories/{column}.txt', 'w') as file:
        file.writelines(f'{label}\n' for label in labels)

In [17]:
# Encode the categorical columns of both splits with the category lists stored
# in ./categories, then separate features from the 'income' target.
#
# NOTE(review): this cell overwrites df_train / df_test with their encoded
# versions, so it can only be run once per load of the raw frames.
mapping_folder_path = './categories'
file_extension = '.txt'
file_list = [file_name for file_name in os.listdir(mapping_folder_path) if file_name.endswith(file_extension)]

# Drop incomplete rows *before* encoding. LabelEncoder.transform rejects values
# that were not part of the fitted classes, so a missing value in a categorical
# column would abort the loop below before a later dropna() could remove it.
# .copy() detaches the filtered frames so the column assignments below do not
# emit pandas' chained-assignment warning.
df_train = df_train.dropna().copy()
df_test = df_test.dropna().copy()

for file_name in file_list:
    # Remove only the known extension, so a column name that itself contains a
    # dot is recovered intact (split('.')[0] would truncate it).
    column = file_name[:-len(file_extension)]
    with open(os.path.join(mapping_folder_path, file_name), 'r') as file:
        # Strip just the line terminator: the files hold the raw category
        # values, and removing other whitespace would make a padded value
        # differ from what is stored in the data frames.
        labels = [line.rstrip('\n') for line in file]

    # One encoder per column, fitted on the stored categories and applied to
    # both splits so they share the same integer codes.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    df_train[column] = label_encoder.transform(df_train[column])
    df_test[column] = label_encoder.transform(df_test[column])

# 'income' is the prediction target; 'fnlwgt' is excluded from the features.
X_train = df_train.drop(columns=['income', 'fnlwgt'])
y_train = df_train['income']
X_test = df_test.drop(columns=['income', 'fnlwgt'])
y_test = df_test['income']

In [6]:
# Disabled scratch code (nothing in this cell executes): it fitted a scaler on
# X_train and standardised only the first row of X_test.
# scaler = StandardScaler()
# scaler.fit(X_train)
# x_std = scaler.transform(X_test.head(1))

In [18]:
# Standardise the features. The scaler is fitted on the training features only
# and the same fitted scaler is then applied to the test features.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train).astype('float32')
X_test_std = scaler.transform(X_test).astype('float32')

# Cast the targets to float32 as well.
y_train, y_test = y_train.astype('float32'), y_test.astype('float32')

In [19]:
# Fully connected network: two ReLU hidden layers (128 and 64 units) followed
# by a single sigmoid output unit. The input width is taken from the number of
# columns in the standardised training matrix.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_std.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [20]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the tracked metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in mini-batches of 32, reserving 20% of the training
# data for validation.
model.fit(
    x=X_train_std,
    y=y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
)

In [22]:
# Score the test set, then turn the sigmoid outputs into hard 0/1 labels:
# anything strictly above 0.5 becomes 1, everything else 0.
y_pred_proba = model.predict(X_test_std)
y_pred = np.greater(y_pred_proba, 0.5).astype(int)



In [23]:
# Compare the thresholded predictions with the test labels and report the
# accuracy rounded to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.84


In [24]:
# Persist the trained network twice: the weights alone as an HDF5 file, and the
# complete model under ./models/dnn.
# Make sure both target folders exist first, so saving does not depend on them
# having been created by hand (writing the .h5 file fails if ./weights is
# missing).
os.makedirs('./weights', exist_ok=True)
os.makedirs('./models', exist_ok=True)
model.save_weights('./weights/dnn.h5')
model.save('./models/dnn')

INFO:tensorflow:Assets written to: ../models/dnn\assets


INFO:tensorflow:Assets written to: ../models/dnn\assets
