# Import libraries

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import os
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

# Read data from CSV files

In [2]:
df_train = pd.read_csv('./data/adult.data')
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df_test = pd.read_csv('./data/adult.test')
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


# Preprocess data

## Extract list of labels categories to .txt files

In [4]:
label_encoders = {}
for column in df_train.select_dtypes(include=['object']).columns:
    labels = df_train[column].unique()
    with open(f'./categories/{column}.txt', 'w') as file:
        for label in labels:
            file.write(f'{label}\n')

## Encode data

In [5]:
mapping_folder_path = './categories'
file_extension = '.txt'
file_list = [file_name for file_name in os.listdir(mapping_folder_path) if file_name.endswith(file_extension)]

for file_name in file_list:
    column = file_name.split('.')[0]
    with open(f'./categories/{file_name}', 'r') as file:
        labels = []
        for line in file:
            labels.append(line.strip())
        label_encoder = LabelEncoder()
        label_encoder.fit(labels)
        df_train[column] = label_encoder.transform(df_train[column])
        df_test[column] = label_encoder.transform(df_test[column])

df_train = df_train.dropna()
df_test = df_test.dropna()
X_train = df_train.drop('income', axis=1).drop('fnlwgt', axis=1)
y_train = df_train['income']
X_test = df_test.drop('income', axis=1).drop('fnlwgt', axis=1)
y_test = df_test['income']

## Standardize data

In [6]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_train_std = X_train_std.astype('float32')
X_test_std = scaler.transform(X_test)
X_test_std = X_test_std.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Deep Neural Network (DNN)

## Create model

In [7]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_std.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





## Train model

In [8]:
model.fit(X_train_std, y_train, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 

<keras.src.callbacks.History at 0x23960fbed90>

## Test model

In [9]:
y_pred_proba = model.predict(X_test_std)
y_pred = (y_pred_proba > 0.5).astype(int)



## Evaluate

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.84


## Save model and weights

In [11]:
model.save_weights('./weights/dnn.h5')
model.save('./models/dnn')

INFO:tensorflow:Assets written to: ./models/dnn\assets


INFO:tensorflow:Assets written to: ./models/dnn\assets


# SVM

## Train model

In [12]:
svm_model = SVC(kernel='linear', verbose=True)
svm_model.fit(X_train_std, y_train)

[LibSVM]

## Test model

In [13]:
y_pred = svm_model.predict(X_test_std)

## Evaluate

In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.81


# Logistic Regression

## Train model

In [20]:
classifier = LogisticRegression()
classifier.fit(X_train_std, y_train)

## Test model

In [21]:
predictions = classifier.predict(X_test)



## Evaluate model

In [22]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.23622627602727106
