# ISEA Week 6 - Modeling practice

Author: Lovenoor Aulck

In this exercise, we are going to attempt to predict salary. The data we will use can be found here: 

https://archive.ics.uci.edu/dataset/2/adult

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
sns.set_style('whitegrid')

### Import and inspect data

In [3]:
data = pd.read_csv('adult.data', header = None)

In [4]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
        'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income']

In [5]:
data.columns = cols

In [6]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [8]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [9]:
data.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           15
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       42
income                2
dtype: int64

In [12]:
data.income.value_counts(dropna = False, normalize = True)

 <=50K    0.75919
 >50K     0.24081
Name: income, dtype: float64

### Clean data

In [13]:
data_cleaned = data.copy()

In [14]:
data_cleaned = pd.get_dummies(data_cleaned)

In [15]:
data_cleaned.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ <=50K,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [16]:
data.shape, data_cleaned.shape

((32561, 15), (32561, 110))

In [18]:
y = data_cleaned['income_ <=50K'].copy()

In [20]:
X = data_cleaned.drop(['income_ <=50K', 'income_ >50K'], axis = 1).copy()

In [21]:
y.shape, X.shape

((32561,), (32561, 108))

### Make predictions

In [22]:
def get_pred_results(actuals, probs, preds):
    auc = roc_auc_score(actuals, probs)
    f1 = f1_score(actuals, preds)
    
    print("Model performance:")
    print("--------------------")
    print("AUC is {}".format(auc))
    print("F1 is {}".format(f1))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

In [24]:
X_train.shape, X_test.shape

((26048, 108), (6513, 108))

In [25]:
y_train.shape, y_test.shape

((26048,), (6513,))

#### Model 1 - logistic regression

In [26]:
model_1 = LogisticRegression()

In [27]:
model_1.fit(X_train, y_train)

In [32]:
y_probs = model_1.predict_proba(X_test)[:, 1]
y_preds = model_1.predict(X_test)

In [33]:
get_pred_results(y_test, y_probs, y_preds)

Model performance:
--------------------
AUC is 0.5740261192390785
F1 is 0.8846329229366166


#### Model 2 - Random Forests

In [34]:
model_2 = RandomForestClassifier()

In [35]:
model_2.fit(X_train, y_train)

In [36]:
y_probs = model_2.predict_proba(X_test)[:, 1]
y_preds = model_2.predict(X_test)

In [37]:
get_pred_results(y_test, y_probs, y_preds)

Model performance:
--------------------
AUC is 0.9034788158887823
F1 is 0.909429523622819


#### Model 3 - TabNet

In [38]:
X_train_array = np.array(X_train)
y_train_array = np.array(y_train).reshape(-1, 1)

In [39]:
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train_array, y_train, test_size = 0.2, random_state = 11)

In [40]:
model_3 = TabNetClassifier(verbose = 10)



In [42]:
model_3.fit(
    X_train = X_train_2,
    y_train = y_train_2,
    eval_set = [(X_val, y_val)],
    eval_metric = ['auc'],
    max_epochs = 100,
    batch_size = 1024,
    patience = 50,
    num_workers = 0
)

epoch 0  | loss: 0.56256 | val_0_auc: 0.67766 |  0:00:01s
epoch 10 | loss: 0.37144 | val_0_auc: 0.87657 |  0:00:18s
epoch 20 | loss: 0.33292 | val_0_auc: 0.90585 |  0:00:35s
epoch 30 | loss: 0.31848 | val_0_auc: 0.90759 |  0:00:51s
epoch 40 | loss: 0.31265 | val_0_auc: 0.91094 |  0:01:08s
epoch 50 | loss: 0.30792 | val_0_auc: 0.91205 |  0:01:24s
epoch 60 | loss: 0.29947 | val_0_auc: 0.90853 |  0:01:41s
epoch 70 | loss: 0.2958  | val_0_auc: 0.90691 |  0:01:57s
epoch 80 | loss: 0.28945 | val_0_auc: 0.90929 |  0:02:14s
epoch 90 | loss: 0.29868 | val_0_auc: 0.90245 |  0:02:30s

Early stopping occurred at epoch 99 with best_epoch = 49 and best_val_0_auc = 0.91406




In [44]:
y_probs = model_3.predict_proba(np.array(X_test))[:, 1]
y_preds = model_3.predict(np.array(X_test))

In [45]:
get_pred_results(y_test, y_probs, y_preds)

Model performance:
--------------------
AUC is 0.9055987577986999
F1 is 0.9091801669121257
