To identify the appropriate datasets, a subset of candidate datasets with appropriate characteristics will be chosen. The draft characteristics are:
* Over 10,000 data points
* Binary classification problem
* Over 10 data fields in the dataset

For each dataset, the following protocol will be applied:
* Divide the dataset into 10 equal, randomly allocated folds
* For each fold:
	* Train the model on the other 9 folds, holding this fold out as the test set
		* N.B. optionally, one of the other 9 folds can be explicitly held out as a validation set for hyper-parameter tuning
	* Record the AUC (with full details)
	* Record secondary performance measures


# Include libraries

General-purpose standard-library imports

In [1]:
from __future__ import print_function
import os
import random
import time
import json
from datetime import datetime
from copy import deepcopy
import gzip
try:
    import cPickle as pickle  # pylint: disable=import-error
except ImportError:
    import pickle  # pylint: disable=import-error    

Data management and plotting

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

PyTorch

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Pick the compute device: prefer the second GPU (index 1) when the machine
# has more than one, otherwise the first GPU, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device("cuda:{}".format(_gpu_index))
else:
    device = torch.device("cpu")
# The training cells move models and batches to `cuda_device`, so it must
# always name a device that actually exists on this machine.
cuda_device = device

if not USE_CUDA:
    print("WARNING: CUDA is not available; training will run on the CPU.")

scikit-learn utilities (datasets, preprocessing, splitting, metrics)

In [4]:
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

sklearn models

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

# Set general parameters

In [6]:
# Seed recorded with every result and used for the K-fold shuffle.
RANDOM_SEED      = 42
# Number of cross-validation folds.
NUMBER_OF_SPLITS = 10
# Results file, one per run. The timestamp is formatted without spaces or
# colons so the name is a valid file name on every common filesystem.
SAVE_FILE_NAME   = './../../../data/uci/processed/results/adult/results_{}.csv'.format(
    datetime.now().strftime('%Y%m%d_%H%M%S_%f'))

# Get the dataset

The dataset is Adult UCI

In [7]:
# Column names passed to read_csv, in file order; the last one is the label.
adult_columns = (
    "age workclass fnlwgt education education_num marital_status "
    "occupation relationship race sex capital_gain capital_loss "
    "hours_per_week native_country gt50k"
).split()
y_cols = 'gt50k'


# index_col=False stops pandas from using the first column as the row index.
data = pd.read_csv(
    './../../../data/uci/processed/data/adult/adult.data',
    index_col=False,
    names=adult_columns)

Split into Xs and ys

In [8]:
# Every column except the label column is treated as a feature.
x_cols = [column for column in data.columns.values.tolist() if column != y_cols]

# Raw (un-encoded, un-scaled) features and raw label strings.
xs_raw, ys_raw = data[x_cols], data[y_cols]

In [9]:
# Split the feature columns by dtype: object columns are categorical,
# int64 columns are numerical.
categorical_feature_mask = xs_raw.dtypes==object
numerical_feature_mask = xs_raw.dtypes=="int64"

categorical_cols = xs_raw.columns[categorical_feature_mask].tolist()
numerical_cols = xs_raw.columns[numerical_feature_mask].tolist()

# OHE categoricals
# The dtype is pinned: pandas >= 2.0 produces bool dummies by default, and a
# frame mixing bool and float columns converts to an object array that torch
# cannot turn into a tensor.
onehotencoded = pd.get_dummies(xs_raw[categorical_cols], dtype=np.uint8)

## Linear scaling
# NOTE(review): the scaler is fitted on the whole dataset before the folds are
# split, so statistics of each test fold leak into training. Confirm whether
# that is acceptable for this protocol.
scaler = StandardScaler()
numericals = scaler.fit_transform(xs_raw[numerical_cols].values)

# Assemble the model matrix: non-categorical columns in their original order
# (numerical ones replaced by their scaled values), then the dummy columns.
# A single concat avoids inserting the dummy columns one at a time.
xs = xs_raw.drop(categorical_cols, axis=1)
xs[numerical_cols] = pd.DataFrame(numericals, columns=numerical_cols, index=xs.index)
xs = pd.concat([xs, onehotencoded], axis=1)


####

## Adjust outcome var
# Strip surrounding whitespace so the comparison does not depend on the
# leading space kept by the CSV parser.
ys = ys_raw.str.strip() == '>50K'

# Results structure

Results are going to have:
* What type of model
* What random seed
* What cross fold
* What performance measures

In [10]:
def mark_result(model_name, random_seed, cross_fold_index, predictions, AUC_score):
    """Bundle the outcome of one cross-validation fold into a flat record.

    Args:
        model_name: Label identifying the model that produced the result.
        random_seed: Seed associated with the run.
        cross_fold_index: Index of the fold that served as the test set.
        predictions: Iterable of per-sample predictions; copied into a list.
        AUC_score: AUC obtained on the fold.

    Returns:
        dict with the keys ``modelName``, ``randomSeed``, ``crossFoldIndex``,
        ``predictions`` and ``auc``.
    """
    record = {}
    record['modelName'] = model_name
    record['randomSeed'] = random_seed
    record['crossFoldIndex'] = cross_fold_index
    record['predictions'] = list(predictions)
    record['auc'] = AUC_score
    return record

# Cross validation

In [11]:
# Shuffled K-fold splitter with a fixed seed; every model cell iterates
# kf.split(xs) to obtain its train/test folds.
kf = KFold(shuffle=True, n_splits=NUMBER_OF_SPLITS, random_state=RANDOM_SEED)

In [12]:
# Single 85/15 hold-out split. The cross-validation cells rebind these four
# names on every fold.
(X_train, X_test,
 y_train, y_test) = train_test_split(xs, ys, random_state=42, test_size=0.15)

In [13]:
# Accumulator for the per-fold result records appended by the model cells.
results = list()

# Neural Network

In [14]:
class NeuralNetLarge(nn.Module):
    """Fully connected network with six ReLU hidden layers and a linear output.

    The output layer applies no activation, so ``forward`` returns raw scores.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.
        hidden_width: Width of every hidden layer (default 256).
    """

    def __init__(self, input_size, output_size, hidden_width=256):
        super(NeuralNetLarge, self).__init__()
        # Input projection, five hidden-to-hidden layers, output projection.
        self.fc1 = nn.Linear(input_size, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, hidden_width)
        self.fc4 = nn.Linear(hidden_width, hidden_width)
        self.fc5 = nn.Linear(hidden_width, hidden_width)
        self.fc6 = nn.Linear(hidden_width, hidden_width)
        self.fc7 = nn.Linear(hidden_width, output_size)

        self.relu = nn.ReLU()
        # Registered on the module but not applied in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the raw output scores for the batch ``x``."""
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6)
        activations = x
        for layer in hidden_layers:
            activations = self.relu(layer(activations))
        return self.fc7(activations)
    
class NeuralNetMed(nn.Module):
    """Fully connected network with four ReLU hidden layers and a linear output.

    The output layer applies no activation, so ``forward`` returns raw scores.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.
        hidden_width: Width of every hidden layer (default 128).
    """

    def __init__(self, input_size, output_size, hidden_width=128):
        super(NeuralNetMed, self).__init__()
        # Input projection, three hidden-to-hidden layers, output projection.
        self.fc1 = nn.Linear(input_size, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, hidden_width)
        self.fc4 = nn.Linear(hidden_width, hidden_width)
        self.fc5 = nn.Linear(hidden_width, output_size)

        self.relu = nn.ReLU()
        # Registered on the module but not applied in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the raw output scores for the batch ``x``."""
        activations = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            activations = self.relu(layer(activations))
        return self.fc5(activations)
    
class NeuralNetSmall(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output.

    The output layer applies no activation, so ``forward`` returns raw scores.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.
        hidden_width: Width of both hidden layers (default 32).
    """

    def __init__(self, input_size, output_size, hidden_width=32):
        super(NeuralNetSmall, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, output_size)

        self.relu = nn.ReLU()
        # Registered on the module but not applied in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the raw output scores for the batch ``x``."""
        first_hidden = self.relu(self.fc1(x))
        second_hidden = self.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)
    
    
    
class TabularDataset(Dataset):
    """Map-style dataset over a feature table and a parallel label column.

    Item ``idx`` is the pair ``(features, label)``: the ``idx``-th row of
    ``xs`` (by position) as a 1-D numpy array, and the ``idx``-th entry of
    ``ys`` (by position) collapsed to ``1`` when truthy and ``0`` otherwise.
    """

    def __init__(self, xs, ys):
        """
        Args:
            xs (pandas.DataFrame): Feature rows, addressed by position.
            ys (pandas.Series): Labels, positionally aligned with ``xs``.
        """
        self.xs = xs
        self.ys = ys
        # Convert to arrays once: building a pandas Series for every sample
        # via .iloc is far slower than indexing an array, and a DataLoader
        # calls __getitem__ once per row per epoch.
        self._x_rows = xs.to_numpy()
        self._y_values = ys.to_numpy()

    def __len__(self):
        """Return the number of rows in ``xs``."""
        return len(self.xs)

    def __getitem__(self, idx):
        """Return ``(feature_row, label)`` for position ``idx``."""
        x = self._x_rows[idx]
        y = 1 if self._y_values[idx] else 0
        return (x, y)
    



## NN Large

In [None]:
# K-fold cross-validation of the large network: train a fresh model on each
# training split, score the held-out fold, and append one record to `results`.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select by position on both objects.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]

    batch_size = 1024
    learning_rate = 0.0005
    num_epochs = 25

    train_data = TabularDataset(X_train, y_train)
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True)

    # NOTE(review): the validation loader is built but never iterated in this cell.
    validate_data = TabularDataset(X_test.reset_index(drop=True), y_test.reset_index(drop=True))
    validate_loader = DataLoader(dataset=validate_data,
                                 batch_size=batch_size,
                                 shuffle=False)

    total_step = len(train_loader)

    # Seed torch as well as `random`: weight initialisation and DataLoader
    # shuffling draw from torch's generator, so seeding `random` alone left
    # the run unseeded even though the seed is recorded with the result.
    my_random_seed = RANDOM_SEED
    random.seed(my_random_seed)
    torch.manual_seed(my_random_seed)
    # Size the input layer from the data instead of hard-coding the column count.
    nn_model = NeuralNetLarge(X_train.shape[1], 1).to(cuda_device)

    criterion = nn.BCEWithLogitsLoss().to(cuda_device)
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (xsnn, ysnn) in enumerate(train_loader):
            # Move tensors to the configured device
            xsnn = xsnn.float().to(cuda_device)
            ysnn = ysnn.view(-1, 1).float().to(cuda_device)

            # Forward pass
            outputs = nn_model(xsnn)
            train_loss = criterion(outputs, ysnn)

            # Backward and optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
        # The loss shown is that of the epoch's last mini-batch, not an average.
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
              .format(epoch+1, num_epochs, i+1, total_step, train_loss.item()))

    # Score the held-out fold without tracking gradients; the sigmoid maps
    # the model's raw scores to probabilities.
    nn_model.eval()
    with torch.no_grad():
        test_inputs = torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)
        nn_preds = torch.sigmoid(nn_model(test_inputs)).cpu().numpy()
    nn_preds = nn_preds.reshape(nn_preds.shape[0])
    auc = roc_auc_score(y_test, nn_preds)
    print(auc)
    results.append(mark_result('NN Large', RANDOM_SEED, index, nn_preds, auc))


Currently training 0
Epoch [1/25], Step [29/29], Loss: 0.4147
Epoch [2/25], Step [29/29], Loss: 0.3598
Epoch [3/25], Step [29/29], Loss: 0.3151
Epoch [4/25], Step [29/29], Loss: 0.3301
Epoch [5/25], Step [29/29], Loss: 0.2921
Epoch [6/25], Step [29/29], Loss: 0.3333
Epoch [7/25], Step [29/29], Loss: 0.2719
Epoch [8/25], Step [29/29], Loss: 0.3362
Epoch [9/25], Step [29/29], Loss: 0.3291
Epoch [10/25], Step [29/29], Loss: 0.3572
Epoch [11/25], Step [29/29], Loss: 0.3014
Epoch [12/25], Step [29/29], Loss: 0.2822
Epoch [13/25], Step [29/29], Loss: 0.3126
Epoch [14/25], Step [29/29], Loss: 0.2893
Epoch [15/25], Step [29/29], Loss: 0.2720
Epoch [16/25], Step [29/29], Loss: 0.2834
Epoch [17/25], Step [29/29], Loss: 0.2606
Epoch [18/25], Step [29/29], Loss: 0.2569
Epoch [19/25], Step [29/29], Loss: 0.2402
Epoch [20/25], Step [29/29], Loss: 0.2570
Epoch [21/25], Step [29/29], Loss: 0.2735
Epoch [22/25], Step [29/29], Loss: 0.2709
Epoch [23/25], Step [29/29], Loss: 0.2147
Epoch [24/25], Step [2

Epoch [16/25], Step [29/29], Loss: 0.2765
Epoch [17/25], Step [29/29], Loss: 0.2647
Epoch [18/25], Step [29/29], Loss: 0.2435
Epoch [19/25], Step [29/29], Loss: 0.2837
Epoch [20/25], Step [29/29], Loss: 0.2714


## NN Med

In [None]:
# K-fold cross-validation of the medium network: train a fresh model on each
# training split, score the held-out fold, and append one record to `results`.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select by position on both objects.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]

    batch_size = 1024
    learning_rate = 0.0005
    num_epochs = 10

    train_data = TabularDataset(X_train, y_train)
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True)

    # NOTE(review): the validation loader is built but never iterated in this cell.
    validate_data = TabularDataset(X_test.reset_index(drop=True), y_test.reset_index(drop=True))
    validate_loader = DataLoader(dataset=validate_data,
                                 batch_size=batch_size,
                                 shuffle=False)

    total_step = len(train_loader)

    # Seed torch as well as `random`: weight initialisation and DataLoader
    # shuffling draw from torch's generator, so seeding `random` alone left
    # the run unseeded even though the seed is recorded with the result.
    my_random_seed = RANDOM_SEED
    random.seed(my_random_seed)
    torch.manual_seed(my_random_seed)
    # This cell reports 'NN Med', so it must train the medium architecture
    # (it previously instantiated NeuralNetLarge). The input layer is sized
    # from the data instead of hard-coding the column count.
    nn_model = NeuralNetMed(X_train.shape[1], 1).to(cuda_device)

    criterion = nn.BCEWithLogitsLoss().to(cuda_device)
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (xsnn, ysnn) in enumerate(train_loader):
            # Move tensors to the configured device
            xsnn = xsnn.float().to(cuda_device)
            ysnn = ysnn.view(-1, 1).float().to(cuda_device)

            # Forward pass
            outputs = nn_model(xsnn)
            train_loss = criterion(outputs, ysnn)

            # Backward and optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
        # The loss shown is that of the epoch's last mini-batch, not an average.
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
              .format(epoch+1, num_epochs, i+1, total_step, train_loss.item()))

    # Score the held-out fold without tracking gradients; the sigmoid maps
    # the model's raw scores to probabilities.
    nn_model.eval()
    with torch.no_grad():
        test_inputs = torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)
        nn_preds = torch.sigmoid(nn_model(test_inputs)).cpu().numpy()
    nn_preds = nn_preds.reshape(nn_preds.shape[0])
    auc = roc_auc_score(y_test, nn_preds)
    print(auc)
    results.append(mark_result('NN Med', RANDOM_SEED, index, nn_preds, auc))


## NN Small

In [None]:
# K-fold cross-validation of the small network: train a fresh model on each
# training split, score the held-out fold, and append one record to `results`.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select by position on both objects.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]

    batch_size = 1024
    learning_rate = 0.0005
    num_epochs = 10

    train_data = TabularDataset(X_train, y_train)
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True)

    # NOTE(review): the validation loader is built but never iterated in this cell.
    validate_data = TabularDataset(X_test.reset_index(drop=True), y_test.reset_index(drop=True))
    validate_loader = DataLoader(dataset=validate_data,
                                 batch_size=batch_size,
                                 shuffle=False)

    total_step = len(train_loader)

    # Seed torch as well as `random`: weight initialisation and DataLoader
    # shuffling draw from torch's generator, so seeding `random` alone left
    # the run unseeded even though the seed is recorded with the result.
    my_random_seed = RANDOM_SEED
    random.seed(my_random_seed)
    torch.manual_seed(my_random_seed)
    # Size the input layer from the data instead of hard-coding the column count.
    nn_model = NeuralNetSmall(X_train.shape[1], 1).to(cuda_device)

    criterion = nn.BCEWithLogitsLoss().to(cuda_device)
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for i, (xsnn, ysnn) in enumerate(train_loader):
            # Move tensors to the configured device
            xsnn = xsnn.float().to(cuda_device)
            ysnn = ysnn.view(-1, 1).float().to(cuda_device)

            # Forward pass
            outputs = nn_model(xsnn)
            train_loss = criterion(outputs, ysnn)

            # Backward and optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
        # The loss shown is that of the epoch's last mini-batch, not an average.
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
              .format(epoch+1, num_epochs, i+1, total_step, train_loss.item()))

    # Score the held-out fold without tracking gradients; the sigmoid maps
    # the model's raw scores to probabilities.
    nn_model.eval()
    with torch.no_grad():
        test_inputs = torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)
        nn_preds = torch.sigmoid(nn_model(test_inputs)).cpu().numpy()
    nn_preds = nn_preds.reshape(nn_preds.shape[0])
    auc = roc_auc_score(y_test, nn_preds)
    print(auc)
    results.append(mark_result('NN Small', RANDOM_SEED, index, nn_preds, auc))


In [None]:
# nn_preds = torch.sigmoid( nn_model.forward(torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)).to(device)).detach().cpu().numpy()
# nn_preds = nn_preds.reshape(nn_preds.shape[0])
# auc = roc_auc_score(y_test, nn_preds)
# results.append(mark_result('NN', RANDOM_SEED, index, nn_preds, auc))


In [None]:
# auc

# Random forest

In [None]:
# K-fold cross-validation of a random forest; one result record per fold.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select by position on both objects.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]
    # Instantiate model with 100 decision trees, seeded with the same seed
    # that is recorded in the result.
    rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)
    # Train the model on training data
    rf.fit(X_train, y_train)
    # The regressor's continuous output on the 0/1 label is used directly as
    # the ranking score for the AUC.
    rf_preds = rf.predict(X_test)
    auc = roc_auc_score(y_test, rf_preds)
    results.append(mark_result('Random Forest', RANDOM_SEED, index, rf_preds, auc))

# SVM

In [None]:
# K-fold cross-validation of a support vector classifier; one record per fold.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select by position on both objects.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]

    svm_model = SVC()
    svm_model.fit(X_train, y_train)
    # AUC measures ranking quality, so it needs a continuous score per sample.
    # Hard labels from predict() reduce the ROC curve to a single operating
    # point; the signed distance to the decision boundary is used instead.
    svm_preds = svm_model.decision_function(X_test)
    auc = roc_auc_score(y_test, svm_preds)

    results.append(mark_result('SVM', RANDOM_SEED, index, svm_preds, auc))

# Regression

In [None]:
# K-fold cross-validation of a linear regression fitted on the boolean label;
# its continuous output is scored with AUC and one record is stored per fold.
for index, fold in enumerate(kf.split(xs)):
    train_index, test_index = fold
    print("Currently training {}".format(index))
    X_train = xs.iloc[train_index]
    X_test = xs.iloc[test_index]
    y_train = ys[train_index]
    y_test = ys[test_index]

    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)
    regression_preds = regression_model.predict(X_test)
    auc = roc_auc_score(y_test, regression_preds)

    results.append(
        mark_result(
            model_name='Regression',
            random_seed=RANDOM_SEED,
            cross_fold_index=index,
            predictions=regression_preds,
            AUC_score=auc))

In [None]:
# Make sure the results directory exists before anything is written into it.
results_directory = './../../../data/uci/processed/results/adult'
if not os.path.exists(results_directory):
    os.makedirs(results_directory)
# with open(SAVE_FILE_NAME, 'w') as fp:
#     json.dump(results, fp)

In [None]:
# One row per result record; the record keys become the columns.
res_df = pd.DataFrame(data=results)

In [None]:
# Persist the results table to this run's timestamped CSV file.
res_df.to_csv(path_or_buf=SAVE_FILE_NAME)

In [None]:
# Bare expression: displays the results table as the notebook cell output.
res_df

In [None]:
# Mean and standard deviation of the AUC across folds, per model. Only the
# 'auc' column is aggregated: 'predictions' holds lists, which cannot be
# averaged, and the seed and fold index are not performance measures.
res_df.groupby('modelName')['auc'].aggregate(['mean', 'std'])

In [None]:
# Sanity check: print the row positions that fall into each test fold.
for index, fold in enumerate(kf.split(xs)):
    test_index = fold[1]
    print("Currently training {}".format(index))
    print(test_index)