To identify the appropriate datasets, a subset of candidate datasets with appropriate characteristics will be chosen. The draft characteristics are:
* Over 10,000 data points
* Binary classification problem
* Over 10 data fields in the dataset

For each dataset, the following protocol will be applied:
* Divide the dataset into 10 equal, randomly allocated folds
* For each fold:
	* Train the model with the fold being the test set
		* N.B. optionally, with the other 9 folds, one can be explicitly held out as a validation step for hyper-parameter management
	* Record the AUC (with full details)
	* Record secondary performance measures


# Include libraries

Get general systems

In [1]:
from __future__ import print_function
import os
import random
import time
import json
from datetime import datetime
from copy import deepcopy
import gzip
try:
    import cPickle as pickle  # pylint: disable=import-error
except ImportError:
    import pickle  # pylint: disable=import-error    

Data management and plotting

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

PyTorch

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda:1" if USE_CUDA else "cpu")
cuda_device = torch.device("cuda:1")

assert device.type == 'cuda'

sklearn stuff

In [4]:
from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

sklearn models

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

# Set general parameters

In [6]:
RANDOM_SEED      = 42
NUMBER_OF_SPLITS = 10
SAVE_FILE_NAME   = './../../../data/uci/processed/results/adult/results_{}.csv'.format(datetime.now())

# Get the dataset

The dataset is Adult UCI

In [7]:
adult_columns = [
    "age", 
    "workclass",
    "fnlwgt", 
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "gt50k"]
y_cols = 'gt50k'


data = pd.read_csv('./../../../data/uci/processed/data/adult/adult.data',
                   names=adult_columns,
                  index_col=False)

Split into Xs and ys

In [8]:
x_cols = data.columns.values.tolist()
x_cols.remove(y_cols)

xs_raw = data[x_cols]
ys_raw = data[y_cols]

In [9]:
categorical_feature_mask = xs_raw.dtypes==object
numerical_feature_mask = xs_raw.dtypes=="int64"

categorical_cols = xs_raw.columns[categorical_feature_mask].tolist()
numerical_cols = xs_raw.columns[numerical_feature_mask].tolist()

scaler = StandardScaler()
xs = xs_raw.copy()

# OHE categoricals
onehotencoded = pd.get_dummies(xs_raw[categorical_cols])
xs[onehotencoded.columns] = onehotencoded
xs = xs.drop(categorical_cols, axis=1)

## Linear scaling
numericals = xs_raw[numerical_cols].values #returns a numpy array
scaler = StandardScaler()
numericals = scaler.fit_transform(xs_raw[numerical_cols].values)
xs[numerical_cols] = pd.DataFrame(numericals)


####

## Adjust outcome var
ys = data['gt50k'] == ' >50K'

# Results structure

Results are going to have:
* What type of model
* What random seed
* What cross fold
* What performance measures

In [10]:
def mark_result(model_name, random_seed, cross_fold_index, predictions, AUC_score):
    return {
        'modelName': model_name,
        'randomSeed': random_seed,
        'crossFoldIndex': cross_fold_index,
        'predictions': list(predictions),
        'auc':AUC_score
    }

# Cross validation

In [11]:
kf = KFold(n_splits = NUMBER_OF_SPLITS, 
           random_state = RANDOM_SEED,
          shuffle=True)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.15, random_state=42)

In [13]:
results = []

# Neural Network

In [14]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_width=128):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_width) 
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, hidden_width)
        self.fc4 = nn.Linear(hidden_width, hidden_width)
        self.fc5 = nn.Linear(hidden_width, hidden_width)
        self.fc6 = nn.Linear(hidden_width, output_size)  
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)
        out = self.relu(out)
        out = self.fc5(out)
        out = self.relu(out)
        out = self.fc6(out)
        return out
class TabularDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, xs, ys):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.xs = xs
        self.ys = ys
        
    def __len__(self):
        return len(self.xs)

    def __getitem__(self, idx):
        x = self.xs.iloc[idx].to_numpy()
        y = 1 if self.ys.iloc[idx] else 0
        return (x, y)
    



In [15]:
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys[train_index], ys[test_index]
    
    batch_size = 1024
    learning_rate = 0.0005
    num_epochs = 10

    train_data = TabularDataset(X_train, y_train)
    train_loader = DataLoader(train_data, 
                               batch_size=batch_size, 
                               shuffle=True)

    validate_data = TabularDataset(X_test.reset_index(drop=True), y_test.reset_index(drop=True))
    validate_loader = DataLoader(dataset = validate_data,
                                 batch_size=batch_size, 
                                 shuffle=False)

    total_step = len(train_loader)

    my_random_seed = 42
    random.seed(my_random_seed)
    nn_model = NeuralNet(108, 1).to(cuda_device)

    criterion = nn.BCEWithLogitsLoss().to(cuda_device)
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=learning_rate)  
    for epoch in range(num_epochs):
        for i, (xsnn, ysnn) in enumerate(train_loader):  
            # Move tensors to the configured device
            xsnn = xsnn.float().to(cuda_device)
            ysnn = ysnn.view(-1, 1).float().to(cuda_device)

            # Forward pass
            outputs = nn_model(xsnn)
            train_loss = criterion(outputs, ysnn)


            # Backward and optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
        if (epoch+1) % 1 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, train_loss.item()))

            
    nn_preds = torch.sigmoid( nn_model.forward(torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)).to(device)).detach().cpu().numpy()
    nn_preds = nn_preds.reshape(nn_preds.shape[0])
    auc = roc_auc_score(y_test, nn_preds)
    print(auc)
    results.append(mark_result('NN', RANDOM_SEED, index, nn_preds, auc))


Currently training 0
Epoch [1/10], Step [29/29], Loss: 0.5393
Epoch [2/10], Step [29/29], Loss: 0.3511
Epoch [3/10], Step [29/29], Loss: 0.3365
Epoch [4/10], Step [29/29], Loss: 0.2726
Epoch [5/10], Step [29/29], Loss: 0.3091
Epoch [6/10], Step [29/29], Loss: 0.2711
Epoch [7/10], Step [29/29], Loss: 0.2737
Epoch [8/10], Step [29/29], Loss: 0.3165
Epoch [9/10], Step [29/29], Loss: 0.2864
Epoch [10/10], Step [29/29], Loss: 0.3264
0.9212603748571614
Currently training 1
Epoch [1/10], Step [29/29], Loss: 0.5104
Epoch [2/10], Step [29/29], Loss: 0.3733
Epoch [3/10], Step [29/29], Loss: 0.3155
Epoch [4/10], Step [29/29], Loss: 0.2965
Epoch [5/10], Step [29/29], Loss: 0.2799
Epoch [6/10], Step [29/29], Loss: 0.3170
Epoch [7/10], Step [29/29], Loss: 0.3297
Epoch [8/10], Step [29/29], Loss: 0.2800
Epoch [9/10], Step [29/29], Loss: 0.3157
Epoch [10/10], Step [29/29], Loss: 0.2996
0.9099884025869547
Currently training 2
Epoch [1/10], Step [29/29], Loss: 0.5471
Epoch [2/10], Step [29/29], Loss: 0.

In [16]:
# nn_preds = torch.sigmoid( nn_model.forward(torch.from_numpy(X_test.to_numpy()).float().to(cuda_device)).to(device)).detach().cpu().numpy()
# nn_preds = nn_preds.reshape(nn_preds.shape[0])
# auc = roc_auc_score(y_test, nn_preds)
# results.append(mark_result('NN', RANDOM_SEED, index, nn_preds, auc))


In [17]:
# auc

# Random forest

In [18]:
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys[train_index], ys[test_index]
    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    # Train the model on training data
    rf.fit(X_train, y_train)
    # Use the forest's predict method on the test data
    rf_preds = rf.predict(X_test)
    # Calculate the absolute errors
    errors = abs(rf_preds - y_test)
    auc = roc_auc_score(y_test, rf_preds)
    results.append(mark_result('Random Forest', RANDOM_SEED, index, rf_preds, auc))

Currently training 0
Currently training 1
Currently training 2
Currently training 3
Currently training 4
Currently training 5
Currently training 6
Currently training 7
Currently training 8
Currently training 9


# SVM

In [19]:
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys[train_index], ys[test_index]
    
    svm_model=SVC()
    svm_model.fit(X_train, y_train)
    svm_preds=svm_model.predict(X_test)
    auc = roc_auc_score(y_test, svm_preds)

    results.append(mark_result('SVM', RANDOM_SEED, index, svm_preds, auc))

Currently training 0
Currently training 1
Currently training 2
Currently training 3
Currently training 4
Currently training 5
Currently training 6
Currently training 7
Currently training 8
Currently training 9


# Regression

In [20]:
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys[train_index], ys[test_index]
    
    regression_model=LinearRegression()
    regression_model.fit(X_train, y_train)
    regression_preds=regression_model.predict(X_test)
    auc = roc_auc_score(y_test, regression_preds)

    results.append(mark_result('Regression', RANDOM_SEED, index, regression_preds, auc))

Currently training 0
Currently training 1
Currently training 2
Currently training 3
Currently training 4
Currently training 5
Currently training 6
Currently training 7
Currently training 8
Currently training 9


In [21]:
if not os.path.exists('./../../../data/uci/processed/results/adult'):
    os.makedirs('./../../../data/uci/processed/results/adult')
# with open(SAVE_FILE_NAME, 'w') as fp:
#     json.dump(results, fp)

TypeError: Object of type 'float32' is not JSON serializable

In [22]:
res_df = pd.DataFrame(results)

In [23]:
res_df.to_csv(SAVE_FILE_NAME)

In [24]:
res_df

Unnamed: 0,modelName,randomSeed,crossFoldIndex,predictions,auc
0,NN,42,0,"[0.0010204839, 0.008771328, 0.3439927, 0.51231...",0.92126
1,NN,42,1,"[0.9502995, 0.039839633, 0.11851892, 0.0078300...",0.909988
2,NN,42,2,"[0.07941426, 0.6614282, 0.3906917, 0.007087681...",0.903147
3,NN,42,3,"[0.002846464, 0.58251214, 0.50455076, 0.481285...",0.911537
4,NN,42,4,"[0.09773602, 0.24416648, 0.22646031, 0.0692157...",0.913628
5,NN,42,5,"[0.9039755, 0.29497188, 0.022892816, 0.8812705...",0.913598
6,NN,42,6,"[0.016023064, 0.67839915, 0.0015975306, 0.0011...",0.912556
7,NN,42,7,"[0.3468296, 0.29618916, 0.33501324, 0.01263161...",0.916549
8,NN,42,8,"[0.0013912838, 0.03490331, 0.6030983, 0.048054...",0.916428
9,NN,42,9,"[0.92544305, 0.31013364, 0.060395785, 0.626230...",0.91508


In [25]:
res_df.groupby('modelName').aggregate(['mean', 'std'])

Unnamed: 0_level_0,randomSeed,randomSeed,crossFoldIndex,crossFoldIndex,auc,auc
Unnamed: 0_level_1,mean,std,mean,std,mean,std
modelName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
NN,42,0.0,4.5,3.02765,0.913377,0.004769
Random Forest,42,0.0,4.5,3.02765,0.904013,0.006286
Regression,42,0.0,4.5,3.02765,0.892746,0.003855
SVM,42,0.0,4.5,3.02765,0.764931,0.009382


In [28]:
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    print(test_index)

Currently training 0
[    6    34    46 ... 32552 32556 32558]
Currently training 1
[    8    30    33 ... 32544 32546 32560]
Currently training 2
[    3     4     7 ... 32540 32541 32547]
Currently training 3
[   31    41    42 ... 32516 32518 32549]
Currently training 4
[    0    19    47 ... 32501 32532 32539]
Currently training 5
[    5    14    15 ... 32545 32551 32554]
Currently training 6
[    2    10    12 ... 32536 32553 32555]
Currently training 7
[    1    18    48 ... 32506 32508 32525]
Currently training 8
[   16    24    25 ... 32523 32530 32548]
Currently training 9
[    9    11    13 ... 32538 32557 32559]
