In [None]:
%matplotlib notebook

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import scipy.signal
import sklearn.model_selection
import pandas as pd
import numpy as np
import util

# 3.1

Load dataset using pandas.

In [None]:
# Single seed reused wherever this notebook passes `seed` (sampling, splitting, torch).
seed = 0
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Read the dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv('data/adult.csv')

# 3.2
Print out the dataset to see what it looks like.

In [None]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
data

Check out how many are high income and how many are low income.

In [None]:
# Number of rows for each distinct value of the 'income' column.
data['income'].value_counts()

# 3.3
Remove rows with a '?'

In [None]:
# Discard every row in which at least one column equals the '?' placeholder.
data = data.loc[~(data == '?').any(axis=1)]
data.shape

# 3.4
Balance the dataset by dropping extra low income earners.

In [None]:
# How many more '<=50K' rows than '>50K' rows there are.
income_counts = data['income'].value_counts()
count_difference = income_counts['<=50K'] - income_counts['>50K']

# Draw that many rows with weight 1 on '<=50K' rows and weight 0 on all others,
# so only '<=50K' rows can be selected, then remove them by index label.
data = data.drop(
    data['income'].sample(
        count_difference,
        random_state=seed,
        weights=(data['income'] == '<=50K'),
    ).index
)

In [None]:
# Re-count the 'income' classes to check the result of the balancing step.
data['income'].value_counts()

In [None]:
# Summary statistics of the DataFrame (pandas describes the numeric columns by default).
data.describe()

# 3.5

Workclass pie chart

In [None]:
# `util` is a project-local module not shown here; presumably this plots the
# share of each 'workclass' value -- confirm in util.py.
util.pie_chart(data, 'workclass')

Education pie chart

In [None]:
# Same project helper, applied to the 'education' column (behaviour defined in util.py).
util.pie_chart(data, 'education')

Marital status pie chart

In [None]:
# Same project helper, applied to the 'marital-status' column (behaviour defined in util.py).
util.pie_chart(data, 'marital-status')

In [None]:
# Race pie chart: same project helper, applied to the 'race' column (behaviour defined in util.py).
util.pie_chart(data, 'race')

Workclass binary bar chart

In [None]:
# `util` is a project-local module not shown here; presumably this draws, per
# 'workclass' value, one bar for each income class -- confirm in util.py.
util.binary_bar_chart(data, 'workclass')

Education binary bar chart

In [None]:
# Same project helper, applied to the 'education' column (behaviour defined in util.py).
util.binary_bar_chart(data, 'education')

Marital status binary bar chart

In [None]:
# Same project helper, applied to the 'marital-status' column (behaviour defined in util.py).
util.binary_bar_chart(data, 'marital-status')

Race binary bar chart

In [None]:
# Same project helper, applied to the 'race' column (behaviour defined in util.py).
util.binary_bar_chart(data, 'race')

In [None]:
# Relationship binary bar chart: same project helper, applied to the
# 'relationship' column (behaviour defined in util.py).
util.binary_bar_chart(data, 'relationship')

# 3.6

Separate continuous and categorical variables.

In [None]:
# Columns treated as categorical; 'income' is included here and later serves as the label.
categorical_feats = [
    'workclass', 'race', 'education', 'marital-status', 'occupation',
    'relationship', 'gender', 'native-country', 'income',
]

# Categorical columns are selected by name; every remaining column is treated as continuous.
cat_data_raw = data[categorical_feats]
cts_data_raw = data.drop(columns=categorical_feats)

Normalize the continuous data (zero mean, unit standard deviation per column) and convert it to a NumPy array.

In [None]:
# Z-score every continuous column: subtract the column mean and divide by the
# column standard deviation (pandas' .std() is the sample std, ddof=1), then
# take the underlying NumPy array.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# train/test split performed later in this notebook.
cts_data = ((cts_data_raw - cts_data_raw.mean()) / cts_data_raw.std()).values

Convert categorical data to one hot encoding. We use the label encoder for the income, and we drop the income column from the one hot encoding transform.

In [None]:
# Encode the target column as numeric class ids stored as float32.
label_encoder = LabelEncoder()
# Read the target from `data` (the frame `cat_data_raw` was sliced from) so that
# this cell still works when re-run after 'income' has been removed below.
labels = label_encoder.fit_transform(data['income']).astype(np.float32)

# Remove the target from the categorical features without mutating in place.
# errors='ignore' keeps the cell idempotent: the previous `del cat_data_raw['income']`
# raised KeyError on a second execution.
cat_data_raw = cat_data_raw.drop(columns='income', errors='ignore')
one_hot_encoder = OneHotEncoder()
# fit_transform returns a sparse matrix by default; .toarray() makes it dense.
cat_data = one_hot_encoder.fit_transform(cat_data_raw).toarray()

Concatenate the two arrays

In [None]:
# Join the feature blocks column-wise (one-hot columns first, then the
# normalized continuous columns) and cast everything to float32.
data_np = np.concatenate((cat_data, cts_data), axis=1).astype(np.float32)
# (rows, total number of feature columns)
data_np.shape

In [None]:
# Show the categorical feature frame; once the encoding cell has run it no
# longer contains the 'income' column.
cat_data_raw

# 3.7

Split the data using `train_test_split`

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data_np,
    labels,
    test_size=0.2,
    random_state=seed,
)
print(X_train.shape, X_test.shape)

# 4.1

Implement the Dataset

In [None]:
class AdultDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, measured on the feature container."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

# 4.2

Create training and validation DataLoaders

In [None]:
def load_data(batch_size):
    """Build the training and validation DataLoaders.

    The datasets wrap the notebook-level arrays ``X_train``/``y_train`` and
    ``X_test``/``y_test``.

    Args:
        batch_size: number of samples per batch for both loaders.

    Returns:
        A ``(train_loader, val_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    train_dataset = AdultDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)

    val_dataset = AdultDataset(X_test, y_test)
    # Accuracy does not depend on sample order, so validation is not shuffled.
    # Shuffling here would also consume random numbers from torch's global
    # generator on every validation pass, coupling the training shuffle order
    # to how often validation runs.
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle=False)
    return train_loader, val_loader

# 4.3

Create MLP model in PyTorch.

In [None]:
class MultiLayerPerceptron(nn.Module):
    """Two-layer network: Linear(input_size, 64) -> ReLU -> Linear(64, 1) -> Sigmoid."""

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_units, 1)
        self.activation2 = nn.Sigmoid()

    def forward(self, features):
        """Map a batch of feature rows to one sigmoid output per row."""
        hidden = self.activation1(self.fc1(features))
        return self.activation2(self.fc2(hidden))

# 4.4

Instantiate model, with loss function and optimizer.

In [None]:
def load_model(model_class, lr):
    """Create the model together with its loss function and optimizer.

    Args:
        model_class: class called with the number of feature columns of the
            notebook-level ``data_np`` array to build the model.
        lr: learning rate handed to the SGD optimizer.

    Returns:
        A ``(model, loss_fn, optimizer)`` tuple; the model has been moved to
        the notebook-level ``device``.
    """
    n_features = data_np.shape[1]
    net = model_class(n_features).to(device)
    return net, nn.MSELoss(), torch.optim.SGD(net.parameters(), lr)

# 4.5

Create main function, housing the training loop.

In [None]:
def main(epochs, lr, batch_size, eval_every, model_class):
    """Train a model with mini-batch SGD and record accuracy curves.

    Args:
        epochs: number of passes over the training loader.
        lr: learning rate forwarded to ``load_model``.
        batch_size: batch size forwarded to ``load_data``.
        eval_every: number of gradient steps between two validation runs; also
            the number of most recent batches averaged for the training curve.
        model_class: model class forwarded to ``load_model``.

    Returns:
        A ``(train_acc, val_acc)`` tuple of lists. ``train_acc`` holds one
        running-mean accuracy per gradient step, ``val_acc`` one accuracy per
        validation run.
    """
    model, loss_fn, optimizer = load_model(model_class, lr)
    train_loader, val_loader = load_data(batch_size)

    train_acc_buffer = []  # per-batch accuracies of the most recent steps
    train_acc = []
    val_acc = []

    batch = 0  # gradient steps since the last validation run (carries over epochs)
    for i in range(epochs):
        # Local names differ from the notebook-level `data` / `labels` on purpose,
        # so the loop cannot be confused with (or shadow) those globals.
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # evaluate() switches the model to eval mode, so re-enable training
            # mode before every step.
            model.train()
            optimizer.zero_grad()
            predictions = model(inputs).squeeze(1)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()

            correct = ((predictions > 0.5) == targets).sum().item()
            train_acc_buffer.append(correct / inputs.size(0))
            # Trim before averaging so the window covers at most `eval_every`
            # batches (trimming afterwards averaged over eval_every + 1). The
            # max() keeps at least the current batch in the window.
            if len(train_acc_buffer) > max(eval_every, 1):
                del train_acc_buffer[0]
            train_acc.append(np.mean(train_acc_buffer))

            batch += 1
            if batch == eval_every:
                val_acc.append(evaluate(model, val_loader))
                batch = 0

        # Either list can still be empty here (e.g. when eval_every exceeds the
        # number of steps taken so far); report NaN instead of raising IndexError.
        latest_train = train_acc[-1] if train_acc else float('nan')
        latest_val = val_acc[-1] if val_acc else float('nan')
        print('{}: train_acc = {:.4f}\t val_acc = {:.4f}'.format(i+1, latest_train, latest_val))

    return train_acc, val_acc

# 4.6

Define accuracy function

In [None]:
def evaluate(model, val_loader):
    """Return the fraction of samples in ``val_loader`` that ``model`` gets right.

    The model output is squeezed along dim 1 and counted as a positive
    prediction when it is greater than 0.5. The model is switched to eval mode
    and is left in that mode when the function returns.

    Args:
        model: the network to score.
        val_loader: iterable of ``(data, labels)`` batches that exposes the
            underlying dataset as ``val_loader.dataset``.

    Returns:
        Number of correct predictions divided by ``len(val_loader.dataset)``.
    """
    # Score on the device that holds the model's weights; the notebook-level
    # `device` is only used for a model that has no parameters at all.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    total_corr = 0
    model.eval()
    # Scoring needs no gradients, so avoid building the autograd graph.
    with torch.no_grad():
        for data, labels in val_loader:
            data = data.to(target_device)
            labels = labels.to(target_device)

            predictions = model(data).squeeze(1) > 0.5
            total_corr += (predictions == labels).sum().item()
    return total_corr / len(val_loader.dataset)

Create a convenience function for plotting accuracy

In [None]:
def plot_accuracies(train_acc, val_acc, time=None, filtering=True):
    """Plot the training and validation accuracy curves.

    Reads the notebook-level ``hyperparameters`` dict (plot title, smoothing
    window, spacing of the validation points) and ``data_np`` (smoothing
    window).

    Args:
        train_acc: training accuracies, one per gradient step.
        val_acc: validation accuracies, one per ``eval_every`` gradient steps.
        time: total training time in seconds. When given, the x-axis shows
            time (assuming evenly spaced steps) instead of gradient steps.
        filtering: when True, smooth the training curve with a Savitzky-Golay
            filter of polynomial order 2.
    """
    n_train = len(train_acc)

    if filtering:
        # Twice the number of batches in the full dataset, plus one so the
        # window length is odd.
        window_length = (data_np.shape[0] // hyperparameters['batch_size']) * 2 + 1
        # savgol_filter rejects a window longer than the signal, so clamp it to
        # the largest odd length that fits (matters for short training runs).
        largest_odd = n_train if n_train % 2 == 1 else n_train - 1
        window_length = min(window_length, largest_odd)
        # polyorder=2 needs a window of at least 3 points; otherwise plot raw values.
        if window_length > 2:
            train_acc = scipy.signal.savgol_filter(train_acc, window_length, polyorder=2)

    plt.title('Accuracy curves of {}\n {} epochs, lr={}, batch size={}'.format(
        hyperparameters['model_class'].__name__,
        hyperparameters['epochs'],
        hyperparameters['lr'],
        hyperparameters['batch_size']
    ))
    plt.ylabel('accuracy')

    # main() records the k-th validation score (k = 0, 1, ...) after
    # (k + 1) * eval_every gradient steps, so the first point is not at step 0.
    val_steps = (np.arange(len(val_acc)) + 1) * hyperparameters['eval_every']

    if time is not None:
        plt.xlabel('time [seconds]')
        train_range = np.linspace(0, time, n_train)
        # Convert step counts to seconds with the average time per step.
        val_range = val_steps * (time / max(n_train, 1))
    else:
        plt.xlabel('gradient steps')
        train_range = np.arange(n_train)
        val_range = val_steps

    plt.plot(train_range, train_acc, label='training')
    plt.plot(val_range, val_acc, label='validation')
    plt.legend()
    plt.show()

# 5.4

Define `SmallModel` with only one layer.

In [None]:
class SmallModel(nn.Module):
    """Single linear layer followed by a sigmoid."""

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1)
        self.activation = nn.Sigmoid()

    def forward(self, features):
        """Map a batch of feature rows to one sigmoid output per row."""
        return self.activation(self.fc1(features))

# 5.5

Define `LargeModel` with 4 layers.

In [None]:
class LargeModel(nn.Module):
    """Four linear layers: three 64-unit ReLU layers, then a sigmoid output unit."""

    def __init__(self, input_size):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(input_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, 1)
        self.activation = nn.ReLU()
        self.activation2 = nn.Sigmoid()

    def forward(self, features):
        """Map a batch of feature rows to one sigmoid output per row."""
        hidden = features
        # The three hidden layers share the same activation.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.activation(layer(hidden))
        return self.activation2(self.fc4(hidden))

# 5.6

Define `LargeTanhModel` using `LargeModel` with `tanh` rather than `ReLU`.

In [None]:
class LargeTanhModel(nn.Module):
    """Four linear layers: three 64-unit tanh layers, then a sigmoid output unit."""

    def __init__(self, input_size):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(input_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, 1)
        self.activation = nn.Tanh()
        self.activation2 = nn.Sigmoid()

    def forward(self, features):
        """Map a batch of feature rows to one sigmoid output per row."""
        hidden = features
        # The three hidden layers share the same activation.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.activation(layer(hidden))
        return self.activation2(self.fc4(hidden))

Define `LargeSigmoidModel` using `LargeModel` with `Sigmoid` rather than `ReLU`.

In [None]:
class LargeSigmoidModel(nn.Module):
    """Four linear layers: three 64-unit sigmoid layers, then a sigmoid output unit."""

    def __init__(self, input_size):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(input_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, 1)
        self.activation = nn.Sigmoid()
        self.activation2 = nn.Sigmoid()

    def forward(self, features):
        """Map a batch of feature rows to one sigmoid output per row."""
        hidden = features
        # The three hidden layers share the same activation.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.activation(layer(hidden))
        return self.activation2(self.fc4(hidden))

In [None]:
%%time
# Training configuration. The keys must match the parameter names of main(),
# because the dict is unpacked into it below; plot_accuracies() reads this
# dict as a global as well.
hyperparameters = { # Change these hyperparameters and run the cell to train
    'lr': 5e-2,
    'batch_size': 32,
    'epochs': 75,
    'eval_every': 64,
    'model_class': MultiLayerPerceptron
}

# Seed torch's random number generator, request deterministic cuDNN
# algorithms and turn off the cuDNN auto-tuner.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Train, then plot both accuracy curves.
train_acc, val_acc = main(**hyperparameters)

plot_accuracies(train_acc, val_acc)
# Best validation accuracy over all validation runs.
print('max accuracy obtained', np.max(val_acc))