In [None]:
from copy import deepcopy

# for DL modeling
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# for number-crunching
import numpy as np
import scipy.stats as stats

# for dataset management
import pandas as pd

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import and process the data

In [None]:
# import the data (semicolon-separated CSV from the UCI repository)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=';')

# drop a few outliers; the explicit .copy() makes the filtered frame an
# independent object, so the column assignment below cannot raise pandas'
# SettingWithCopyWarning
data = data[data['total sulfur dioxide'] < 200].copy()

# z-score all columns except for quality (quality is the prediction target)
cols2zscore = data.columns.drop('quality')
data[cols2zscore] = data[cols2zscore].apply(stats.zscore)

# Binarize good and bad wines according to a given quality threshold

In [None]:
def create_binarized_column_boolean_quality(data, quality_threshold):
    """Add a 0/1 ``boolQuality`` column flagging wines rated above a threshold.

    The frame is modified in place and the same object is returned.

    Args:
        data: DataFrame with a numeric ``quality`` column.
        quality_threshold: wines with ``quality`` strictly greater than this
            value are labelled 1, all others 0.

    Returns:
        The input ``data`` frame, now carrying the ``boolQuality`` column.
    """
    is_above_threshold = data['quality'].gt(quality_threshold)
    data['boolQuality'] = is_above_threshold.astype(int)
    return data


# Re-organize the data: train/test in DataLoaders

In [None]:
def create_dataloaders(data, test_size=.1, batchsize=8):
    """Split a binarized wine frame into train/test PyTorch DataLoaders.

    Features are the columns listed in the module-level ``cols2zscore``;
    labels come from the ``boolQuality`` column.

    Args:
        data: DataFrame containing the ``cols2zscore`` columns and
            ``boolQuality``.
        test_size: fraction (or count) of samples held out for testing,
            forwarded to ``train_test_split``. Defaults to 0.1.
        batchsize: mini-batch size of the training loader. Defaults to 8.

    Returns:
        ``(train_loader, test_loader)``. The training loader shuffles and
        drops the last incomplete batch; the test loader yields the whole
        test set as a single batch.
    """
    # convert from pandas dataframe to tensor
    features = torch.tensor(data[cols2zscore].values).float()
    labels = torch.tensor(data['boolQuality'].values).float()
    labels = labels[:, None]  # column vector, one label per row

    # use scikitlearn to split the data (random split, not seeded here)
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=test_size)

    # then convert them into PyTorch Datasets (note: already converted to tensors)
    train_set = torch.utils.data.TensorDataset(train_x, train_y)
    test_set = torch.utils.data.TensorDataset(test_x, test_y)

    # finally, translate into dataloader objects
    train_loader = DataLoader(train_set,
                              batch_size=batchsize,
                              shuffle=True,
                              drop_last=True)
    # a single batch holding every test sample
    test_loader = DataLoader(test_set, batch_size=len(test_set))
    return train_loader, test_loader

# Now for the DL part

In [None]:
# create a class for the model
class ANNwine(nn.Module):
    """Fully connected binary classifier for 11-feature wine samples.

    Layer sizes are 11 -> 16 -> 32 -> 32 -> 1. A LeakyReLU follows every
    layer except the last, which returns a raw logit (no sigmoid applied).
    """

    def __init__(self):
        super().__init__()

        ### input layer
        self.input = nn.Linear(11, 16)

        ### hidden layers
        self.fc1 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 32)

        ### output layer
        self.output = nn.Linear(32, 1)

        ### activation function
        # built once here rather than looked up and re-instantiated on every
        # forward call; LeakyReLU has no parameters or buffers, so the
        # state_dict keys are unaffected
        self.actfun = nn.LeakyReLU()

    # forward pass
    def forward(self, x):
        """Map a ``(batch, 11)`` input to ``(batch, 1)`` logits."""
        x = self.actfun(self.input(x))
        x = self.actfun(self.fc1(x))
        x = self.actfun(self.fc2(x))
        return self.output(x)

In [None]:
# sanity check: run a random batch (10 samples x 11 features) through an
# untrained model and display the shape of what comes out
net = ANNwine()
random_batch = torch.randn(10, 11)
net(random_batch).shape

# Train the model

In [None]:
# global parameter: number of training epochs, read by trainTheModel
numepochs = 500


def trainTheModel(winenet, train_loader, test_loader, epochs=None):
    """Train ``winenet`` as a binary classifier and record per-epoch metrics.

    Uses ``BCEWithLogitsLoss`` and Adam (lr=0.001). A prediction counts as
    class 1 when the model's logit is greater than 0.

    Args:
        winenet: the model to train; updated in place.
        train_loader: iterable of ``(X, y)`` training mini-batches.
        test_loader: iterable of ``(X, y)`` test batches; only its first
            batch is evaluated each epoch.
        epochs: number of epochs to run. When ``None`` (default) the
            module-level ``numepochs`` is used.

    Returns:
        ``(trainAcc, testAcc, losses)``: two lists of per-epoch accuracies
        in percent (training value is the mean over mini-batches) and a 1-D
        tensor with the mean mini-batch loss of every epoch.
    """
    n_epochs = numepochs if epochs is None else epochs

    # loss function and optimizer
    lossfun = nn.BCEWithLogitsLoss()
    # optimize the model that was passed in; binding the optimizer to any
    # other network would leave `winenet` without weight updates and without
    # gradient resets
    optimizer = torch.optim.Adam(winenet.parameters(), lr=.001)

    # initialize losses
    losses = torch.zeros(n_epochs)
    trainAcc = []
    testAcc = []

    # loop over epochs
    for epochi in range(n_epochs):

        # turn on training mode
        winenet.train()

        # loop over training data batches
        batchAcc = []
        batchLoss = []
        for X, y in train_loader:

            # forward pass and loss
            yHat = winenet(X)
            loss = lossfun(yHat, y)

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss from this batch
            batchLoss.append(loss.item())

            # compute training accuracy for this batch
            batchAcc.append(100 * torch.mean(((yHat > 0) == y).float()).item())
        # end of batch loop...

        # now that we've trained through the batches, get their average training accuracy
        trainAcc.append(np.mean(batchAcc))

        # and get average losses across the batches
        losses[epochi] = np.mean(batchLoss)

        # test accuracy
        winenet.eval()
        X, y = next(iter(test_loader))  # extract X,y from test dataloader
        with torch.no_grad():  # deactivates autograd
            yHat = winenet(X)
        testAcc.append(100 * torch.mean(((yHat > 0) == y).float()).item())

    # function output
    return trainAcc, testAcc, losses

In [None]:
quality_thresholds = [4.5, 5.5, 6.5]

# result containers; every dict is keyed by quality threshold (despite the
# `_by_activation` suffix on two of the names)
train_acc_by_activation = {}
test_acc_by_activation = {}
losses = {}
per_quality_accuracy = {}

for threshold in quality_thresholds:
    # binarize a private copy so the shared `data` frame is left untouched
    data_for_threshold = create_binarized_column_boolean_quality(
        deepcopy(data), threshold)
    train_loader, test_loader = create_dataloaders(data_for_threshold)

    # create a model and train it
    winenet = ANNwine()
    (train_acc_by_activation[threshold],
     test_acc_by_activation[threshold],
     losses[threshold]) = trainTheModel(winenet, train_loader, test_loader)

    # compute accuracy per quality type on the (single-batch) test set;
    # eval mode + no_grad because this is inference only
    winenet.eval()
    X, y = next(iter(test_loader))
    with torch.no_grad():
        itemAccuracy = ((winenet(X) > 0) == y).float()

    # store plain floats: [accuracy on label 0 ("bad"), accuracy on label 1 ("good")]
    per_quality_accuracy[threshold] = [
        100 * torch.mean(itemAccuracy[y == 0]).item(),
        100 * torch.mean(itemAccuracy[y == 1]).item()
    ]


In [None]:
# plot some results: one row per quality threshold, with columns
# [loss curve, train/test accuracy curves, per-class test accuracy]
fig, ax = plt.subplots(len(quality_thresholds), 3, figsize=(20, 16),
                       squeeze=False)

for row, threshold in enumerate(quality_thresholds):
    loss_ax, acc_ax, bar_ax = ax[row]

    # training loss
    loss_ax.plot(losses[threshold])
    loss_ax.set_title('Losses')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.grid()

    # train/test accuracy
    acc_ax.plot(train_acc_by_activation[threshold])
    acc_ax.plot(test_acc_by_activation[threshold])
    acc_ax.set_title('Accuracy')
    acc_ax.legend(["Train", "Test"])
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.set_ylim([0, 100])
    acc_ax.grid()

    # plot the per-quality accuracy
    bh = bar_ax.bar(['Bad', 'Good'], per_quality_accuracy[threshold])
    bar_ax.set_ylim([0, 100])
    bar_ax.set_xlabel('Wine quality')
    bar_ax.set_ylabel('Test accuracy')
    bar_ax.set_title('Per-qual acc. with qualthresh ' + str(threshold + .5))

    # print the counts on top of each bar.
    # Counts are taken over the whole dataset for THIS threshold: at this
    # point `train_loader` only holds the split of the last threshold that
    # was trained, so it cannot describe the other rows.
    is_good = data['quality'] > threshold
    class_counts = [int((~is_good).sum()), int(is_good.sum())]
    # separate loop names so the subplot row index is not overwritten
    for bar, count in zip(bh, class_counts):
        bar_ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() + 1,
                    'N=%s' % count,
                    ha='center',
                    va='bottom',
                    fontsize=14)

plt.show()