In [30]:
import csv
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [2]:
## Read the wine-quality table: semicolon-separated values, header row skipped
wine_path = "../practice_data/tabular_wine/winequality-white.csv"
wineq_numpy = np.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=np.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [3]:
## Verify all of the data was loaded as expected
## Only the header row is read; the context manager closes the file handle
## (the csv docs recommend newline="" for files handed to csv.reader)
with open(wine_path, newline="") as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=";"))
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [4]:
## Load data into a PyTorch tensor
## (wrap the NumPy array, then display the resulting shape and dtype as a check)
wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [5]:
## Split the table into inputs and labels
data = wineq[..., :-1]                  ## every column except the last
target = wineq[..., -1].to(torch.long)  ## last column only, cast to integers

In [20]:
## Display the integer labels taken from the last column of the table
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [15]:
## Boolean row masks for the three quality bands (labels are integers)
is_bad = target <= 3
is_good = target >= 7
is_mid = ~(is_bad | is_good)  ## i.e. strictly between 3 and 7

## Select the feature rows that belong to each band
bad_data = data[is_bad]
mid_data = data[is_mid]
good_data = data[is_good]

## Per-column mean of every band (reduce over the row dimension)
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

## One output row per column: index, column name, bad / mid / good mean
for i, (col_name, bad, mid, good) in enumerate(
        zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, col_name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [42]:
def prepare_data(data, target, val_pct, generator=None):
    """Shuffle the samples and split them into training and validation sets.

    Args:
        data: Feature tensor with one row per sample (dim 0 indexes samples).
        target: 1-D tensor of labels, one entry per row of ``data``.
        val_pct: Fraction of the samples, between 0 and 1 inclusive, held out
            for validation. The validation count is truncated toward zero.
        generator: Optional ``torch.Generator`` driving the shuffle, so a
            split can be reproduced. ``None`` uses the global RNG.

    Returns:
        The tuple ``(x, y, x_train, x_val, xn_train, xn_val, y_train, y_val)``.
        ``x`` is ``data`` unchanged, ``y`` is ``target`` as a float32 column
        vector, and the ``xn_*`` tensors are the normalized counterparts of
        the ``x_*`` tensors (same rows, same order).

    Raises:
        ValueError: If ``val_pct`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_pct <= 1.0:
        raise ValueError(f"val_pct must be between 0 and 1, got {val_pct}")

    x = data
    y = target.unsqueeze(1).to(torch.float32)
    ## Normalize the data
    ## NOTE(review): normalize(dim=1) rescales each row (sample) to unit norm
    ## rather than standardizing each feature column -- confirm this is intended
    xn = torch.nn.functional.normalize(x, dim=1)

    ## Split up training/validation sets
    n_samples = x.shape[0]
    n_val = int(n_samples * val_pct)
    ## Use an explicit boundary: slicing with -n_val breaks when n_val == 0
    ## ([:-0] is empty and [-0:] is everything, which inverts the split)
    n_train = n_samples - n_val
    shuffled_indices = torch.randperm(n_samples, generator=generator)
    train_indices = shuffled_indices[:n_train]
    val_indices = shuffled_indices[n_train:]

    x_train = x[train_indices]
    x_val = x[val_indices]

    xn_train = xn[train_indices]
    xn_val = xn[val_indices]

    y_train = y[train_indices]
    y_val = y[val_indices]

    return x, y, x_train, x_val, xn_train, xn_val, y_train, y_val

## Hold out 20% of the samples for validation
(x, y,
 x_train, x_val,
 xn_train, xn_val,
 y_train, y_val) = prepare_data(data, target, val_pct=.20)

In [43]:
## Check the sizes of the normalized training/validation inputs and their targets
xn_train.shape, xn_val.shape, y_train.shape, y_val.shape

(torch.Size([3919, 11]),
 torch.Size([979, 11]),
 torch.Size([3919, 1]),
 torch.Size([979, 1]))

In [69]:
## Fully connected network: 11 inputs -> 100 -> 50 -> 25 -> 1 output,
## with a ReLU after every layer except the last
hidden_stack = []
for n_in, n_out in ((11, 100), (100, 50), (50, 25)):
    hidden_stack += [nn.Linear(n_in, n_out), nn.ReLU()]
seq_model = nn.Sequential(*hidden_stack, nn.Linear(25, 1))

## SGD optimizer over all of the model's parameters
optimizer = optim.SGD(params=seq_model.parameters(), lr=1e-1)

In [70]:
def training_loop(n_epochs,
                  optimizer, model, loss_fn, 
                  x_train, x_val,
                  y_train, y_val):
    """Train ``model`` on the whole training set once per epoch.

    Each epoch runs one forward/backward pass over all of ``x_train``,
    takes one optimizer step, and evaluates the loss on ``x_val``. Losses
    are printed for the first five epochs and then every 500th epoch.

    Args:
        n_epochs: Number of epochs to run (epochs are numbered from 1).
        optimizer: Optimizer whose ``zero_grad``/``step`` update the model.
        model: Callable mapping an input batch to predictions.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets (never used for updates).

    Returns:
        None. The model is updated in place through ``optimizer``.
    """
    for epoch in range(1, n_epochs+1):
        ## Training set
        p_train = model(x_train) ## Make a prediction
        loss_train = loss_fn(p_train, y_train) ## Compute the loss

        ## Validation set: monitoring only, so skip building an autograd graph
        with torch.no_grad():
            p_val = model(x_val) ## Make a prediction
            loss_val = loss_fn(p_val, y_val) ## Compute the loss

        ## Update params
        optimizer.zero_grad() ## Reset accumulated gradient
        loss_train.backward() ## Compute the gradient of the training loss
        optimizer.step() ## Step the params in the direction of the gradient

        if (epoch <= 5) or (epoch % 500 == 0):
            print(f"Epoch: {epoch}, Training loss: {loss_train.item():.4f},"
                  f"Validation loss: {loss_val.item():.4f}")

In [71]:
## Train for 10000 epochs on the normalized inputs with a mean-squared-error loss
training_loop(
    10000,             ## n_epochs
    optimizer,
    seq_model,         ## model
    nn.MSELoss(),      ## loss_fn
    xn_train, xn_val,  ## x_train, x_val
    y_train, y_val,
)

Epoch: 1, Training loss: 36.6307,Validation loss: 36.7676
Epoch: 2, Training loss: 18.0938,Validation loss: 18.1817
Epoch: 3, Training loss: 4.5081,Validation loss: 4.4488
Epoch: 4, Training loss: 19.6316,Validation loss: 19.7244
Epoch: 5, Training loss: 11.2469,Validation loss: 11.3100
Epoch: 500, Training loss: 0.7677,Validation loss: 0.7549
Epoch: 1000, Training loss: 0.7548,Validation loss: 0.7430
Epoch: 1500, Training loss: 0.7418,Validation loss: 0.7342
Epoch: 2000, Training loss: 0.7297,Validation loss: 0.7256
Epoch: 2500, Training loss: 0.7127,Validation loss: 0.7069
Epoch: 3000, Training loss: 0.7005,Validation loss: 0.6923
Epoch: 3500, Training loss: 0.6877,Validation loss: 0.6779
Epoch: 4000, Training loss: 0.6782,Validation loss: 0.6682
Epoch: 4500, Training loss: 0.6769,Validation loss: 0.6659
Epoch: 5000, Training loss: 0.6656,Validation loss: 0.6539
Epoch: 5500, Training loss: 0.6674,Validation loss: 0.6533
Epoch: 6000, Training loss: 0.6584,Validation loss: 0.6468
Epoch

In [73]:
## Predict on the normalized validation inputs. These predictions are only
## used for evaluation, so no autograd graph is built for them.
with torch.no_grad():
    pred = seq_model(xn_val)

In [86]:
## Number of validation samples predicted within 0.75 of the actual value
abs_err = torch.abs(y_val.detach() - pred.detach())
y_val[abs_err < .75].shape

torch.Size([663])

In [83]:
## Shape of the validation targets
y_val.shape

torch.Size([979, 1])

In [93]:
## Count the validation samples whose actual value is above 6
y_val[y_val>6].shape

torch.Size([218])

In [94]:
## Count the validation samples whose predicted value is above 6
y_val[pred>6].shape

torch.Size([311])

In [97]:
## Boolean masks: validation samples whose actual / predicted value is above 5
## NOTE(review): the grouping cell earlier in the notebook treats quality >= 7
## as "good"; confirm that the lower cutoff of 5 used here is intentional
actual_good = (y_val > 5)
pred_good = (pred > 5)

In [99]:
## Counts for the "good" class
n_actual = actual_good.sum().item()                 ## actually good
n_predicted = pred_good.sum().item()                ## predicted good
n_matches = (actual_good & pred_good).sum().item()  ## both (true positives)

## (true positives, precision, recall)
n_matches, n_matches/n_predicted, n_matches/n_actual

(642, 0.6701461377870563, 0.9892141756548536)

In [None]:
## Predict for every sample. The model was trained on the normalized inputs
## (xn_train), so the raw features in x get the same row-wise normalization
## that prepare_data applies before they are passed to the model.
with torch.no_grad():  ## inference only; no autograd graph needed
    full_pred = seq_model(torch.nn.functional.normalize(x, dim=1))