In [1]:
## PyTorch doesn't fully support MPS devices yet (as of Feb 5, 2024)
## See: https://github.com/pytorch/pytorch/issues/77764#
## Setting this environment variable allows PyTorch to fall back to the CPU
## whenever it encounters an operation that is unsupported on MPS.
## It is set in this first cell, ahead of the cell that imports `torch`.
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

In [2]:
import csv
import torch
import numpy as np

# Importing the Data

In [3]:
## Load in the data
## The CSV is semicolon-delimited and its first row holds the column names,
## so that row is skipped and everything else is parsed as float32
wine_path = "../practice_data/tabular_wine/winequality-white.csv"
wineq_numpy = np.loadtxt(
    fname=wine_path,
    dtype=np.float32,
    delimiter=";",
    skiprows=1,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [5]:
## Verify all of the data was loaded as expected
## Only the header row (the column names) is read from the file.
## The `with` block closes the file handle as soon as that row is read,
## and `newline=""` is what the csv module asks for when given a file object.
with open(wine_path, newline="") as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=";"))
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [6]:
## Wrap the NumPy array in a PyTorch tensor
wineq = torch.from_numpy(wineq_numpy)

## Show the tensor's dimensions and element type
wineq.size(), wineq.dtype

(torch.Size([4898, 12]), torch.float32)

# Organizing the data

In [13]:
## Separate data from target
## The last column is the target; every column before it is a feature
n_features = wineq.shape[1] - 1
data = wineq[:, :n_features]                  ## All rows, every column but the last
target = wineq[:, n_features].to(torch.long)  ## All rows, last column only, as integers

In [14]:
## Display the shapes of the feature tensor and the target tensor
data.shape, target.shape

(torch.Size([4898, 11]), torch.Size([4898]))

We had to decide how we wanted the target to be encoded. Two immediate options were:

* As an integer score (which is what we chose)
* By One-hot encoding scores as vectors

Here, an *integer score* seemed appropriate because it preserves order and distance. A higher score actually means higher quality, and presumably the difference in quality from 2 to 4 should be similar to the difference from 4 to 6. This is in contrast to categorical variables such as, say, "grape type", whose values have no inherent order.

In [16]:
## Here is how one-hot encoding can be achieved with PyTorch tensors

## Start from all zeros: one row per entry in `target`,
## and 10 columns, one per target category (score)
n_wines = target.shape[0]
target_onehot = torch.zeros(n_wines, 10)

## `scatter_` fills values in place (note the trailing underscore):
##   * along dimension 1 (the columns)
##   * at the indices given (here, each wine's score); the index tensor needs
##     the same number of dimensions as `target_onehot`, hence `unsqueeze(1)`
##   * writing the value 1.0 (this could also have been a tensor of elements)
score_index = target.unsqueeze(1)
target_onehot.scatter_(1, score_index, 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [22]:
## Gathering statistics from data
## Reducing over dim 0 (the rows) leaves one value per column
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)
data_mean, data_var

(tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]),
 tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00]))

In [23]:
## Normalizing the data
## Subtract each column's mean and divide by its standard deviation
## (the square root of the variance)
data_std = data_var.sqrt()
data_normalized = (data - data_mean) / data_std
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

# Exploring the Data

In [25]:
## Build a boolean mask that is True wherever the score is 3 or lower
## (the crappy wines)
bad_indices = torch.le(target, 3)
bad_indices.shape, bad_indices.dtype, bad_indices.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [26]:
## bad_indices is a tensor of bools with the same shape as target,
## so it can be used as a mask to pick out the matching rows of data
bad_data = data[bad_indices, :]
bad_data.size()

torch.Size([20, 11])

In [28]:
## Group the data into categories by score:
## bad is <= 3, good is >= 7, mid is everything in between
is_bad = target <= 3
is_good = target >= 7
bad_data = data[is_bad]
mid_data = data[~(is_bad | is_good)]
good_data = data[is_good]

## Gather some statistics (per-column mean of each group)
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

## View the stats, one row per column: index, name, bad / mid / good mean
## `zip` stops at the shortest input, so any column name without a
## matching mean is left out
row_format = '{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'
stat_rows = zip(col_list, bad_mean, mid_mean, good_mean)
for col_idx, (col_name, bad, mid, good) in enumerate(stat_rows):
    print(row_format.format(col_idx, col_name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


At a glance, it looks like total sulfur dioxide could provide a good baseline indicator of wine quality, with lower total sulfur dioxide corresponding to higher-quality wine.

In [30]:
## Pick a threshold value (mean of mid wines seems fine)
total_sulfur_threshhold = 141.83

## Get the total sulfur column data (column index 6)
total_sulfur_data = data[:, 6]

## Predict which wines are high quality: those strictly below the threshold
predicted_indices = total_sulfur_data < total_sulfur_threshhold
predicted_indices.shape, predicted_indices.dtype, predicted_indices.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [34]:
## Compare this to the actual target values:
## a wine counts as good when its score is above 5
actual_indices = torch.gt(target, 5)
actual_indices.shape, actual_indices.dtype, actual_indices.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

It looks like our threshold predicts fewer good-quality wines (2727) than the actual scores indicate (3258), so we can probably improve.

First, though, we can see how many wines our basic threshold prediction gets correct

In [37]:
## Wines that are both predicted good and actually good
n_matches = (actual_indices & predicted_indices).sum().item()
## Total predicted good, and total actually good
n_predicted = predicted_indices.sum().item()
n_actual = actual_indices.sum().item()

## Match count, fraction of predictions that were right (precision),
## and fraction of the good wines that were found (recall)
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

* 2018 of the wines we predicted to be good actually are good
* 74\% of the wines we predicted to be good actually were good
* We correctly identified ~62\% of the wines that are actually good

Not amazing.