# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

In [118]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import wandb
import random


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv
./wandb/debug-internal.log
./wandb/debug-cli.lorenzozanolin.log
./wandb/debug.log
./wandb/run-20231013_152213-1mdmbnvn/run-1mdmbnvn.wandb
./wandb/run-20231013_152213-1mdmbnvn/logs/debug-internal.log
./wandb/run-20231013_152213-1mdmbnvn/logs/debug.log
./wandb/run-20231013_152213-1mdmbnvn/files/requirements.txt
./wandb/run-20231013_152213-1mdmbnvn/files/output.log
./wandb/run-20231013_152213-1mdmbnvn/files/config.yaml
./wandb/run-20231013_152213-1mdmbnvn/files/wandb-metadata.json
./wandb/run-20231013_153847-h2jvr2b3/run-h2jvr2b3.wandb
./wandb/run-20231013_153847-h2jvr2b3/logs/debug-internal.log
./wandb/run-20231013_153847-h2jvr2b3/logs/debug.log
./wandb/run-20231013_153847-h2jvr2b3/files/requirements.txt
./wandb/run-20231013_153847-h2jvr2b3/files/output.log
./wandb/run-20231013_153847-h2jvr2b3/files/config.yaml
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-summary.json
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-metadata.json
./wand

First, we need to import Pytorch

In [119]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
import wandb
wandb.init(project="simple_regression")



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168903700005709, max=1.0…

# Data Processing

In [120]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()
train_data.shape

(1599, 12)

We need to separate features from target

In [121]:
n_train = train_data.shape[0]   #rows number
all_features = train_data.iloc[:, 0:-1] #features excluding the first column (row index) and the last column (quality), which is the label => features are X
all_features = all_features.apply(lambda x: (x - x.mean()) / (x.std())) #normalization
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)   #create the tensor containing the features

train_features.shape


torch.Size([1599, 11])

In [122]:
trains_labels = train_data.quality.values.reshape(-1, 1)
trains_labels   #labels representing the quality of the wine

array([[5],
       [5],
       [5],
       ...,
       [6],
       [5],
       [6]])

In [123]:
trains_mean = trains_labels.mean()
trains_std = trains_labels.std()
trains_labels = (trains_labels - trains_mean) / trains_std  #normalization
train_labels = torch.tensor(trains_labels,          #tensor containing the normalized training labels
                            dtype=torch.float32)
train_labels

tensor([[-0.7878],
        [-0.7878],
        [-0.7878],
        ...,
        [ 0.4508],
        [-0.7878],
        [ 0.4508]])

## Training

Initialize the weights and bias of the linear regression

In [124]:
n_features = train_features.shape[1] 
weights = torch.randn((n_features),requires_grad=True) # TODO, initialize a random tensor of weights, one weight for each feature
bias = torch.zeros(1, requires_grad=True)
weights,bias

(tensor([-1.2975,  1.6544, -0.1506,  0.7650, -0.6935,  0.3108,  0.1193, -0.3888,
         -0.0314,  2.5950, -1.3929], requires_grad=True),
 tensor([0.], requires_grad=True))

Some functions definitions

In [125]:
def linreg(X, w, b):        #linear regression
    return torch.matmul(X, w) + b

def sgd(params, lr, batch_size):    #we will use stochastic GD
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
            
criterion = nn.MSELoss()    #the loss function will be mean squared error

def data_iter(batch_size, features, labels):    #to divide a single batch in multiple minibatches
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
        
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

Training loop

In [126]:
# MINI BATCH, calculate for each round the derivative for each minibatch, and then sum up them together

num_iterations = 512
batch_size = 16
lr = 1e-3

for i in range(num_iterations):
    for X, y in data_iter(batch_size, train_features, train_labels):    
        predictions = squared_loss(linreg(X,weights,bias),y)    #calculate the prediction, i.e. X(train features) * weights +b; then the loss w.r.t. labels
        predictions.sum().backward() #derivate calc
        sgd([weights,bias],lr,batch_size)   #update weights
    with torch.no_grad():
        train_l = squared_loss(linreg(X, weights, bias), y)
        print(f'epoch {i + 1}, loss {float(train_l.mean()):f}')
    

epoch 1, loss 1.268603
epoch 2, loss 6.247308
epoch 3, loss 2.489125
epoch 4, loss 2.019293
epoch 5, loss 1.756763
epoch 6, loss 1.440218
epoch 7, loss 2.183013
epoch 8, loss 1.670986
epoch 9, loss 1.688256
epoch 10, loss 0.870260
epoch 11, loss 0.775894
epoch 12, loss 0.310049
epoch 13, loss 0.574066
epoch 14, loss 0.883134
epoch 15, loss 1.128127
epoch 16, loss 0.418630
epoch 17, loss 0.561461
epoch 18, loss 0.840668
epoch 19, loss 0.642460
epoch 20, loss 0.562327
epoch 21, loss 0.425234
epoch 22, loss 0.750776
epoch 23, loss 0.418276
epoch 24, loss 2.508132
epoch 25, loss 0.251213
epoch 26, loss 0.293227
epoch 27, loss 0.504301
epoch 28, loss 0.330711
epoch 29, loss 0.335549
epoch 30, loss 0.596530
epoch 31, loss 0.351218
epoch 32, loss 0.356759
epoch 33, loss 0.291588
epoch 34, loss 0.330009
epoch 35, loss 0.331947
epoch 36, loss 0.688176
epoch 37, loss 0.402600
epoch 38, loss 0.405288
epoch 39, loss 0.156581
epoch 40, loss 0.565883
epoch 41, loss 0.441222
epoch 42, loss 0.526309
e

Get the real predictions

In [129]:
predictions = (predictions * trains_std) + trains_mean  #denormalization 
print(predictions.shape)
torch.floor(predictions) #resulting labels

torch.Size([15])


tensor([13., 13., 14., 13., 13., 13., 13., 13., 13., 14., 13., 14., 13., 14.,
        14.], grad_fn=<FloorBackward0>)

Our predictions seem very close to the ground truth!

**To go further**: Stochastic Gradient Descent is not the optimal algorithm in terms of convergeance.
If you are curious, you can read this nice article about an improvement to SGD, momentum and try to implement it: https://distill.pub/2017/momentum/