# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

In [73]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import wandb
import random


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv
./wandb/debug-internal.log
./wandb/debug-cli.lorenzozanolin.log
./wandb/debug.log
./wandb/run-20231013_152213-1mdmbnvn/run-1mdmbnvn.wandb
./wandb/run-20231013_152213-1mdmbnvn/logs/debug-internal.log
./wandb/run-20231013_152213-1mdmbnvn/logs/debug.log
./wandb/run-20231013_152213-1mdmbnvn/files/requirements.txt
./wandb/run-20231013_152213-1mdmbnvn/files/output.log
./wandb/run-20231013_152213-1mdmbnvn/files/config.yaml
./wandb/run-20231013_152213-1mdmbnvn/files/wandb-metadata.json
./wandb/run-20231013_153847-h2jvr2b3/run-h2jvr2b3.wandb
./wandb/run-20231013_153847-h2jvr2b3/logs/debug-internal.log
./wandb/run-20231013_153847-h2jvr2b3/logs/debug.log
./wandb/run-20231013_153847-h2jvr2b3/files/requirements.txt
./wandb/run-20231013_153847-h2jvr2b3/files/output.log
./wandb/run-20231013_153847-h2jvr2b3/files/config.yaml
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-summary.json
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-metadata.json
./wand

First, we need to import Pytorch

In [74]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
import wandb
wandb.init(project="simple_regression")



# Data Processing

In [75]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()
train_data.shape

(1599, 12)

We need to separate features from target

In [76]:
n_train = train_data.shape[0]   #rows number
all_features = train_data.iloc[:, 0:-1] #features excluding the first column (row index) and the last column (quality), which is the label => features are X
all_features = all_features.apply(lambda x: (x - x.mean()) / (x.std())) #normalization
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)   #create the tensor containing the features

train_features.shape


torch.Size([1599, 11])

In [77]:
trains_labels = train_data.quality.values.reshape(-1, 1)
trains_labels   #labels representing the quality of the wine

array([[5],
       [5],
       [5],
       ...,
       [6],
       [5],
       [6]])

In [78]:
trains_mean = trains_labels.mean()
trains_std = trains_labels.std()
trains_labels = (trains_labels - trains_mean) / trains_std  #normalization
train_labels = torch.tensor(trains_labels,          #tensor containing the normalized training labels
                            dtype=torch.float32)
train_labels

tensor([[-0.7878],
        [-0.7878],
        [-0.7878],
        ...,
        [ 0.4508],
        [-0.7878],
        [ 0.4508]])

## Training

Initialize the weights and bias of the linear regression

In [83]:
n_features = train_features.shape[1] 
weights = torch.randn((n_features),requires_grad=True) # TODO, initialize a random tensor of weights, one weight for each feature
bias = torch.zeros(1, requires_grad=True)
weights,bias

(tensor([ 1.0756, -0.0727,  0.8611, -0.7896, -1.2702, -1.8445,  0.7174, -1.6390,
          1.6959, -0.6130, -0.7053], requires_grad=True),
 tensor([0.], requires_grad=True))

Some functions definitions

In [88]:
def linreg(X, w, b):        #linear regression
    return torch.matmul(X, w) + b

def sgd(params, lr, batch_size):    #we will use stochastic GD
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
            
criterion = nn.MSELoss()    #the loss function will be mean squared error

def data_iter(batch_size, features, labels):    #to divide a single batch in multiple minibatches
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
        
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

Training loop

In [89]:
# MINI BATCH, calculate for each round the derivative for each minibatch, and then sum up them together

num_iterations = 512
batch_size = 16
lr = 1e-3

for i in range(num_iterations):
    for X, y in data_iter(batch_size, train_features, train_labels):    
        predictions = squared_loss(linreg(X,weights,bias),y)    #calculate the prediction, i.e. X(train features) * weights +b; then the loss w.r.t. labels
        predictions.sum().backward() #derivate calc
        sgd([weights,bias],lr,batch_size)
    with torch.no_grad():
        train_l = squared_loss(linreg(X, weights, bias), y)
        print(f'epoch {i + 1}, loss {float(train_l.mean()):f}')
    

epoch 1, loss 2.467809
epoch 2, loss 1.616591
epoch 3, loss 1.183097
epoch 4, loss 3.642277
epoch 5, loss 1.223997
epoch 6, loss 1.139056
epoch 7, loss 1.244240
epoch 8, loss 2.207870
epoch 9, loss 1.452906
epoch 10, loss 0.670021
epoch 11, loss 1.043539
epoch 12, loss 1.275869
epoch 13, loss 0.883948
epoch 14, loss 1.025992
epoch 15, loss 0.719447
epoch 16, loss 0.753684
epoch 17, loss 0.785309
epoch 18, loss 1.231531
epoch 19, loss 0.610319
epoch 20, loss 0.702089
epoch 21, loss 0.348262
epoch 22, loss 0.692109
epoch 23, loss 0.504640
epoch 24, loss 0.558261
epoch 25, loss 0.475146
epoch 26, loss 0.395076
epoch 27, loss 0.832110
epoch 28, loss 0.555774
epoch 29, loss 0.632138
epoch 30, loss 0.547659
epoch 31, loss 1.378829
epoch 32, loss 0.503376
epoch 33, loss 0.381123
epoch 34, loss 0.619662
epoch 35, loss 0.269801
epoch 36, loss 0.456109
epoch 37, loss 0.277859
epoch 38, loss 0.539562
epoch 39, loss 0.845437
epoch 40, loss 0.834585
epoch 41, loss 0.324753
epoch 42, loss 0.203017
e

Get the real predictions

In [90]:
predictions = (predictions * trains_std) + trains_mean  #denormalization 
predictions #resulting labels

tensor([6.1017, 5.7241, 6.0344, 5.6368, 5.6717, 5.6719, 6.2113, 6.3774, 5.7180,
        5.6739, 5.7465, 6.1013, 6.2913, 5.7514, 7.2777],
       grad_fn=<AddBackward0>)

Our predictions seem very close to the ground truth!

**To go further**: Stochastic Gradient Descent is not the optimal algorithm in terms of convergeance.
If you are curious, you can read this nice article about an improvement to SGD, momentum and try to implement it: https://distill.pub/2017/momentum/