# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

In [161]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import wandb
import random


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv
./wandb/debug-internal.log
./wandb/debug-cli.lorenzozanolin.log
./wandb/debug.log
./wandb/run-20231013_152213-1mdmbnvn/run-1mdmbnvn.wandb
./wandb/run-20231013_152213-1mdmbnvn/logs/debug-internal.log
./wandb/run-20231013_152213-1mdmbnvn/logs/debug.log
./wandb/run-20231013_152213-1mdmbnvn/files/requirements.txt
./wandb/run-20231013_152213-1mdmbnvn/files/output.log
./wandb/run-20231013_152213-1mdmbnvn/files/config.yaml
./wandb/run-20231013_152213-1mdmbnvn/files/wandb-metadata.json
./wandb/run-20231013_153847-h2jvr2b3/run-h2jvr2b3.wandb
./wandb/run-20231013_153847-h2jvr2b3/logs/debug-internal.log
./wandb/run-20231013_153847-h2jvr2b3/logs/debug.log
./wandb/run-20231013_153847-h2jvr2b3/files/requirements.txt
./wandb/run-20231013_153847-h2jvr2b3/files/output.log
./wandb/run-20231013_153847-h2jvr2b3/files/config.yaml
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-summary.json
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-metadata.json
./wand

First, we need to import Pytorch

In [162]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
from sklearn.model_selection import train_test_split
#import wandb
#wandb.init(project="simple_regression")

# Data Processing

In [163]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()
train_data.shape

(1599, 12)

We need to separate features from target

In [164]:
n_train = train_data.shape[0]   #rows number
all_features = train_data.iloc[:, 0:-1] #features excluding the last column (quality), which is the label => features are X
#trains_labels   #labels representing the quality of the wine

Then we normalize the values

In [165]:
all_features = all_features.apply(lambda x: (x - x.mean()) / (x.std())) #normalization

train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)   #create the tensor containing the features

trains_labels = train_data.quality.values.reshape(-1, 1)

trains_mean = trains_labels.mean()
trains_std = trains_labels.std()
trains_labels = (trains_labels - trains_mean) / trains_std  #normalization
train_labels = torch.tensor(trains_labels,          #tensor containing the normalized training labels
                            dtype=torch.float32)
#train_labels

Finally, We split the training set in two parts: *training* and *test* set.

In [166]:
X_train,X_test,Y_train,Y_test = train_test_split(train_features,train_labels,test_size=0.25)

## Training

Initialize the weights and bias of the linear regression

In [167]:
n_features = train_features.shape[1] 
weights = torch.randn((n_features),requires_grad=True) # TODO, initialize a random tensor of weights, one weight for each feature
bias = torch.zeros(1, requires_grad=True)
#weights,bias

Some functions definitions

In [168]:
def linreg(X, w, b):        #linear regression
    return torch.matmul(X, w) + b

def sgd(params, lr, batch_size):    #we will use stochastic GD
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
            
criterion = nn.MSELoss()    #the loss function will be mean squared error

def data_iter(batch_size, features, labels):    #to divide a single batch in multiple minibatches
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
        
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

Training loop

In [169]:
# MINI BATCH, calculate for each round the derivative for each minibatch, and then sum up them together
num_iterations = 512
batch_size = 16
lr = 1e-3
print('Training loss:')
for i in range(num_iterations):
    for X, y in data_iter(batch_size, X_train, Y_train):    
        loss = squared_loss(linreg(X,weights,bias),y)    #calculate the prediction, i.e. X(train features) * weights +b; then the loss w.r.t. labels
        loss.sum().backward() #derivate calc
        sgd([weights,bias],lr,batch_size)   #update weights
    with torch.no_grad():
        train_l = squared_loss(linreg(X, weights, bias), y)
        print(f'epoch {i + 1}, loss {float(train_l.mean()):f}')
    

Training loss:
epoch 1, loss 8.186532
epoch 2, loss 7.254434
epoch 3, loss 2.723802
epoch 4, loss 2.149223
epoch 5, loss 2.057572
epoch 6, loss 2.107205
epoch 7, loss 0.983156
epoch 8, loss 2.048052
epoch 9, loss 0.766096
epoch 10, loss 0.970571
epoch 11, loss 0.938635
epoch 12, loss 0.763483
epoch 13, loss 1.084273
epoch 14, loss 0.767156
epoch 15, loss 0.565751
epoch 16, loss 0.801695
epoch 17, loss 0.554372
epoch 18, loss 1.353199
epoch 19, loss 0.247459
epoch 20, loss 0.301668
epoch 21, loss 0.548876
epoch 22, loss 0.466553
epoch 23, loss 0.219659
epoch 24, loss 0.239900
epoch 25, loss 0.543891
epoch 26, loss 0.361447
epoch 27, loss 0.664256
epoch 28, loss 0.303511
epoch 29, loss 0.656653
epoch 30, loss 0.486389
epoch 31, loss 0.611940
epoch 32, loss 0.425996
epoch 33, loss 0.542093
epoch 34, loss 0.311715
epoch 35, loss 0.444710
epoch 36, loss 0.607627
epoch 37, loss 0.583015
epoch 38, loss 0.347301
epoch 39, loss 0.461741
epoch 40, loss 0.185194
epoch 41, loss 0.369931
epoch 42, 

Get the real predictions

In [170]:
predictions = linreg(train_features,weights,bias)
predictions = torch.floor((predictions * trains_std) + trains_mean)  #denormalization + floor approximation

print(predictions)

tensor([5., 5., 5.,  ..., 5., 5., 6.], grad_fn=<FloorBackward0>)


## Testing

We need to calculate the loss also over the test set

In [172]:
# test on the test dataset
with torch.no_grad():
    test_l = 0
    for X, y in data_iter(batch_size, X_test, Y_test):
        test_l += squared_loss(linreg(X, weights, bias), y).sum()
    test_l /= len(X_test)
    print(f'loss on the test dataset {float(test_l):f}')

loss on the test dataset 0.321159
