# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

## Preliminaries

In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics
from sklearn.preprocessing import StandardScaler    #per normalizzare i valori
#import wandb
import random
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv
./wandb/debug-internal.log
./wandb/debug-cli.lorenzozanolin.log
./wandb/debug.log
./wandb/run-20231013_152213-1mdmbnvn/run-1mdmbnvn.wandb
./wandb/run-20231013_152213-1mdmbnvn/logs/debug-internal.log
./wandb/run-20231013_152213-1mdmbnvn/logs/debug.log
./wandb/run-20231013_152213-1mdmbnvn/files/requirements.txt
./wandb/run-20231013_152213-1mdmbnvn/files/output.log
./wandb/run-20231013_152213-1mdmbnvn/files/config.yaml
./wandb/run-20231013_152213-1mdmbnvn/files/wandb-metadata.json
./wandb/run-20231013_153847-h2jvr2b3/run-h2jvr2b3.wandb
./wandb/run-20231013_153847-h2jvr2b3/logs/debug-internal.log
./wandb/run-20231013_153847-h2jvr2b3/logs/debug.log
./wandb/run-20231013_153847-h2jvr2b3/files/requirements.txt
./wandb/run-20231013_153847-h2jvr2b3/files/output.log
./wandb/run-20231013_153847-h2jvr2b3/files/config.yaml
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-summary.json
./wandb/run-20231013_153847-h2jvr2b3/files/wandb-metadata.json
./wand

First, we need to import Pytorch

In [13]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
from sklearn.model_selection import train_test_split
#import wandb
#wandb.init(project="simple_regression")

## Data Processing

In [14]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()
train_data.shape

(1599, 12)

First we will split the training set in two parts: *training* and *test* set.

In [15]:

train_set,test_set = train_test_split(train_data.iloc[:, 0:].values,test_size=0.25) #splitting into training set and test set
print(train_set)


[[10.    0.41  0.45 ...  0.49 11.8   7.  ]
 [ 8.8   0.4   0.4  ...  0.64  9.2   5.  ]
 [ 7.3   0.39  0.31 ...  0.54  9.4   6.  ]
 ...
 [ 8.    0.42  0.32 ...  1.07  9.7   5.  ]
 [ 6.4   0.38  0.14 ...  0.65 11.1   6.  ]
 [ 8.    0.33  0.53 ...  0.8   9.6   6.  ]]


Now we will *normalize* features in the following manner:
- mean and std will be computed on the *training* set
- these values will be used to compute the normalization on the *test* set

In [16]:
def normalize_features(train,test):
    global scaler 
    scaler = StandardScaler()
    scaler.fit(train) # Addestra lo scaler sul training set (ad esempio, X_train è il tuo training set)
    # Applica la normalizzazione al training set
    X_train_norm = scaler.transform(train)
    # Applica la stessa normalizzazione al test set (ad esempio, X_test è il tuo test set)
    X_test_norm = scaler.transform(test)
    return X_train_norm,X_test_norm

train_norm,test_norm = normalize_features(train_set,test_set)   #normalizing the dataset

X_train,Y_train = torch.tensor(train_norm[:,:-1],dtype=torch.float32),torch.tensor(train_norm[:,-1],dtype=torch.float32)  #splitting the labels and the features and creating tensors with the corresponding values
X_test,Y_test = torch.tensor(test_norm[:,:-1],dtype=torch.float32),torch.tensor(test_norm[:,-1],dtype=torch.float32)


## Training

Initialize the weights and bias of the linear regression

In [17]:
n_features = X_train.shape[1] 
weights = torch.randn((n_features),requires_grad=True) # TODO, initialize a random tensor of weights, one weight for each feature
bias = torch.zeros(1, requires_grad=True)
#weights,bias

Some functions definitions

In [18]:
from matplotlib import pyplot as plt


def linreg(X, w, b):        #linear regression
    return torch.matmul(X, w) + b

def sgd(params, lr, batch_size):    #we will use stochastic GD
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
            
criterion = nn.MSELoss()    #the loss function will be mean squared error

def data_iter(batch_size, features, labels):    #to divide a single batch in multiple minibatches
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
        
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

def print_result(y_true, y_pred):
    #codice copiato
    plt.clf()
    plt.plot(y_pred, 'ro', label='Predictions', alpha=0.5)
    plt.plot(y_true, 'go', label='True', alpha=0.5)
    plt.legend(loc='best')
    plt.colorbar
    plt.show()
    
def print_correctness(ground_truth,predicted_labels):
    print(metrics.accuracy_score(ground_truth,predicted_labels))

Training loop

In [19]:
# MINI BATCH, calculate for each round the derivative for each minibatch, and then sum up them together
num_iterations = 512
batch_size = 16
lr = 1e-3
print('Training loss:')
for i in range(num_iterations):
    for X, y in data_iter(batch_size, X_train, Y_train):    
        loss = squared_loss(linreg(X,weights,bias),y)    #calculate the prediction, i.e. X(train features) * weights +b; then the loss w.r.t. labels
        loss.sum().backward() #derivate calc
        sgd([weights,bias],lr,batch_size)   #update weights
    with torch.no_grad():
        train_l = squared_loss(linreg(X, weights, bias), y) #loss of the final batch of the round
        print(f'epoch {i + 1}, loss {float(train_l.mean()):f}')
    

Training loss:
epoch 1, loss 6.919366
epoch 2, loss 3.197985
epoch 3, loss 5.024555
epoch 4, loss 2.554493
epoch 5, loss 2.989530
epoch 6, loss 4.629636
epoch 7, loss 1.899886
epoch 8, loss 1.285695
epoch 9, loss 1.388697
epoch 10, loss 2.043438
epoch 11, loss 1.066327
epoch 12, loss 1.135208
epoch 13, loss 1.304777
epoch 14, loss 0.892195
epoch 15, loss 1.098586
epoch 16, loss 0.689742
epoch 17, loss 0.657923
epoch 18, loss 0.595400
epoch 19, loss 0.517152
epoch 20, loss 0.624688
epoch 21, loss 1.476747
epoch 22, loss 0.661613
epoch 23, loss 2.177165
epoch 24, loss 0.440144
epoch 25, loss 1.089461
epoch 26, loss 0.681250
epoch 27, loss 0.744791
epoch 28, loss 0.541611
epoch 29, loss 0.494340
epoch 30, loss 1.766611
epoch 31, loss 0.354510
epoch 32, loss 0.595038
epoch 33, loss 0.860371
epoch 34, loss 0.297888
epoch 35, loss 0.435006
epoch 36, loss 0.447534
epoch 37, loss 0.709865
epoch 38, loss 0.316069
epoch 39, loss 0.285196
epoch 40, loss 0.288906
epoch 41, loss 0.319191
epoch 42, 

## Testing

We need to calculate the loss also over the test set

In [20]:
# test on the test dataset
with torch.no_grad():
    test_l = 0
    for X, y in data_iter(batch_size, X_test, Y_test):
        test_l += squared_loss(linreg(X, weights, bias), y).sum()
    test_l /= len(X_test)
    print(f'loss on the test dataset {float(test_l):f}')

loss on the test dataset 0.337425


## Results

Finally, the obtained predictions are the following

In [43]:
def denormalize_features(train,test):
    global scaler
    # Denormalizza il training set
    X_train_denormalized = scaler.inverse_transform(train)
    X_test_denormalized = scaler.inverse_transform(test)
    return X_train_denormalized,X_test_denormalized

denorm_train,denorm_test = denormalize_features(torch.cat((X_train,Y_train.reshape(-1, 1)),1),torch.cat((X_test,Y_test.reshape(-1, 1)),1))

train_predictions = torch.round(linreg(torch.tensor(denorm_train[:,:-1],dtype=torch.float32),weights,bias))
print(train_predictions)
test_predictions = torch.round(linreg(torch.tensor(denorm_test[:,:-1],dtype=torch.float32),weights,bias))


with torch.no_grad():
    print_correctness(Y_test,test_predictions)
    
    #print_result(torch.round((train_labels * trains_std) + trains_mean),torch.round((train_predictions,test_predictions),0))

#print(train_predictions,test_predictions)

tensor([  4.,  -2.,  -2.,  ..., -11.,   2.,  -6.], grad_fn=<RoundBackward0>)


ValueError: Classification metrics can't handle a mix of continuous and multiclass targets