# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import wandb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv
./wandb/debug-internal.log
./wandb/debug.log
./wandb/run-20231013_152213-1mdmbnvn/run-1mdmbnvn.wandb
./wandb/run-20231013_152213-1mdmbnvn/logs/debug-internal.log
./wandb/run-20231013_152213-1mdmbnvn/logs/debug.log
./wandb/run-20231013_152213-1mdmbnvn/files/requirements.txt
./wandb/run-20231013_152213-1mdmbnvn/files/output.log
./wandb/run-20231013_152213-1mdmbnvn/files/config.yaml
./wandb/run-20231013_152213-1mdmbnvn/files/wandb-metadata.json


First, we need to import Pytorch

In [5]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
import wandb
wandb.init(project="simple_regression")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlorenzozanolin-52[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Data Processing

In [15]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()
train_data.shape

(1599, 12)

We need to separate features from target

In [38]:
n_train = train_data.shape[0]   #rows number
all_features = train_data.iloc[:, 0:-1] #features excluding the first column (row index) and the last column (quality), which is the label => features are X
all_features = all_features.apply(lambda x: (x - x.mean()) / (x.std())) #normalization
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)   #create the tensor containing the features

train_features.shape


torch.Size([1599, 11])

In [40]:
trains_labels = train_data.quality.values.reshape(-1, 1)
trains_labels   #labels representing the quality of the wine

array([[5],
       [5],
       [5],
       ...,
       [6],
       [5],
       [6]])

In [9]:
trains_mean = trains_labels.mean()
trains_std = trains_labels.std()
trains_labels = (trains_labels - trains_mean) / trains_std  #normalization
train_labels = torch.tensor(trains_labels,          #tensor containing the normalized training labels
                            dtype=torch.float32)
train_labels

tensor([[-0.7878],
        [-0.7878],
        [-0.7878],
        ...,
        [ 0.4508],
        [-0.7878],
        [ 0.4508]])

## Training

Initialize the weight of the linear regression

In [44]:
n_features = train_features.shape[1] #-2
linear_weights = torch.randn((n_features),requires_grad=True) # TODO, initialize a random tensor of weights, one weight for each feature
linear_weights

tensor([ 1.6282, -1.6004,  1.6385, -1.0293, -0.5036, -0.0814,  0.9035, -1.2022,
         2.7352,  0.6013, -1.1299], requires_grad=True)

Training loop

In [45]:
num_iterations = 512
lr = 1e-3
loss = nn.MSELoss
for i in range(num_iterations):
    predictions = torch.matmul(train_features,linear_weights)    #calculate the prediction, i.e. X(train features) * weights 
    loss(predictions,train_labels)   #loss calculation
    loss.backward() #derivate calc
    
    with torch.no_grad:
        linear_weights-=lr*linear_weights.grad  #backprop
    linear_weights.requires_grad = True
    
    

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

Get the real predictions

In [None]:
predictions = (predictions * trains_std) + trains_mean  #denormalization 
predictions #resulting labels

Our predictions seem very close to the ground truth!

**To go further**: Stochastic Gradient Descent is not the optimal algorithm in terms of convergeance.
If you are curious, you can read this nice article about an improvement to SGD, momentum and try to implement it: https://distill.pub/2017/momentum/