# Red Wine Quality

Kaggle link: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

## Preliminaries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics
import random
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import torch
from torch import nn
from sklearn.model_selection import train_test_split
#import wandb
#wandb.init(project="simple_regression")

import os
for dirname, _, filenames in os.walk('./'): # '/kaggle/input'
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./simple-regression.ipynb
./winequality-red.csv


## Data Processing

In [10]:
train_data = pd.read_csv('./winequality-red.csv')    #'/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
train_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


First we will split the training set in two parts: *training* and *test* set.

In [3]:

X_train,X_test,y_train,y_test = train_test_split(train_data.iloc[:,:-1],train_data.iloc[:,-1],test_size=0.25) #splitting into training set and test set

#convert the sets from ndarray to tensor
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)


Now we will *normalize* features in the following manner:
- mean and std will be computed on the *training* set
- these values will be used to compute the normalization on the *test* set

In [4]:
def normalize_features(X_train, X_test, y_train, y_test):
    global train_mean, train_std, pred_mean, pred_std

    train_mean = [0,0,0,0,0,0,0,0,0,0,0] 
    train_std =[0,0,0,0,0,0,0,0,0,0,0]
    
    
    for i in range(0, X_train.shape[1]-1):
        train_mean[i] = X_train[i].mean()
        train_std[i] = X_train[i].std()
    
        X_train = (X_train - train_mean[i])/train_std[i]
        X_test = (X_test - train_mean[i])/train_std[i]
    
    print(X_train)
    
    pred_mean = y_train.mean()
    pred_std = y_train.std()
    
    y_train = (y_train - pred_mean) / pred_std
    y_train = torch.tensor(y_train,dtype=torch.float32)

    
    y_test = (y_test - pred_mean) / pred_std
    y_test = torch.tensor(y_test,dtype=torch.float32)
    return X_train, X_test, y_train, y_test

def denormalize_features(X_train, X_test, y_train, y_test):
    global train_mean, train_std, pred_mean, pred_std
    
    
    for i in range(0, X_train.shape[1]-1):
        X_train = (X_train*train_std[i])+train_mean[i]
        X_test = (X_test*train_std[i])+train_mean[i]
    
    y_train = (y_train * pred_std)+pred_mean
    y_train = torch.tensor(y_train,dtype=torch.float32)

    
    y_test = (y_test*pred_std) + pred_mean
    y_test = torch.tensor(y_test, dtype=torch.float32)
    return X_train, X_test, y_train, y_test

X_train_N,X_test_N,y_train_n,y_test_n = normalize_features(X_train,X_test,y_train,y_test)   #normalizing the dataset

tensor([[ 0.5489, -0.7218, -0.8481,  ..., -0.1573, -0.6644,  1.0465],
        [ 0.7786, -0.7544, -0.8271,  ..., -0.2415, -0.7467,  0.9508],
        [ 0.5872, -0.7036, -0.8481,  ..., -0.2089, -0.7544,  1.0082],
        ...,
        [ 0.7403, -0.7237, -0.8290,  ..., -0.2185, -0.7429,  0.9700],
        [ 0.2810, -0.7314, -0.8328,  ..., -0.1668, -0.7008,  1.2762],
        [ 0.4532, -0.7314, -0.8099,  ..., -0.1707, -0.7371,  0.9317]])


  y_train = torch.tensor(y_train,dtype=torch.float32)
  y_test = torch.tensor(y_test,dtype=torch.float32)


## Training

Initialize the weights and bias of the linear regression

In [5]:
n_features = X_train.shape[1] 
weights = torch.randn((n_features),requires_grad=True) # initialize a random tensor of weights, one weight for each feature
bias = torch.zeros(1, requires_grad=True)

Some functions definitions

In [6]:
from matplotlib import pyplot as plt


def linreg(X, w, b):        #linear regression
    return torch.matmul(X, w) + b

def sgd(params, lr, batch_size):    #we will use stochastic GD
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
            
criterion = nn.MSELoss()    #the loss function will be mean squared error

def data_iter(batch_size, features, labels):    #to divide a single batch in multiple minibatches
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
        
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

def print_result(y_true, y_pred):       #plot the graphs
    plt.clf()
    plt.plot(y_pred, 'ro', label='Predictions', alpha=0.5)
    plt.plot(y_true, 'go', label='True', alpha=0.5)
    plt.legend(loc='best')
    plt.colorbar
    plt.show()
    
def print_correctness(ground_truth,predicted_labels):       #prints the accuracy
    print('Accuracy:\f',metrics.accuracy_score(ground_truth,predicted_labels))
    print('Precision:\f',metrics.precision_score(ground_truth,predicted_labels))
    print('Recall:\f',metrics.recall_score(ground_truth,predicted_labels))

Training loop

In [7]:
# MINI BATCH, calculate for each round the derivative for each minibatch, and then sum up them together
num_iterations = 512
batch_size = 16
lr = 1e-3
print('Training loss:')
for i in range(num_iterations):
    for X, y in data_iter(batch_size, X_train_N, y_train_n):    
        loss = squared_loss(linreg(X,weights,bias),y)    #calculate the prediction, i.e. X(train features) * weights +b; then the loss w.r.t. labels
        loss.sum().backward() #derivate calc
        sgd([weights,bias],lr,batch_size)   #update weights
    with torch.no_grad():
        train_l = squared_loss(linreg(X, weights, bias), y) #loss of the final batch of the round
        print(f'epoch {i + 1}, loss {float(train_l.mean()):f}')
    

Training loss:
epoch 1, loss 1.711766
epoch 2, loss 2.693683
epoch 3, loss 1.239074
epoch 4, loss 1.148880
epoch 5, loss 1.024060
epoch 6, loss 0.798032
epoch 7, loss 0.610080
epoch 8, loss 0.584177
epoch 9, loss 0.491468
epoch 10, loss 0.609354
epoch 11, loss 0.518583
epoch 12, loss 0.430067
epoch 13, loss 0.614614
epoch 14, loss 0.480848
epoch 15, loss 0.715679
epoch 16, loss 0.465754
epoch 17, loss 0.218598
epoch 18, loss 0.585542
epoch 19, loss 0.447429
epoch 20, loss 0.571421
epoch 21, loss 0.249899
epoch 22, loss 0.212254
epoch 23, loss 0.521194
epoch 24, loss 0.475753
epoch 25, loss 0.439228
epoch 26, loss 0.310040
epoch 27, loss 0.307268
epoch 28, loss 0.351382
epoch 29, loss 0.319454
epoch 30, loss 0.473707
epoch 31, loss 0.230299
epoch 32, loss 0.393220
epoch 33, loss 0.460009
epoch 34, loss 0.382549
epoch 35, loss 0.284608
epoch 36, loss 0.581834
epoch 37, loss 0.891545
epoch 38, loss 0.283827
epoch 39, loss 0.498715
epoch 40, loss 0.305557
epoch 41, loss 0.353747
epoch 42, 

## Testing

We need to calculate the loss also over the test set

In [8]:
# test on the test dataset
with torch.no_grad():
    test_l = 0
    for X, y in data_iter(batch_size, X_test_N, y_test_n):
        test_l += squared_loss(linreg(X, weights, bias), y).sum()
    test_l /= len(X_test)
    print(f'loss on the test dataset {float(test_l):f}')

loss on the test dataset 0.370630


## Results

Finally, the obtained predictions are the following

In [9]:
test_predictions_N = []
with torch.no_grad():
    for X in X_test_N:
        test_predictions_N.append(linreg(X,weights,bias))       #we calculate all the predictions on the test set

_,_,test_predictions,y_test = denormalize_features(X_train_N,X_test_N,torch.tensor(test_predictions_N),y_test_n)    #we denormalize the features

test_predictions = test_predictions.round()     #aproximation, since we need int values

with torch.no_grad():
    print_correctness(y_test,test_predictions)  #prints accuracy
    print_result(y_test,test_predictions)

Accuracy: 0.5475


  y_train = torch.tensor(y_train,dtype=torch.float32)
  y_test = torch.tensor(y_test, dtype=torch.float32)


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].