In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler  # for normalization
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.svm import SVC
import torch

In [2]:
df = pd.read_csv('WhiteWineQuality.csv')
df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 1 columns):
 #   Column                                                                                                                                                                   Non-Null Count  Dtype 
---  ------                                                                                                                                                                   --------------  ----- 
 0   fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"  4898 non-null   object
dtypes: object(1)
memory usage: 38.4+ KB


In [4]:
cols = list(df.columns)[0].split(';')
cols = [item.replace('"', '') for item in cols]
cols

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [5]:
vals = []
for i in range(len(df)):
    vals.append(list(df.values[i])[0].split(';'))
vals[0]

['7',
 '0.27',
 '0.36',
 '20.7',
 '0.045',
 '45',
 '170',
 '1.001',
 '3',
 '0.45',
 '8.8',
 '6']

In [6]:
data = pd.DataFrame(vals, columns = cols)
data = data.astype(float)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   float64
dtypes: float64(12)
memory usage: 459.3 KB


In [26]:
# initially implementing binary classification
data = data[(data['quality'] == 6) | (data['quality'] == 5)]
x = data.drop('quality', axis = 1)
y = pd.to_numeric(data['quality'])
xnorm = MinMaxScaler().fit_transform(x)
xnorm[:5]

array([[0.30769231, 0.21468927, 0.21686747, 0.30828221, 0.10682493,
        0.33333333, 0.48059701, 0.26622875, 0.25688073, 0.26506024,
        0.13333333],
       [0.24038462, 0.24858757, 0.20481928, 0.01533742, 0.11869436,
        0.09302326, 0.36716418, 0.13098918, 0.53211009, 0.31325301,
        0.25      ],
       [0.41346154, 0.2259887 , 0.24096386, 0.09662577, 0.12166172,
        0.21705426, 0.26268657, 0.15224111, 0.49541284, 0.25301205,
        0.35      ],
       [0.32692308, 0.16949153, 0.19277108, 0.12116564, 0.14540059,
        0.34883721, 0.52835821, 0.16190108, 0.43119266, 0.20481928,
        0.31666667],
       [0.32692308, 0.16949153, 0.19277108, 0.12116564, 0.14540059,
        0.34883721, 0.52835821, 0.16190108, 0.43119266, 0.20481928,
        0.31666667]])

In [27]:
y = np.where(y == 5, -1, 1)
xtr, xtest, ytr, ytest = train_test_split(xnorm, y, test_size = 0.15, random_state = 42)
xtr.shape, ytr.shape, xtest.shape, ytest.shape

((3106, 11), (3106,), (549, 11), (549,))

In [29]:
xtr = torch.tensor(xtr, dtype = torch.float32, requires_grad = False)
ytr = torch.tensor(ytr, dtype = torch.float32, requires_grad = False)
xtest = torch.tensor(xtest, dtype = torch.float32, requires_grad = False)
xtr.shape, ytr.shape

(torch.Size([3106, 11]), torch.Size([3106]))

In [40]:
# hinge loss function - L(y,f(x))=max(0,1−y⋅f(x))
# the J(w,b) = 1/2 * det(w) **2 + c * sum(max(0, 1 - yi * (wT @ xi + b)))
# here the output of yi * (wT @ xi + b) is an [1] vector, moreover the shapes of yi is [1], wT is [1, m] and xi is [m, 1] where m is the no of featueres
# and shape of b is [1]. For easier implementation xi @ w, which will give the exact same result because shape of xi in each iteration will be [1, m] and
# shape of w is [m, 1].
# dJ/dw = w if max(0, 1 - yi * (wT @ xi + b)) == 0
#         w - c * sum(yi * xi) otherwise
# dJ/db = -c * sum(yi)

In [None]:
# implementing using stochastic gradient descent - where the cost is computed separately for each variable
# hyper parameters
c = 1.0
learning_rate = 0.01
num_epochs = 25
w = torch.randn(xtr.shape[1], requires_grad = True)
b = torch.ones(1, requires_grad = True)
print(w.shape, b.shape)

for i in range (num_epochs):
    total_loss = 0.0
    for j in range(len(xtr)):
        margin = ytr[j] * (torch.dot(xtr[j], w) + b)
        loss = torch.max(torch.tensor(0.0), 1 - margin)
        total_loss += loss.item()
        optimizer.zero_grad()
        if margin < 1:
            w = w - learning_rate * (w - c * ytr[j] * xtr[j])
            b = b - learning_rate * (-c * ytr[j])
            optimizer.step()
        else:
            w = w - learning_rate * w
            optimizer.step()
    print(f'loss for iteration {i} is {total_loss}')
w = w.numpy()
b = b.item()

In [64]:
def predict(x, w, b):
    decision_values = (x @ w) + b
    return np.where(decision_values >= 0, 1, -1)

pred = predict(xtest, w, b)
print(f'recall score {recall_score(ytest, pred)}')
print(f'precision score {precision_score(ytest, pred)}')

recall score 0.7635782747603834
precision score 0.7308868501529052


  decision_values = (x @ w) + b


In [67]:
import torch

# Hyperparameters
c = 1.0
learning_rate = 0.01
num_epochs = 25

# Initialize parameters with requires_grad=True
w = torch.randn(xtr.shape[1], requires_grad=True)
b = torch.ones(1, requires_grad=True)

optimizer = torch.optim.SGD([w, b], lr=learning_rate)

for epoch in range(num_epochs):
    total_loss = 0.0
    for j in range(len(xtr)):
        margin = ytr[j] * (torch.dot(xtr[j], w) + b)
        loss = torch.max(torch.tensor(0.0), 1 - margin)
        total_loss += loss.item()
        
        optimizer.zero_grad()
        if margin < 1:
            loss.backward()
            optimizer.step()

    print(f'Loss for epoch {epoch} is {total_loss}')

w = w.detach().numpy()
b = b.item()


Loss for epoch 0 is 2855.444017112284
Loss for epoch 1 is 2462.565380201442
Loss for epoch 2 is 2462.565380201442
Loss for epoch 3 is 2462.565380201442
Loss for epoch 4 is 2462.565380201442
Loss for epoch 5 is 2462.565380201442
Loss for epoch 6 is 2462.565380201442
Loss for epoch 7 is 2462.565380201442
Loss for epoch 8 is 2462.565380201442
Loss for epoch 9 is 2462.565380201442
Loss for epoch 10 is 2462.565380201442
Loss for epoch 11 is 2462.565380201442
Loss for epoch 12 is 2462.565380201442
Loss for epoch 13 is 2462.565380201442


KeyboardInterrupt: 