### Pytorch
Prepare data

In [155]:
import pandas as pd

df = pd.read_csv('data/cleaned_data.csv', delimiter = ';')
# Shuffle the DataFrame
df = df.sample(frac = 1, random_state=27)
df.head(10)

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
3687,4188,5.3,0.33,0.3,1.2,0.048,25.0,119.0,0.99045,3.32,0.62,11.3,6,0
1070,1224,7.2,0.23,0.39,2.3,0.033,29.0,102.0,0.9908,3.26,0.54,12.3,7,0
4101,4636,6.5,0.51,0.25,1.7,0.048,39.0,177.0,0.99212,3.28,0.57,10.566667,5,0
2289,2636,7.2,0.24,0.27,11.4,0.034,40.0,174.0,0.99773,3.2,0.44,9.0,5,0
3756,4263,7.0,0.55,0.05,8.0,0.036,19.0,164.0,0.99269,3.26,0.46,12.2,6,0
3505,3985,6.8,0.11,0.27,8.6,0.044,45.0,104.0,0.99454,3.2,0.37,9.9,6,0
1,2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
4480,610,8.8,0.24,0.54,2.5,0.083,25.0,57.0,0.9983,3.39,0.54,9.2,5,1
3366,3827,5.1,0.23,0.18,1.0,0.053,13.0,99.0,0.98956,3.22,0.39,11.5,5,0
4055,4585,5.0,0.33,0.23,11.8,0.03,23.0,158.0,0.99322,3.41,0.64,11.8,6,0


In [156]:
# aj taketo riesenie existuje pip install sklearn
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

train_size = int(len(df) * 0.64)
test_size = int(len(df) * 0.8)

# training set - 64% from all data
train_df = df[:train_size]

# validation set - 16% from all data
val_df = df[train_size:test_size]

# testing set - 20% from all data
test_df = df[test_size:]

print('df -', df.shape[0])
print('---------------')
print('train_df -', train_df.shape[0])
print('val_df -', val_df.shape[0])
print('test_df -', test_df.shape[0])

df - 4907
---------------
train_df - 3140
val_df - 785
test_df - 982


In [157]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### Neural network architecture

- Number of inputs for first layer is same as the number of attributes (columns) in our dataset
- Number of outputs in last layer needs to be equal to the number of values that network is expected to predict. In our case we are trying to predict wine quality (range <3,9>)
- The number of neurons (inputs and outputs) per layer in hidden layers can vary - we need to try different values to find better network performace. A good rule of thumb is to start with a smaller number of neurons and gradually increase the number of neurons until you find the optimal number that gives you the best performance.
- We choose **ReLU as actiovation function**. In general, it works well
- MSELoss (Mean squared error) is commonly use when you want to penalize larger errors more than smaller ones (common choise for regression problems)
- We choose **SGD optimizer** because we have small dataset and problem with small complexity 

In [158]:
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        x = torch.tensor(self.df.iloc[idx].values.astype('float32'))
        y = torch.tensor(self.df.iloc[idx]['quality'].astype('float32'))
        y_onehot = torch.zeros(7)
        y_onehot[int(y) - 3] = 1.0
        return x, y_onehot

# Define your neural network architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(14, 8)
        self.fc2 = nn.Linear(8, 4)
        self.fc3 = nn.Linear(4, 7)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

net = Net()
net.train()

# Define your loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

batch_size = 64
train_dataset = MyDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MyDataset(val_df)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(10):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader, 0):
        optimizer.zero_grad()
        outputs = net(inputs)
        # Labels shape and output shape must be same
        # print(outputs.shape, labels.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 0:
            print(f'Training - epoch: {epoch + 1} loss: {round(running_loss / 100, 4)}')
            running_loss = 0.0

# Validation
net.eval()
with torch.no_grad():
    val_loss = 0.0
    for inputs, labels in val_dataloader:
        outputs = net(inputs)
        val_loss += criterion(outputs, labels).item()
    print(f'Validation - epoch: {epoch + 1}, loss: {round(val_loss / len(val_dataloader), 4)}')
print('Finished Training')

Training - epoch: 1 loss: 0.0029
Training - epoch: 2 loss: 0.0026
Training - epoch: 3 loss: 0.0022
Training - epoch: 4 loss: 0.0018
Training - epoch: 5 loss: 0.0016
Training - epoch: 6 loss: 0.0014
Training - epoch: 7 loss: 0.0013
Training - epoch: 8 loss: 0.0012
Training - epoch: 9 loss: 0.0012
Training - epoch: 10 loss: 0.0011
Validation - epoch: 10, loss: 0.1071
Finished Training


### What to do in the next week
- Try 2 hidden layers
- Try different number of neurons in hidden layers
- Try different activation functions
- Use validation data for tunning pytorch NN
- TensorFlow implementation


configs, wandb, try other orchitecture (vacsi mode, sirsi model, inu aktivacnu funkciu)
regresiu, asi neriesiť outliers, accuracy riesit tak ze predikujeme napr 4.6, tak to zaokruhlime na 5 a pozrieme ci sme sa trafili
povedal, ze nemusime mat validacny set
- validaciu musime volat v cykle po kazdej epoche
- len testovaciu preprocesingovat
- learning rate treba dynamicky menit a dobre je to nastaviť na mensi
- spravit dobry shuffel