<a href="https://colab.research.google.com/github/Mainakdeb/Wine_Quality_Prediction/blob/master/Wine_Connoisseur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import torch.utils.data as data
import torch
from torch import nn, optim
import torch.nn.functional as F
import tqdm
from tqdm import tnrange
import copy

# Constants

In [None]:
# Opt-in switch: set to True to run on the Apple MPS GPU when it is available.
# While it is False the notebook stays on the CPU.
USE_MPS_GPU = False

# Connect to GPU if available

In [None]:
import platform

# Display the platform identification string of the machine running this notebook.
platform.platform()

In [None]:
# Select the compute device.
# The MPS GPU is used only when the user opted in through USE_MPS_GPU *and* the
# backend is actually usable on this machine; otherwise everything runs on the CPU.
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`
# attribute, which only reported that PyTorch was built with MPS support.
if USE_MPS_GPU and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

device

In [None]:
# Seed PyTorch's default generator so weight initialisation and dropout masks
# are reproducible from run to run.
torch.manual_seed(42)

# GPU operations have a separate seed we also want to set
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# The MPS seeding function lives in `torch.mps`; `torch.backends.mps` only
# exposes availability checks and has no `manual_seed*` functions.
if USE_MPS_GPU and torch.backends.mps.is_available():
    torch.mps.manual_seed(42)

# Additionally, some operations on a GPU are implemented stochastically for efficiency.
# We want all operations to be deterministic on GPU (if used) for reproducibility.
# Note the spelling: assigning to a misspelled attribute is silently ignored.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# `torch.backends.mps` has no `deterministic` / `benchmark` switches, so nothing
# is set for MPS here.

### The Data :

In [None]:
# Download the white-wine quality table (a semicolon-separated CSV) and preview
# its first rows.
wine_csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
df = pd.read_csv(wine_csv_url, sep=";")
df.head()

### Convert all values into float:

In [None]:
# Cast every column to 64-bit floats so the whole frame has a single numeric dtype.
df = df.astype("float64")

### Scale all values :

In [None]:
# Min-max scale every column, then put the original (unscaled) "quality" target
# back so the labels keep their raw values.
# NOTE(review): the scaler is fit on the full table before the train/val/test
# split, so held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
quality_backup = df["quality"]
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)
df_scaled["quality"] = quality_backup  # restore quality values

In [None]:
# Show per-column summary statistics of the scaled frame as a sanity check.
df_scaled.describe()

### Balance Data :

In [None]:
# df3['quality'].value_counts().sort_index().plot(kind='bar', sort_columns=False)

In [None]:
# max_size = df_scaled['quality'].value_counts().max()
# lst = [df_scaled]
# for class_index, group in df_scaled.groupby('quality'):
#     lst.append(group.sample(max_size-len(group), replace=True))
# frame_new = pd.concat(lst)
# df_scaled2=frame_new
# df_scaled2["quality"].value_counts()

### Shuffle Data :

In [None]:
# df_scaled3=df_scaled2.sample(frac=1)
# df_scaled3 = df_scaled

### Split into train, test and val set :

In [None]:
# Positional split in file order, with no shuffling: the first 3686 rows train
# the model, the next 200 are for validation, and the remaining rows form the test set.
train_end, val_end = 3686, 3886
train = df_scaled.iloc[:train_end]
val = df_scaled.iloc[train_end:val_end]
test = df_scaled.iloc[val_end:]

In [None]:
# train=df.sample(frac=0.8,random_state=23) #random state is a seed value
# test=df.drop(train.index)

### Split features and labels :

In [None]:
# Separate each split into its feature columns (X_*) and the "quality" target
# (y_*), printing each feature matrix's shape as a quick check.
y_train = train['quality']
X_train = train.drop(columns='quality')
print(X_train.shape)

y_val = val['quality']
X_val = val.drop(columns='quality')
print(X_val.shape)

y_test = test["quality"]
X_test = test.drop(columns="quality")
print(X_test.shape)

### Split into batches :

In [None]:
# train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)

In [None]:
f = 20  # number of batches each split (train / val / test) is divided into


def _frame_to_batches(frame, n_batches, as_column=False):
    """Split a DataFrame or Series into ``n_batches`` float32 tensors.

    The underlying NumPy array is split (rather than the pandas object itself),
    which sidesteps pandas' deprecated ``swapaxes`` path that ``np.array_split``
    can trigger on DataFrames. Chunk sizes follow ``np.array_split``: they differ
    by at most one row when the length is not divisible by ``n_batches``.

    Args:
        frame: pandas DataFrame (features) or Series (labels).
        n_batches: number of chunks to produce.
        as_column: when True, reshape every chunk to ``(rows, 1)`` so label
            tensors line up with the model's single-column output.

    Returns:
        A list of ``n_batches`` float32 tensors.
    """
    chunks = np.array_split(frame.to_numpy(), n_batches)
    tensors = [torch.from_numpy(chunk).float() for chunk in chunks]
    if as_column:
        tensors = [tensor.view(-1, 1) for tensor in tensors]
    return tensors


train_batch = _frame_to_batches(X_train, f)
label_batch = _frame_to_batches(y_train, f, as_column=True)

val_batch = _frame_to_batches(X_val, f)
val_label_batch = _frame_to_batches(y_val, f, as_column=True)

test_batch = _frame_to_batches(X_test, f)
test_label_batch = _frame_to_batches(y_test, f, as_column=True)

print("Batch size:", len(train_batch[0]))


### The Model :

In [None]:
class Regressor(nn.Module):
    """Fully connected regressor mapping 11 input features to one scalar.

    The hidden widths expand 11 -> 22 -> 44 -> 88 -> 176 and contract
    176 -> 88 -> 22 before the single-unit output layer. Dropout (p=0.20) is
    applied to the input of every linear layer, including the raw features.

    The output layer is linear. A ReLU on a regression output clamps
    predictions at zero and passes no gradient whenever the pre-activation is
    negative, which can stall training from the very first step.
    """

    def __init__(self):
        super().__init__()

        # Layer names are kept as fc1..fc7 so saved state dicts stay compatible.
        self.fc1 = nn.Linear(11, 22)
        self.fc2 = nn.Linear(22, 44)
        self.fc3 = nn.Linear(44, 88)
        self.fc4 = nn.Linear(88, 176)
        self.fc5 = nn.Linear(176, 88)
        self.fc6 = nn.Linear(88, 22)
        self.fc7 = nn.Linear(22, 1)

        self.dropout = nn.Dropout(0.20)

    def forward(self, x):
        """Return the prediction for ``x``.

        Args:
            x: float tensor whose last dimension has 11 features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        hidden_layers = (
            self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6,
        )
        for layer in hidden_layers:
            x = F.relu(layer(self.dropout(x)))

        # Linear output: no activation on the regression head.
        return self.fc7(self.dropout(x))

# Build the network together with the objects the training cell relies on:
# an MSE objective, an Adam optimizer, and a plateau-based learning-rate scheduler.
model = Regressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    'min',
    factor=0.1,
    patience=15,
)

# Per-epoch loss histories and a running count of epochs trained so far.
train_losses = []
val_losses = []
total_epochs = 0

### The training loop :

In [None]:
# No validation loss has been observed yet; start the "best" checkpoint as an
# independent copy of the untrained model.
best_val_loss = None
best_model = copy.deepcopy(model)

In [None]:
# Train for `epochs` epochs, recording the mean per-batch training and validation
# loss of every epoch and keeping a copy of the model with the lowest validation loss.
epochs = 1000
total_epochs += epochs

for e in tnrange(epochs):

    # ---- training pass: one optimizer step per training batch ----
    model.train()  # dropout active
    train_loss = 0.0
    for features, targets in zip(train_batch, label_batch):
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    epoch_train_loss = train_loss / len(train_batch)

    # ---- validation pass: once per epoch, after all training batches ----
    # eval() disables dropout so the validation loss (and therefore the
    # best-model selection below) is not perturbed by random masks.
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for features, targets in zip(val_batch, val_label_batch):
            val_loss_sum += criterion(model(features), targets).item()
    epoch_val_loss = val_loss_sum / len(val_batch)

    # ReduceLROnPlateau only lowers the learning rate when it is stepped with
    # the monitored metric.
    scheduler.step(epoch_val_loss)

    # saving best model
    if best_val_loss is None or epoch_val_loss < best_val_loss:
        print('Model replaced')
        best_val_loss = epoch_val_loss
        best_model = copy.deepcopy(model)
    print("Epoch :", e, "train_loss :", epoch_train_loss, "Val loss: ", epoch_val_loss)
    val_losses.append(epoch_val_loss)
    train_losses.append(epoch_train_loss)

### Training Metrics :

In [None]:
# Plot the per-epoch training and validation loss curves on one set of axes.
frm = 10  # index of the first epoch drawn; earlier epochs are left off the plot
curves = (
    (train_losses, 'Training loss'),
    (val_losses, 'Validation loss'),
)
for history, legend_name in curves:
    plt.plot(history[frm:], label=legend_name)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.grid()
plt.legend()

### How does the model perform on the test-set?

#### best model

In [None]:
# Score the best-validation checkpoint on the test set.
# Each test sample is run through the model individually, and the prediction is
# rounded to the nearest integer. A sample counts as correct when that integer
# equals the true quality score.
best_model.eval()
res = []
with torch.no_grad():
    for batch in test_batch:
        res.extend(round(best_model(sample).item()) for sample in batch)

true_labels = test["quality"].tolist()
correct = sum(
    1 for predicted, actual in zip(res, true_labels) if predicted == int(actual)
)

print("Accuracy:", 100 * (correct/len(res)), "%")

#### model from last iteration

In [None]:
# Score the model as it stands after the final training epoch on the test set.
# Predictions are made one sample at a time and rounded to the nearest integer
# before being compared with the true quality score.
model.eval()
res = []
with torch.no_grad():
    for batch in test_batch:
        res.extend(round(model(sample).item()) for sample in batch)

true_labels = test["quality"].tolist()
correct = sum(
    1 for predicted, actual in zip(res, true_labels) if predicted == int(actual)
)

print("Accuracy:", 100*(correct/len(res)), "%")