# Autoencoder - Artificial Neural Network

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.pyplot import scatter, annotate, show
import numpy as np

In [3]:
data = pd.read_csv('wine_quality.csv')
x = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

labels = {
    'Medium': 0,
    'Good': 1,
    'Excellent': 2
}
y = np.array([labels[i] for i in y])

print(x.shape)
print(y.shape)

(4898, 11)
(4898,)


In [6]:
#Model
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
import torch

class WineDataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    tx = torch.tensor(self.x, dtype=torch.float32)
    ty = torch.tensor(self.y, dtype=torch.long)
    return tx[idx], ty[idx]

class MyModel(nn.Module):
  def __init__(self, num_hid, hid_sz):
    self.num_hid = num_hid
    self.hid_sz = hid_sz
    self.accuracy_on_val = -1
    super().__init__()
    layers = [nn.Linear(11, hid_sz)] #input layer
    for i in range(num_hid-1):
      layers.append(nn.Sigmoid())
      layers.append(nn.Linear(hid_sz, hid_sz))
    layers.append(nn.Sigmoid())
    layers.append(nn.Linear(hid_sz, 3)) #output layer
    self.layers = nn.ModuleList(layers)

  def forward(self, x):
    for layer in self.layers:
      x = layer(x)
    return x

  def train(self, dataset, b_sz=300, num_epoch=10):
    opt = optim.Adam(self.parameters())
    loss_fn = nn.CrossEntropyLoss()
    loader = DataLoader(dataset, batch_size=b_sz, shuffle=True)

    for epoch in range(num_epoch):
      for i, (bx, by) in enumerate(loader):
        output = self(bx)
        loss = loss_fn(output, by)
        loss.backward()
        if epoch%10==0 and i==0:
          print('Epoch: %d, Loss: %f' % (epoch, loss.item()))

        opt.step()
        opt.zero_grad()

  def evaluate(self, dataset):
    loader = DataLoader(dataset, batch_size=100, shuffle=False)
    correct = 0
    for bx, by in loader:
      output = self(bx)
      pred = torch.argmax(output, dim=1)
      correct += (pred==by).sum().item()
    self.accuracy_on_val = correct/len(dataset)
    return self.accuracy_on_val

tmp = torch.rand(100, 11)
model = MyModel(3, 50)
print(model(tmp).shape)

torch.Size([100, 3])


In [21]:
m = x.shape[0]
m_train = int(m*0.8)
m_val = int(m*0.1)
m_test = m-m_train-m_val

ds_train = WineDataset(x[:m_train], y[:m_train])
ds_val = WineDataset(x[m_train:m_train+m_val], y[m_train:m_train+m_val])
ds_test = WineDataset(x[m_train+m_val:], y[m_train+m_val:])

In [22]:
num_hids = [2, 3, 4]
hid_szs = [25, 50, 100]

models = []
for num_hid in num_hids:
  for hid_sz in hid_szs:
    model = MyModel(num_hid, hid_sz)
    print('\n\nTraining model with num_hid=%d, hid_sz=%d' % (num_hid, hid_sz))
    model.train(ds_train, num_epoch=50)
    models.append(model)

for model in models:
  print('Accuracy on validation set: %f' % model.evaluate(ds_val))



Training model with num_hid=2, hid_sz=25
Epoch: 0, iter: 0, Loss: 1.224852
Epoch: 10, iter: 0, Loss: 1.053966
Epoch: 20, iter: 0, Loss: 1.024002
Epoch: 30, iter: 0, Loss: 0.997552
Epoch: 40, iter: 0, Loss: 0.948823


Training model with num_hid=2, hid_sz=50
Epoch: 0, iter: 0, Loss: 1.107925
Epoch: 10, iter: 0, Loss: 1.060706
Epoch: 20, iter: 0, Loss: 1.012447
Epoch: 30, iter: 0, Loss: 0.968128
Epoch: 40, iter: 0, Loss: 0.942396


Training model with num_hid=2, hid_sz=100
Epoch: 0, iter: 0, Loss: 1.086631
Epoch: 10, iter: 0, Loss: 0.998154
Epoch: 20, iter: 0, Loss: 0.951479
Epoch: 30, iter: 0, Loss: 0.975676
Epoch: 40, iter: 0, Loss: 0.934791


Training model with num_hid=3, hid_sz=25
Epoch: 0, iter: 0, Loss: 1.176106
Epoch: 10, iter: 0, Loss: 1.040063
Epoch: 20, iter: 0, Loss: 1.033023
Epoch: 30, iter: 0, Loss: 1.030554
Epoch: 40, iter: 0, Loss: 0.949053


Training model with num_hid=3, hid_sz=50
Epoch: 0, iter: 0, Loss: 1.100878
Epoch: 10, iter: 0, Loss: 1.017025
Epoch: 20, iter: 0,

In [23]:
for model in models:
  print('Accuracy of model (num_hid=%d, hid_sz=%d): %f' % (model.num_hid, model.hid_sz, model.accuracy_on_val))

Accuracy of model (num_hid=2, hid_sz=25): 0.558282
Accuracy of model (num_hid=2, hid_sz=50): 0.529652
Accuracy of model (num_hid=2, hid_sz=100): 0.484663
Accuracy of model (num_hid=3, hid_sz=25): 0.568507
Accuracy of model (num_hid=3, hid_sz=50): 0.601227
Accuracy of model (num_hid=3, hid_sz=100): 0.390593
Accuracy of model (num_hid=4, hid_sz=25): 0.515337
Accuracy of model (num_hid=4, hid_sz=50): 0.588957
Accuracy of model (num_hid=4, hid_sz=100): 0.617587


In [25]:
best_model = models[0]

for model in models:
  if model.accuracy_on_val > best_model.accuracy_on_val:
    best_model = model

print('Accuracy on test set: %f' % best_model.evaluate(ds_test))
print('Accuracy on training set: %f' % best_model.evaluate(ds_train))
print('Number of hidden layers: %d' % best_model.num_hid)
print('Size of hidden layers: %d' % best_model.hid_sz)

Accuracy on test set: 0.535642
Accuracy on training set: 0.533946
Number of hidden layers: 3
Size of hidden layers: 50


### PCA

In [26]:
pca = PCA(n_components=5)
pca.fit(x)
new_x = pca.transform(x)

print(new_x.shape)

# print(new_x[:5])

(4898, 5)


In [27]:
m = new_x.shape[0]
m_train = int(m*0.8)
m_val = int(m*0.1)
m_test = m-m_train-m_val

ds_train = WineDataset(x[:m_train], y[:m_train])
ds_val = WineDataset(x[m_train:m_train+m_val], y[m_train:m_train+m_val])
ds_test = WineDataset(x[m_train+m_val:], y[m_train+m_val:])

In [28]:
num_hids = [3]
hid_szs = [50]

models = []
for num_hid in num_hids:
  for hid_sz in hid_szs:
    model = MyModel(num_hid, hid_sz)
    print('\n\nTraining model with num_hid=%d, hid_sz=%d' % (num_hid, hid_sz))
    model.train(ds_train, num_epoch=50)
    models.append(model)

for model in models:
  print('Accuracy on validation set: %f' % model.evaluate(ds_val))
  print('Accuracy of model (num_hid=%d, hid_sz=%d): %f' % (model.num_hid, model.hid_sz, model.accuracy_on_val))





Training model with num_hid=3, hid_sz=50
Epoch: 0, iter: 0, Loss: 1.085901
Epoch: 10, iter: 0, Loss: 1.057894
Epoch: 20, iter: 0, Loss: 0.973926
Epoch: 30, iter: 0, Loss: 0.943851
Epoch: 40, iter: 0, Loss: 0.939765
Accuracy on validation set: 0.415133
Accuracy of model (num_hid=3, hid_sz=50): 0.415133


### Autoencoder

In [29]:
# Autoencoder model with 3 hidden layers and 50 hidden units
class Autoencoder(nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder = nn.Sequential(
      nn.Linear(11, 50),
      nn.Sigmoid(),
      nn.Linear(50, 25),
      nn.Sigmoid(),
      nn.Linear(25, 5)
    )
    self.decoder = nn.Sequential(
      nn.Linear(5, 25),
      nn.Sigmoid(),
      nn.Linear(25, 50),
      nn.Sigmoid(),
      nn.Linear(50, 11)
    )

  def forward(self, x):
    x = self.encoder(x)
    x = self.decoder(x)
    return x
  
  def train(self, dataset, b_sz=300, num_epoch=10):
    opt = optim.Adam(self.parameters())
    loss_fn = nn.MSELoss()
    loader = DataLoader(dataset, batch_size=b_sz, shuffle=True)

    for epoch in range(num_epoch):
      for i, (bx, by) in enumerate(loader):
        output = self(bx)
        loss = loss_fn(output, bx)
        loss.backward()
        if epoch%10==0 and i==0:
          print('Epoch: %d, Loss: %f' % (epoch, i, loss.item()))

        opt.step()
        opt.zero_grad()
  
  def evaluate(self, dataset):
    loader = DataLoader(dataset, batch_size=100, shuffle=False)
    loss_fn = nn.MSELoss()
    loss = 0
    for bx, by in loader:
      output = self(bx)
      loss += loss_fn(output, bx).item()
    return loss/len(dataset)
  
  def encode(self, x):
    return self.encoder(x)
  
  def decode(self, x):
    return self.decoder(x)

In [31]:
autoencoder = Autoencoder()
autoencoder.train(ds_train, num_epoch=50)
print('Loss on validation set: %f' % autoencoder.evaluate(ds_val))


TypeError: not all arguments converted during string formatting