# Learnig the representations

In [None]:
# Dependecies
import importlib
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# device agnostic code
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## pasted models
here for convenience

In [62]:
'''
    Representation learning:
    Goal: use unsupervised learning techniques to learn a representation of given data.
    The hope is that this representation will be useful to reduce the amount of data that is needed for training the supervised model for solving the actual task.
    To verify how good the learned representation is, train a supervised model using these representations that predicts the available pretrain labels.

    Methology:
    1. Create a several neural networks that learn to encode the data into a representation.
    2. Train a supervised model on each of the learned representations. The superverised model trained on the different representations should be very shallow (1 or two fully connected layers) and should be trained for a very short time. The goal is to make the performance of the encoders comparable.
'''

# Dependencies
import torch
import torch.nn as nn

'''
    Autoencoder for dimensionality reduction:
    Both encoder and decoder using three linear layers
'''

# for this to make sense the encoding dimension should be significantly smaller than the input dimension
# specifically, the encoding_dim*3 shold be smaller than the input_size
class LinearAutoencoder(nn.Module):
    def __init__(self, input_size, encoding_dim):
        super(LinearAutoencoder, self).__init__()
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, encoding_dim*3),
            nn.BatchNorm1d(encoding_dim*3),
            nn.ReLU(),
            nn.Linear(int(encoding_dim*3), int(encoding_dim*2)),
            nn.BatchNorm1d(int(encoding_dim*2)),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
        )
        # decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim*2),
            nn.BatchNorm1d(encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim*3),
            nn.BatchNorm1d(encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, input_size),
            nn.Sigmoid() # the feature values are between 0 and 1
        )

        # fully connected layer for pretrain task
        self.fc = nn.Sequential(
            nn.Linear(encoding_dim, 1)
        )

    def forward(self, x, pretrain = False):
        if pretrain:
            x = self.encoder(x)
            x = self.fc(x)
            x = x.squeeze(1)
        else:
            x = self.encoder(x)
            x = self.decoder(x)
        return x



In [69]:
class ConvAutoencoder(nn.Module):
    def __init__(self, number_filters):
        super(ConvAutoencoder, self).__init__()
        self.number_filters = number_filters
        # encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(1, number_filters*3, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*3),
            nn.ReLU(),
            nn.Conv1d(number_filters*3, number_filters*2, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*2),
            nn.ReLU(),
            nn.Conv1d(number_filters*2, number_filters, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters)
        )
        # decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(number_filters, number_filters*2, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*2),
            nn.ReLU(),
            nn.ConvTranspose1d(number_filters*2, number_filters*3, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*3),
            nn.ReLU(),
            nn.ConvTranspose1d(number_filters*3, 1, kernel_size=3, stride=2, padding=0, output_padding=1), 
            nn.Sigmoid() # the feature values are between 0 and 1
        )
        # fully connected layer for pretrain task
        self.fc = nn.Sequential(
            nn.Linear(124*number_filters, 1)
        )
        
    def forward(self, x, pretrain=False):
        if pretrain:
          x = x.unsqueeze(1)
          x = self.encoder(x)
          x = x.view(-1, 124*self.number_filters)     
          x = self.fc(x)
          x = x.squeeze(1)
        else:
          x = x.unsqueeze(1)
          x = self.encoder(x)
          x = self.decoder(x)
          x = x.squeeze(1)
        return x
       

In [64]:
'''
    Autocoder for dimensionality reduction:
    Using two convolutional/deconvolutional layers and one fully connected layer for both encoder and decoder
'''
class ConvLinearAutoencoder(nn.Module):
    def __init__(self, number_filters, encoding_dim):
        super(ConvLinearAutoencoder, self).__init__()
        self.number_filters = number_filters
        # encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(1, number_filters*2, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*2),
            nn.ReLU(),
            nn.Conv1d(number_filters*2, number_filters, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters),
            nn.ReLU(),
        ) 
        # bottleneck layer
        self.fcencoder = nn.Sequential(
            nn.Linear(249*number_filters, encoding_dim),
            nn.BatchNorm1d(encoding_dim),
        )
        self.fcdecoder = nn.Sequential(
            nn.Linear(encoding_dim, 249*number_filters),
            nn.BatchNorm1d(249*number_filters),
        )

        # decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(number_filters, number_filters*2, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm1d(number_filters*2),
            nn.ReLU(),
            nn.ConvTranspose1d(number_filters*2, 1, kernel_size=3, stride=2, padding=0, output_padding=1),
            nn.Sigmoid() # the feature values are between 0 and 1
        )

        # fully connected layer for pretrain task
        self.fc = nn.Sequential(
            nn.Linear(encoding_dim, 1),
            nn.BatchNorm1d(1),
        )

    def forward(self, x, pretrain = False):
        if pretrain:
            x = x.unsqueeze(1)
            x = self.encoder(x)
            x = x.view(-1, 249*self.number_filters)
            x = self.fcencoder(x)
            x = self.fc(x)
            x = x.squeeze(1)
        else:
            x = x.unsqueeze(1)
            x = self.encoder(x)
            x = x.view(-1, 249*self.number_filters)
            x = self.fcencoder(x)
            x = self.fcdecoder(x)
            x = x.view(-1,self.number_filters, 249)
            x = self.decoder(x)
            x = x.squeeze(1)
        return x


In [None]:
# contractive loss function
def contractive_loss(W, x, recons_x, h, lam=1e-4):
    mse_loss = nn.MSELoss()(recons_x, x)
    
    dh = h * (1 - h) # Derivative of sigmoid
    w_sum = torch.sum(Variable(W)**2, dim=1)
    w_sum = w_sum.unsqueeze(1) # Shape to 2D tensor
    contractive_loss = torch.sum(torch.mm(dh**2, w_sum), 0)
    return mse_loss + contractive_loss.mul_(lam)

## load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
shutil.unpack_archive("drive/MyDrive/SolarEnergyMaterials/task4.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_features.csv.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_labels.csv.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/train_features.csv.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/train_labels.csv.zip", "/content/data")

In [None]:
def load_pretrain_data(batch_size = 64):
    batch_size = 64

    random.seed(17)
    test_ind = set()

    pre_train_size = 50000

    while len(test_ind) < 10000: 
        test_ind.add(random.randint(0, pre_train_size-1))

    features =[]
    labels = []

    with open("data/pretrain_features.csv", 'r') as f:
        for row in f:
            features.append(row)

    with open("data/pretrain_labels.csv", 'r') as f:
        for row in f:
            labels.append(row)

    # remove header
    features = features[1:]
    labels = labels[1:]

    # first try to note use representation of the molecules, only the extracted features
    features = [list(map(float,row.split(',')[2:])) for row in features]
    labels = [float(row.split(',')[1]) for row in labels]

    train_features = []
    train_labels = []
    test_features = []
    test_labels = []


    for i in range(len(features)):
        if i in test_ind:
            test_features.append(features[i])
            test_labels.append(labels[i])
        else:
            train_features.append(features[i])
            train_labels.append(labels[i])

    # does not seem to make sense to normalize the data since it is very sparse
    # normalize train_features
    # train_features = (train_features - np.mean(train_features, axis=0)) / (np.std(train_features, axis=0)+EPSILON)

    # normalize test_features
    # test_features = (test_features - np.mean(test_features, axis=0)) / (np.std(test_features, axis=0)+EPSILON)

    # convert into tensor dataset
    train_features = torch.tensor(train_features, dtype=torch.float)
    train_labels = torch.tensor(train_labels, dtype=torch.float)
    test_features = torch.tensor(test_features, dtype=torch.float)
    test_labels = torch.tensor(test_labels, dtype=torch.float)

    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    test_dataset = torch.utils.data.TensorDataset(test_features, test_labels) 
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [None]:
train_loader, test_loader = load_pretrain_data(batch_size = 64)

## Train/Test loop

In [34]:
# train loop:
def train_encoder(model, dataloader, epochs, pretrain=False):
    if pretrain:
      freeze_weights(model.encoder)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        epoch_loss=0
        for batch, (X, y) in enumerate(dataloader):
          if pretrain:
            optimizer.zero_grad()
            y_pred = model(X, pretrain=True)
            loss = loss_fn(y, y_pred)
            epoch_loss+=loss.item()
            loss.backward()  
            optimizer.step()
          else:
            optimizer.zero_grad()
            X_pred = model(X)
            loss = loss_fn(X_pred, X)
            epoch_loss+=loss.item()
            loss.backward()  
            optimizer.step()

        print('average loss per batch in epoch [{}/{}], Loss: {:.6f}'.format(epoch+1, epochs, epoch_loss/len(dataloader)))

In [57]:
# test loop
def test_model(model, data_loader):
  loss_fn = nn.MSELoss() 
  model.to(device)
  Y = torch.tensor([]).to(device)
  Y_pred = torch.tensor([]).to(device)
  with torch.no_grad():
    for batch, (X,y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X, pretrain=True)
      Y = torch.cat((Y, y))
      Y_pred = torch.cat((Y_pred, y_pred))
    loss = torch.sqrt(loss_fn(y_pred, y))
    print(f"average batch loss: {loss.item()}")

In [None]:
def freeze_weights(model):
    for param in model.parameters():
        param.requires_grad = False


## Linear Autoencoder:

In [63]:
LinearEncoder = LinearAutoencoder(1000, 128)
train_encoder(LinearEncoder, train_loader, 10)

average loss per batch in epoch [1/10], Loss: 0.017154
average loss per batch in epoch [2/10], Loss: 0.007236
average loss per batch in epoch [3/10], Loss: 0.005278
average loss per batch in epoch [4/10], Loss: 0.004157
average loss per batch in epoch [5/10], Loss: 0.003436
average loss per batch in epoch [6/10], Loss: 0.002930
average loss per batch in epoch [7/10], Loss: 0.002569
average loss per batch in epoch [8/10], Loss: 0.002290
average loss per batch in epoch [9/10], Loss: 0.002094
average loss per batch in epoch [10/10], Loss: 0.001923


In [66]:
torch.save(LinearEncoder.state_dict(), 'drive/MyDrive/SolarEnergyMaterials/PretrainedModels/LinearEncoder.pth')

## Convolutional Autoencoder:

In [72]:
ConvEncoder = ConvAutoencoder(4)
test = next(iter(train_loader))
train_encoder(ConvEncoder, train_loader, 10)

average loss per batch in epoch [1/10], Loss: 0.029119
average loss per batch in epoch [2/10], Loss: 0.002585
average loss per batch in epoch [3/10], Loss: 0.001260
average loss per batch in epoch [4/10], Loss: 0.000651
average loss per batch in epoch [5/10], Loss: 0.000424
average loss per batch in epoch [6/10], Loss: 0.000330
average loss per batch in epoch [7/10], Loss: 0.000280
average loss per batch in epoch [8/10], Loss: 0.000253
average loss per batch in epoch [9/10], Loss: 0.000229
average loss per batch in epoch [10/10], Loss: 0.000211


In [None]:
torch.save(ConvEncoder.state_dict(), 'drive/MyDrive/SolarEnergyMaterials/PretrainedModels/ConvEncoder.pth')

## Convolutional Autoencoder with Linear Layer:

In [67]:
ConvLinearEncoder = ConvLinearAutoencoder(6, 90) # almost no compression if product close to 1000
train_encoder(ConvLinearEncoder, train_loader, 10)

average loss per batch in epoch [1/10], Loss: 0.088443
average loss per batch in epoch [2/10], Loss: 0.009094
average loss per batch in epoch [3/10], Loss: 0.004613
average loss per batch in epoch [4/10], Loss: 0.003140
average loss per batch in epoch [5/10], Loss: 0.002420
average loss per batch in epoch [6/10], Loss: 0.001989
average loss per batch in epoch [7/10], Loss: 0.001686
average loss per batch in epoch [8/10], Loss: 0.001491
average loss per batch in epoch [9/10], Loss: 0.001327
average loss per batch in epoch [10/10], Loss: 0.001202


In [None]:
torch.save(ConvEncoder.state_dict(), 'drive/MyDrive/SolarEnergyMaterials/PretrainedModels/ConvEncoder.pth')

## Evaluating the representations
With the pretrain data

### Linear Model


In [65]:
train_encoder(LinearEncoder, train_loader, 10, pretrain=True)
test_model(LinearEncoder, train_loader)
print('---')
test_model(LinearEncoder, test_loader)

average loss per batch in epoch [1/10], Loss: 0.224725
average loss per batch in epoch [2/10], Loss: 0.077986
average loss per batch in epoch [3/10], Loss: 0.066437
average loss per batch in epoch [4/10], Loss: 0.059136
average loss per batch in epoch [5/10], Loss: 0.053170
average loss per batch in epoch [6/10], Loss: 0.050927
average loss per batch in epoch [7/10], Loss: 0.047129
average loss per batch in epoch [8/10], Loss: 0.045912
average loss per batch in epoch [9/10], Loss: 0.044380
average loss per batch in epoch [10/10], Loss: 0.043440
average batch loss: 0.19463780522346497
---
average batch loss: 0.20404937863349915


### Conv Model


In [74]:
train_encoder(ConvEncoder, train_loader, 10, pretrain=True)
test_model(ConvEncoder, train_loader)
print('---')
test_model(ConvEncoder, test_loader)

average loss per batch in epoch [1/10], Loss: 0.201428
average loss per batch in epoch [2/10], Loss: 0.046438
average loss per batch in epoch [3/10], Loss: 0.037808
average loss per batch in epoch [4/10], Loss: 0.034626
average loss per batch in epoch [5/10], Loss: 0.032844
average loss per batch in epoch [6/10], Loss: 0.031865
average loss per batch in epoch [7/10], Loss: 0.031174
average loss per batch in epoch [8/10], Loss: 0.030758
average loss per batch in epoch [9/10], Loss: 0.030439
average loss per batch in epoch [10/10], Loss: 0.030784
average batch loss: 0.1744474172592163
---
average batch loss: 0.16500823199748993


### Conv Linear Model

In [73]:
train_encoder(ConvLinearEncoder, train_loader, 10, pretrain=True)
test_model(ConvLinearEncoder, train_loader)
print('---')
test_model(ConvLinearEncoder, test_loader)

average loss per batch in epoch [1/10], Loss: 9.198197
average loss per batch in epoch [2/10], Loss: 5.928121
average loss per batch in epoch [3/10], Loss: 3.672421
average loss per batch in epoch [4/10], Loss: 2.110269
average loss per batch in epoch [5/10], Loss: 1.092218
average loss per batch in epoch [6/10], Loss: 0.490598
average loss per batch in epoch [7/10], Loss: 0.185035
average loss per batch in epoch [8/10], Loss: 0.061175
average loss per batch in epoch [9/10], Loss: 0.025112
average loss per batch in epoch [10/10], Loss: 0.018650
average batch loss: 0.1384148895740509
---
average batch loss: 0.10748357325792313
