# Learning the representations

In [76]:
# Dependencies
import importlib
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import RepresentationModels as RM
# Reload through the `RM` alias: the module is imported as `RM` above, so the
# bare name `RepresentationModels` is not bound in a fresh kernel.
importlib.reload(RM)

<module 'RepresentationModels' from '/home/paul/Documents/3AI/SolarEnergyMaterials/RepresentationModels.py'>

## pasted models
here for convenience

In [None]:
'''
    Representation learning:
    Goal: use unsupervised learning techniques to learn a representation of given data.
    The hope is that this representation will be useful to reduce the amount of data that is needed for training the supervised model for solving the actual task.
    To verify how good the learned representation is, train a supervised model using these representations that predicts the available pretrain labels.

    Methodology:
    1. Create several neural networks that learn to encode the data into a representation.
    2. Train a supervised model on each of the learned representations. The supervised models trained on the different representations should be very shallow (one or two fully connected layers) and should be trained for a very short time. The goal is to make the performance of the encoders comparable.
'''

# Dependencies
import torch
import torch.nn as nn

'''
    Autoencoder for dimensionality reduction:
    Both encoder and decoder using three linear layers
'''

# for this to make sense the encoding dimension should be significantly smaller than the input dimension
# specifically, encoding_dim*3 should be smaller than input_size
class LinearAutoencoder(nn.Module):
    """Fully connected autoencoder with a three-layer encoder and a mirrored decoder.

    Layer widths run ``input_size -> 3*encoding_dim -> 2*encoding_dim ->
    encoding_dim`` in the encoder and the reverse in the decoder, with ReLU
    between the linear layers. The decoder ends in a sigmoid because the
    feature values are between 0 and 1.

    Args:
        input_size: number of features per sample.
        encoding_dim: size of the learned representation.
    """

    def __init__(self, input_size, encoding_dim):
        super().__init__()
        wide = encoding_dim * 3
        mid = encoding_dim * 2

        # Compress: input -> wide -> mid -> code.
        self.encoder = nn.Sequential(
            nn.Linear(input_size, wide),
            nn.ReLU(),
            nn.Linear(wide, mid),
            nn.ReLU(),
            nn.Linear(mid, encoding_dim),
        )

        # Expand: code -> mid -> wide -> input, squashed into (0, 1).
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, mid),
            nn.ReLU(),
            nn.Linear(mid, wide),
            nn.ReLU(),
            nn.Linear(wide, input_size),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))


'''
    Autoencoder for dimensionality reduction:
    Using three convolutional/deconvolutional layers for encoder/decoder
'''
class ConvAutoencoder(nn.Module):
    """1-D convolutional autoencoder: three strided convs, three transposed convs.

    Channel counts run ``1 -> 3f -> 2f -> f`` in the encoder and back in the
    decoder, where ``f`` is ``number_filters``. Every layer uses kernel size 3,
    stride 2 and no padding. The final transposed convolution carries
    ``output_padding=1`` so that the reconstruction has the right size, and a
    sigmoid because the feature values are between 0 and 1.

    Args:
        number_filters: channel count ``f`` of the encoded representation.
    """

    def __init__(self, number_filters):
        super().__init__()
        narrow = number_filters
        medium = number_filters * 2
        wide = number_filters * 3
        # Shared geometry of every (transposed) convolution in the network.
        geometry = dict(kernel_size=3, stride=2, padding=0)

        self.encoder = nn.Sequential(
            nn.Conv1d(1, wide, **geometry),
            nn.ReLU(),
            nn.Conv1d(wide, medium, **geometry),
            nn.ReLU(),
            nn.Conv1d(medium, narrow, **geometry),
        )

        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(narrow, medium, **geometry),
            nn.ReLU(),
            nn.ConvTranspose1d(medium, wide, **geometry),
            nn.ReLU(),
            # output_padding is needed to get the right output size
            nn.ConvTranspose1d(wide, 1, output_padding=1, **geometry),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, length); the channel axis is added and removed here."""
        with_channel = x.unsqueeze(1)
        reconstructed = self.decoder(self.encoder(with_channel))
        return reconstructed.squeeze(1)
    
'''
    Autoencoder for dimensionality reduction:
    Using two convolutional/deconvolutional layers and one fully connected layer for both encoder and decoder
'''
class ConvLinearAutoencoder(nn.Module):
    """Convolutional autoencoder with a fully connected bottleneck.

    The encoder is two stride-2, kernel-3, unpadded 1-D convolutions
    (``1 -> 2f -> f`` channels, ``f = number_filters``) followed by a linear
    layer down to ``encoding_dim``. The decoder mirrors it: a linear layer
    back up, then two transposed convolutions and a sigmoid (the feature
    values are between 0 and 1).

    Args:
        number_filters: channel count ``f`` after the second convolution.
        encoding_dim: size of the bottleneck representation.
        input_size: number of features per sample. The default of 1000 gives
            the flattened convolution length of 249 that was previously
            hard-coded.

    Raises:
        ValueError: if ``input_size`` is too short to survive both convolutions.
    """

    def __init__(self, number_filters, encoding_dim, input_size=1000):
        super().__init__()
        self.number_filters = number_filters

        # Output length of each stride-2, kernel-3, unpadded convolution.
        len_after_conv1 = (input_size - 3) // 2 + 1
        len_after_conv2 = (len_after_conv1 - 3) // 2 + 1
        if len_after_conv2 < 1:
            raise ValueError(
                "input_size={} is too short for two stride-2 convolutions "
                "with kernel size 3".format(input_size)
            )
        self.conv_out_len = len_after_conv2
        flat_features = len_after_conv2 * number_filters

        # A stride-2 convolution floors its output length, so the matching
        # transposed convolution comes out one sample short whenever
        # (length - 3) is odd. output_padding restores that sample so the
        # reconstruction has exactly the input length (1000 -> 1000, not 999).
        out_pad_inner = (len_after_conv1 - 3) % 2
        out_pad_outer = (input_size - 3) % 2

        # encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(1, number_filters*2, kernel_size=3, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv1d(number_filters*2, number_filters, kernel_size=3, stride=2, padding=0),
            nn.ReLU(),
        )
        # bottleneck layers
        self.fcencoder = nn.Linear(flat_features, encoding_dim)
        self.fcdecoder = nn.Linear(encoding_dim, flat_features)

        # decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(number_filters, number_filters*2, kernel_size=3, stride=2,
                               padding=0, output_padding=out_pad_inner),
            nn.ReLU(),
            nn.ConvTranspose1d(number_filters*2, 1, kernel_size=3, stride=2,
                               padding=0, output_padding=out_pad_outer),
            nn.Sigmoid() # the feature values are between 0 and 1
        )

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, input_size)."""
        x = x.unsqueeze(1)  # add the single input channel
        x = self.encoder(x)
        # Flatten per sample; unlike view(-1, ...), a length mismatch fails
        # loudly in the linear layer instead of silently regrouping the batch.
        x = x.flatten(start_dim=1)
        x = self.fcencoder(x)
        x = self.fcdecoder(x)
        x = x.view(-1, self.number_filters, self.conv_out_len)
        x = self.decoder(x)
        x = x.squeeze(1)
        return x
    
# contractive loss function
def contractive_loss(W, x, recons_x, h, lam=1e-4):
    """Return the reconstruction MSE plus a contractive penalty.

    The penalty is ``lam * sum_n sum_j (h_nj * (1 - h_nj))**2 * sum_i W_ji**2``.

    Args:
        W: 2-D weight tensor; its first dimension must match the second
            dimension of ``h`` (required by the matrix product below).
        x: original inputs.
        recons_x: reconstructions of ``x``.
        h: 2-D hidden activations. The ``h * (1 - h)`` term is the sigmoid
            derivative, so this presumably expects sigmoid outputs; confirm
            against the caller.
        lam: weight of the contractive term.

    Returns:
        A tensor of shape ``(1,)`` holding the combined loss.
    """
    mse_loss = nn.MSELoss()(recons_x, x)

    dh = h * (1 - h)  # derivative of the sigmoid, written in terms of its output
    # `Variable` was never imported here and is a deprecated wrapper; tensors
    # can be used directly.
    w_sum = torch.sum(W ** 2, dim=1).unsqueeze(1)  # column vector for the mm below
    penalty = torch.sum(torch.mm(dh ** 2, w_sum), 0)
    return mse_loss + lam * penalty

## load data

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil

# Unpack the task bundle first, then the per-file CSV archives found inside it.
# Everything is extracted into /content/data.
# NOTE(review): the source paths are relative, so this presumably relies on the
# working directory being /content — confirm.
archives = (
    "drive/MyDrive/SolarEnergyMaterials/task4.zip",
    "data/task4_hr35z9/pretrain_features.csv.zip",
    "data/task4_hr35z9/pretrain_labels.csv.zip",
    "data/task4_hr35z9/train_features.csv.zip",
    "data/task4_hr35z9/train_labels.csv.zip",
)
for archive in archives:
    shutil.unpack_archive(archive, "/content/data")

In [8]:
def load_pretrain_data(batch_size = 64):
    """Load the pretraining CSVs and split them into train and test loaders.

    Reads ``data/pretrain_features.csv`` and ``data/pretrain_labels.csv``,
    skipping the header row of each. The first two feature columns are dropped
    (the molecule representation is not used, only the extracted features)
    and the label is taken from the second column of the labels file.

    The test set is a fixed pseudo-random choice of 10000 row indices out of
    ``range(50000)`` (seed 17); every other row goes to the training set.

    Args:
        batch_size: batch size used for both returned loaders.

    Returns:
        ``(train_loader, test_loader)``. The train loader shuffles, the test
        loader does not.

    Raises:
        ValueError: if the two files contain a different number of data rows.
    """
    pre_train_size = 50000
    test_size = 10000

    # Fixed seed so the same rows end up in the test split on every call.
    # Note that this reseeds the module-level `random` generator.
    random.seed(17)
    test_ind = set()
    while len(test_ind) < test_size:
        test_ind.add(random.randint(0, pre_train_size-1))

    with open("data/pretrain_features.csv", 'r') as f:
        next(f, None)  # skip header
        features = [list(map(float, row.split(',')[2:])) for row in f if row.strip()]

    with open("data/pretrain_labels.csv", 'r') as f:
        next(f, None)  # skip header
        labels = [float(row.split(',')[1]) for row in f if row.strip()]

    if len(features) != len(labels):
        raise ValueError(
            "feature/label row count mismatch: {} vs {}".format(len(features), len(labels))
        )

    train_features = []
    train_labels = []
    test_features = []
    test_labels = []

    for i, (feature_row, label) in enumerate(zip(features, labels)):
        if i in test_ind:
            test_features.append(feature_row)
            test_labels.append(label)
        else:
            train_features.append(feature_row)
            train_labels.append(label)

    # The features are deliberately not normalized: the data is very sparse,
    # so normalization does not seem to make sense.

    # convert into tensor datasets
    train_features = torch.tensor(train_features, dtype=torch.float)
    train_labels = torch.tensor(train_labels, dtype=torch.float)
    test_features = torch.tensor(test_features, dtype=torch.float)
    test_labels = torch.tensor(test_labels, dtype=torch.float)

    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    test_dataset = torch.utils.data.TensorDataset(test_features, test_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [17]:
# Build the training and held-out loaders from the pretraining CSVs.
train_loader, test_loader = load_pretrain_data(batch_size = 64)

## Train/Test loop

In [None]:
# train loop:
def train_linear_encoder(model, dataloader, epochs):
    """Train ``model`` to reconstruct its inputs with Adam (lr=0.001) and MSE loss.

    Args:
        model: module whose output is compared against its own input.
        dataloader: iterable of ``(features, labels)`` batches; the labels are
            ignored. It must support ``len()`` for the progress message.
        epochs: number of passes over ``dataloader``.

    Prints the current batch loss on the first batch of each epoch and every
    100 batches after that.
    """
    adam = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    progress = 'Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'

    for epoch_no in range(1, epochs + 1):
        for batch_no, (inputs, _) in enumerate(dataloader, start=1):
            adam.zero_grad()
            reconstruction = model(inputs)
            batch_loss = criterion(reconstruction, inputs)
            batch_loss.backward()
            adam.step()

            # Report on batches 1, 101, 201, ...
            if (batch_no - 1) % 100 == 0:
                print(progress.format(epoch_no, epochs, batch_no, len(dataloader), batch_loss.item()))

## Linear Autoencoder:

In [None]:
# Linear autoencoder with input size 1000 and encoding dimension 128, trained for 10 epochs.
LinearEncoder = RM.LinearAutoencoder(1000, 128)
train_linear_encoder(LinearEncoder, train_loader, 10)

## Convolutional Autoencoder:

In [60]:
# Convolutional autoencoder with 32 filters, trained for 10 epochs.
ConvEncoder = RM.ConvAutoencoder(32)
# NOTE(review): `test` is not used in this cell; it looks like leftover debugging.
test = next(iter(train_loader))
# train_linear_encoder only compares model(X) with X, so it is reused here for the conv model.
train_linear_encoder(ConvEncoder, train_loader, 10)

Epoch [1/10], Batch [1/625], Loss: 0.2654
Epoch [1/10], Batch [101/625], Loss: 0.0485
Epoch [1/10], Batch [201/625], Loss: 0.0480
Epoch [1/10], Batch [301/625], Loss: 0.0474
Epoch [1/10], Batch [401/625], Loss: 0.0482
Epoch [1/10], Batch [501/625], Loss: 0.0465
Epoch [1/10], Batch [601/625], Loss: 0.0477
Epoch [2/10], Batch [1/625], Loss: 0.0485
Epoch [2/10], Batch [101/625], Loss: 0.0474
Epoch [2/10], Batch [201/625], Loss: 0.0476
Epoch [2/10], Batch [301/625], Loss: 0.0484
Epoch [2/10], Batch [401/625], Loss: 0.0469
Epoch [2/10], Batch [501/625], Loss: 0.0492
Epoch [2/10], Batch [601/625], Loss: 0.0480
Epoch [3/10], Batch [1/625], Loss: 0.0474
Epoch [3/10], Batch [101/625], Loss: 0.0483
Epoch [3/10], Batch [201/625], Loss: 0.0407
Epoch [3/10], Batch [301/625], Loss: 0.0203
Epoch [3/10], Batch [401/625], Loss: 0.0225
Epoch [3/10], Batch [501/625], Loss: 0.0220
Epoch [3/10], Batch [601/625], Loss: 0.0221
Epoch [4/10], Batch [1/625], Loss: 0.0230
Epoch [4/10], Batch [101/625], Loss: 0.0

KeyboardInterrupt: 

## Convolutional Autoencoder with Linear Layer:

In [75]:
# Reload through the `RM` alias: the module is imported as `RM`, so the bare
# name `RepresentationModels` is not bound in a fresh kernel.
importlib.reload(RM)
ConvLinearEncoder = RM.ConvLinearAutoencoder(6, 90)
# Smoke test: run one batch of features through the untrained model.
test = next(iter(train_loader))[0]
p = ConvLinearEncoder(test)

torch.Size([64, 6, 249])
torch.Size([64, 6, 249])
