## Plan of action.
We have too little data to train a model dircetly on the data. It will either be too stupid or terribly overfit.

To work with this, we will use different data, which is label with that slighly correspond to the labels we are interested in. We will train the net, then remove the last layers then and add new layers ontop. Then finetune.

Ideas:
1) don't use pooling but stride to reduce amount of parameters.
2) normailze
3) batch norm
4) dropout

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## get data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
shutil.unpack_archive("drive/MyDrive/SolarEnergyMaterials/task4_hr35z9.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_features.csv.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_labels.csv.zip", "/content/data")

In [16]:
import random
import numpy as np
import pandas as pd
import torch
EPSILON = 1e-10
def load_pretrain_data(batch_size = 64):
    batch_size = 64

    random.seed(17)
    test_ind = set()

    pre_train_size = 50000

    while len(test_ind) < 10000: 
        test_ind.add(random.randint(0, pre_train_size-1))

    features =[]
    labels = []

    with open("data/pretrain_features.csv", 'r') as f:
        for row in f:
            features.append(row)

    with open("data/pretrain_labels.csv", 'r') as f:
        for row in f:
            labels.append(row)

    # remove header
    features = features[1:]
    labels = labels[1:]

    # first try to note use representation of the molecules, only the extracted features
    features = [list(map(float,row.split(',')[2:])) for row in features]
    labels = [float(row.split(',')[1]) for row in labels]

    train_features = []
    train_labels = []
    test_features = []
    test_labels = []


    for i in range(len(features)):
        if i in test_ind:
            test_features.append(features[i])
            test_labels.append(labels[i])
        else:
            train_features.append(features[i])
            train_labels.append(labels[i])

    # does not seem to make sense to normalize the data since it is very sparse
    # normalize train_features
    # train_features = (train_features - np.mean(train_features, axis=0)) / (np.std(train_features, axis=0)+EPSILON)

    # normalize test_features
    # test_features = (test_features - np.mean(test_features, axis=0)) / (np.std(test_features, axis=0)+EPSILON)

    # convert into tensor dataset
    train_features = torch.tensor(train_features, dtype=torch.float)
    train_labels = torch.tensor(train_labels, dtype=torch.float)
    test_features = torch.tensor(test_features, dtype=torch.float)
    test_labels = torch.tensor(test_labels, dtype=torch.float)

    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    test_dataset = torch.utils.data.TensorDataset(test_features, test_labels) 
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [17]:
train_loader, test_loader = load_pretrain_data(batch_size = 64)

## train model


In [18]:
# train loop
def train_model(model, data_loader, epochs, lr=0.1, optim=None):
  if optim is None:
    optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
  else:
    optimizer = optim
  loss_fn = nn.MSELoss()
  
  model.to(device)
  for epoch in range(epochs):
    epoch_loss = 0
    for batch, (X, y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X)
      loss = loss_fn(y_pred, y)
      epoch_loss += loss.item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print(f"average batch loss in {epoch+1}: {epoch_loss/len(data_loader)}")
# test loop
def test_model(model, data_loader):
  loss_fn = nn.MSELoss() 
  model.to(device)
  with torch.no_grad():
    for batch, (X,y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X)
      loss = loss_fn(y_pred, y)
    print(f"average batch loss: {loss.item()/len(data_loader)}")

In [28]:
class net(nn.Module):
    def __init__(self):
        torch.manual_seed = 17
        super(net, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, stride=2, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, stride=2, kernel_size=3)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=120, stride=2, kernel_size=3)
        self.fc1 = nn.Linear(120*124, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = x.unsqueeze_(dim=1)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(-1, 120*124)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.squeeze_(dim=1)
        return x
    
# using batch normalization
class normalized_net(nn.Module):
    def __init__(self):
        torch.manual_seed = 17
        super(normalized_net, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, stride=2, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(6)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, stride=2, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=120, stride=2, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(120)
        self.fc1 = nn.Linear(120*124, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = x.unsqueeze_(dim=1) # need the x = x.unsqueeze_(dim=1) so gradient computation works
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        x = torch.relu(self.bn3(self.conv3(x)))
        x = x.view(-1, 120*124)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.squeeze_(dim=1)    
        return x


In [20]:
test = next(iter(train_loader))[0]
dev_model = net()
out = dev_model(test)
print(out.shape)

torch.Size([64])


In [22]:
dev_model = net()
test_model(dev_model, train_loader)
print('---')
train_model(dev_model, train_loader, epochs=10, lr=0.01)

average batch loss: 0.018515733337402345
---
average batch loss in 1: 0.21782863270044328
average batch loss in 2: 0.06911615184545516
average batch loss in 3: 0.03942899322360754
average batch loss in 4: 0.03278568062335253
average batch loss in 5: 0.02913047333359718
average batch loss in 6: 0.027561757965385913
average batch loss in 7: 0.026323740927875042
average batch loss in 8: 0.02579538719803095
average batch loss in 9: 0.025134900061786175
average batch loss in 10: 0.023965707942843438


In [27]:
dev_model =  normalized_net()
test_model(dev_model, train_loader)
print('---')
train_model(dev_model, train_loader, epochs=10, lr=0.01)

average batch loss: 0.017563113403320313
---
average batch loss in 1: 1.3660547191858292
average batch loss in 2: 0.08406449862122535
average batch loss in 3: 0.02069993316680193
average batch loss in 4: 0.01761895297318697
average batch loss in 5: 0.01619205273911357
average batch loss in 6: 0.015241541486978531
average batch loss in 7: 0.01458274949491024
average batch loss in 8: 0.013889670934528113
average batch loss in 9: 0.013930011636018752
average batch loss in 10: 0.013357451595366
