## Plan of action.
We have too little data to train a model dircetly on the data. It will either be too stupid or terribly overfit.

To work with this, we will use different data, which is label with that slighly correspond to the labels we are interested in. We will train the net, then remove the last layers then and add new layers ontop. Then finetune.

Ideas:
1. don't use pooling but stride to reduce amount of parameters.
2. normailze
3. batch norm
4. dropout

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [64]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## get data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import shutil
shutil.unpack_archive("drive/MyDrive/SolarEnergyMaterials/task4.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_features.csv.zip", "/content/data")
shutil.unpack_archive("data/task4_hr35z9/pretrain_labels.csv.zip", "/content/data")

In [7]:
import random
import numpy as np
import pandas as pd
import torch
EPSILON = 1e-10
def load_pretrain_data(batch_size = 64):
    batch_size = 64

    random.seed(17)
    test_ind = set()

    pre_train_size = 50000

    while len(test_ind) < 10000: 
        test_ind.add(random.randint(0, pre_train_size-1))

    features =[]
    labels = []

    with open("data/pretrain_features.csv", 'r') as f:
        for row in f:
            features.append(row)

    with open("data/pretrain_labels.csv", 'r') as f:
        for row in f:
            labels.append(row)

    # remove header
    features = features[1:]
    labels = labels[1:]

    # first try to note use representation of the molecules, only the extracted features
    features = [list(map(float,row.split(',')[2:])) for row in features]
    labels = [float(row.split(',')[1]) for row in labels]

    train_features = []
    train_labels = []
    test_features = []
    test_labels = []


    for i in range(len(features)):
        if i in test_ind:
            test_features.append(features[i])
            test_labels.append(labels[i])
        else:
            train_features.append(features[i])
            train_labels.append(labels[i])

    # does not seem to make sense to normalize the data since it is very sparse
    # normalize train_features
    # train_features = (train_features - np.mean(train_features, axis=0)) / (np.std(train_features, axis=0)+EPSILON)

    # normalize test_features
    # test_features = (test_features - np.mean(test_features, axis=0)) / (np.std(test_features, axis=0)+EPSILON)

    # convert into tensor dataset
    train_features = torch.tensor(train_features, dtype=torch.float)
    train_labels = torch.tensor(train_labels, dtype=torch.float)
    test_features = torch.tensor(test_features, dtype=torch.float)
    test_labels = torch.tensor(test_labels, dtype=torch.float)

    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    test_dataset = torch.utils.data.TensorDataset(test_features, test_labels) 
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [8]:
train_loader, test_loader = load_pretrain_data(batch_size = 64)

## train model


In [66]:
# train loop
def train_model(model, data_loader, epochs, lr=0.1, optim=None, weight_decay=0.001):
  model.to(device)
  if optim is None:
    optimizer = torch.optim.SGD(params=model.parameters(), lr=lr, weight_decay=weight_decay)
  else:
    optimizer = optim(model.parameters(), lr=lr, weight_decay=weight_decay)

  loss_fn = nn.MSELoss()
  epoch_loss = []
  for epoch in range(epochs):
    epoch_loss.append(0)
    for batch, (X, y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X)
      loss = loss_fn(y_pred, y)
      epoch_loss[-1] += loss.item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print(f"average batch loss in {epoch+1}: {epoch_loss[-1]/len(data_loader)}")
      



# test loop
def test_model(model, data_loader):
  wrong = 0
  right = 0
  loss_fn = nn.MSELoss() 
  model.to(device)
  with torch.no_grad():
    for batch, (X,y) in enumerate(data_loader):
      X = X.to(device)
      y = y.to(device)
      y_pred = model(X)
      y_diff = y-y_pred
      for diff in y_diff:
        if abs(diff) > 0.1:
          wrong+=1
        else:
          right+=1
      loss = loss_fn(y_pred, y)
    print(f"average batch loss: {loss.item()/len(data_loader)} | accuracy: {right}/{right+wrong} | accuracy in percent {100*right/(right+wrong)}")

In [43]:
class net(nn.Module):
    def __init__(self):
        torch.manual_seed = 17
        super(net, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, stride=2, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, stride=2, kernel_size=3)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=120, stride=2, kernel_size=3)
        self.fc1 = nn.Linear(120*124, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = x.unsqueeze_(dim=1)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(-1, 120*124)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.squeeze_(dim=1)
        return x
    
# using batch normalization
class normalized_net(nn.Module):
    def __init__(self):
        torch.manual_seed = 17
        super(normalized_net, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=6, stride=2, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(6)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, stride=2, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, stride=2, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(32)
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, stride=2, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(64)
        self.conv5 = nn.Conv1d(in_channels=64, out_channels=128, stride=2, kernel_size=3)
        self.bn5 = nn.BatchNorm1d(128)
        self.fc1 = nn.Linear(128*30, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = x.unsqueeze_(dim=1) # need the x = x.unsqueeze_(dim=1) so gradient computation works
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        x = torch.relu(self.bn3(self.conv3(x)))
        x = torch.relu(self.bn4(self.conv4(x)))
        x = torch.relu(self.bn5(self.conv5(x)))
        x = x.view(-1, 128*30)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.squeeze_(dim=1)    
        return x


In [39]:
test = next(iter(train_loader))[0]
dev_model = normalized_net()
out = dev_model(test)

torch.Size([64, 128, 30])


In [60]:
dev_model =  normalized_net()
train_model(dev_model, train_loader, epochs=15, optim = torch.optim.Adam, lr=0.001, weight_decay=0.002)
print('---')
test_model(dev_model, train_loader)
print('---')
test_model(dev_model, test_loader)

average batch loss in 1: 0.04832211886197329
average batch loss in 2: 0.015955748527497052
average batch loss in 3: 0.0138593875028193
average batch loss in 4: 0.013848679214715958
average batch loss in 5: 0.013406210947781801
average batch loss in 6: 0.013128400990366936
average batch loss in 7: 0.013793663085997105
average batch loss in 8: 0.013128213630616664
average batch loss in 9: 0.013136810804903507
average batch loss in 10: 0.01320596416592598
average batch loss in 11: 0.013335453416407109
average batch loss in 12: 0.012644454278796912
average batch loss in 13: 0.012736681837588549
average batch loss in 14: 0.012520046799629926
average batch loss in 15: 0.012515485768765211
---
average batch loss: 1.4018318057060243e-05 | accuracy: 27602/40000 | accuracy in percent 69.005
---
average batch loss: 7.285112455771986e-05 | accuracy: 6773/10000 | accuracy in percent 67.73


In [67]:
dev_model =  normalized_net()

train_model(dev_model, train_loader, epochs=15, optim = torch.optim.Adagrad, lr=0.001, weight_decay=0.001)
print('---')
test_model(dev_model, train_loader)
print('---')
test_model(dev_model, test_loader)


average batch loss in 1: 0.052456893715262416
average batch loss in 2: 0.018803144995868205
average batch loss in 3: 0.015965714767575263
average batch loss in 4: 0.014213927049189806
average batch loss in 5: 0.013095104674994945
average batch loss in 6: 0.012271047741174698
average batch loss in 7: 0.01156871896237135
average batch loss in 8: 0.01102673379331827
average batch loss in 9: 0.01059519100189209
average batch loss in 10: 0.010167258009314537
average batch loss in 11: 0.009883010215312243
average batch loss in 12: 0.009588859386742116
average batch loss in 13: 0.009299005978554487
average batch loss in 14: 0.00901506588086486
average batch loss in 15: 0.008847360903769731
---
average batch loss: 9.972722083330155e-06 | accuracy: 29922/40000 | accuracy in percent 74.805
---
average batch loss: 8.344114016575418e-05 | accuracy: 7031/10000 | accuracy in percent 70.31


In [44]:
dev_model =  normalized_net()
train_model(dev_model, train_loader, epochs=15, lr=0.01)
print('---')
test_model(dev_model, train_loader)
print('---')
test_model(dev_model, test_loader)


average batch loss in 1: 0.4968699249774218
average batch loss in 2: 0.03491322933137417
average batch loss in 3: 0.02677759014368057
average batch loss in 4: 0.022675443471968173
average batch loss in 5: 0.01888897882774472
average batch loss in 6: 0.01708962717205286
average batch loss in 7: 0.014661124294251204
average batch loss in 8: 0.014020600125193596
average batch loss in 9: 0.013597100345045328
average batch loss in 10: 0.012233984691649675
average batch loss in 11: 0.012298735515773297
average batch loss in 12: 0.012327293568104506
average batch loss in 13: 0.01067632727175951
average batch loss in 14: 0.010132453045248985
average batch loss in 15: 0.010068597088754178
---
average batch loss: 9.880167245864868e-06 | accuracy: 30787/40000 | accuracy in percent 76.9675
---
average batch loss: 0.00010748181468362262 | accuracy: 7219/10000 | accuracy in percent 72.19


In [None]:
model = 