In [None]:
# Colab-only: mount Google Drive so the dataset and checkpoint paths under
# /content/drive used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.metrics import classification_report

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch

In [None]:
from tqdm import tqdm
import random

In [None]:
import gc

# Set SEED

In [None]:
# Seed Python's `random`, NumPy and PyTorch with the same value.
# `seed` is also reused as the checkpoint file name when the model is saved.
seed = 43
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Load the training frame (presumably the precomputed embeddings plus labels
# consumed by ExperimentDataset — confirm against the file that produced it).
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only load pickles that come from a trusted source.
train_df = pd.read_pickle('/content/drive/MyDrive/master_thesis/dataset_data/embedded_data/train_emb.pkl')

# Dataset

In [None]:
class ExperimentDataset(Dataset):
    """Map-style dataset backed by a DataFrame.

    Every row is expected to expose the entries 'abstract_embedding',
    'ChallengeDescription_embedding' and 'label'.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the row at position ``index``.

        The features are the two embeddings of the row joined end to end,
        abstract first.
        """
        row = self.df.iloc[index]
        pieces = [
            row[column]
            for column in ('abstract_embedding', 'ChallengeDescription_embedding')
        ]
        features = torch.tensor(np.concatenate(pieces))
        target = torch.tensor(row['label'])
        return features, target

# Model

In [None]:
from torch import nn
from torch import cat

class Classifier(nn.Module):
    """Feed-forward head that maps an input vector to a single logit.

    Three linear layers are applied in sequence, with dropout after each of
    the first two. The output is a raw logit (no sigmoid is applied here).

    NOTE(review): there is no activation function between the layers, so in
    eval mode the stack is equivalent to a single affine map. The checkpoint
    path used in this notebook contains "linear", which suggests this is an
    intentional linear baseline — confirm before adding non-linearities.

    Args:
        embedding_size: size of the last dimension of the input. The default
            of 1536 presumably matches two concatenated 768-d embeddings —
            confirm against the data.
        hidden_1: output width of the first linear layer.
        hidden_2: output width of the second linear layer.
        dropout: drop probability applied after both hidden layers.
    """

    def __init__(self, embedding_size=1536, hidden_1=512, hidden_2=128, dropout=0.2):
        super().__init__()
        # Attribute names and construction order are kept stable: they define
        # the state_dict keys and the order in which seeded init consumes RNG.
        self.dropout = nn.Dropout(dropout)
        self.h_1 = nn.Linear(embedding_size, hidden_1)
        self.h_2 = nn.Linear(hidden_1, hidden_2)
        self.o = nn.Linear(hidden_2, 1)

    def forward(self, input):
        """Return logits of shape ``(..., 1)`` for ``input`` of shape ``(..., embedding_size)``."""
        h1 = self.dropout(self.h_1(input))
        h2 = self.dropout(self.h_2(h1))
        return self.o(h2)

In [None]:
def train(model, train_dataloader, epochs=3):
    """Train ``model`` as a binary classifier with Adam and BCE-with-logits loss.

    The model is moved to CUDA when available and updated in place. The loss
    of every 20th batch is printed.

    Args:
        model: ``nn.Module`` that maps a float batch to one logit per sample;
            its output must match the labels after they are unsqueezed at
            dim 1 (i.e. shape ``(batch, 1)`` for 1-D label batches).
        train_dataloader: iterable yielding ``(inputs, labels)`` tensor pairs.
        epochs: number of passes over ``train_dataloader``.

    Returns:
        list[float]: mean batch loss of each epoch, in order. An epoch that
        saw no batches contributes ``nan``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer: the torch.optim docs
    # recommend that parameters already live on their final device when the
    # optimizer is constructed.
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(model.parameters())

    model.train()
    epoch_losses = []

    for epoch in range(epochs):
        print("Epoch: ", epoch)
        total_loss_train = 0.0
        batch_count = 0

        for i, (inputs, labels) in enumerate(tqdm(train_dataloader)):
            # Labels get a trailing dim so they line up with the (batch, 1) logits.
            train_label = labels.unsqueeze(1).float().to(device)
            output = model(inputs.float().to(device))

            batch_loss = criterion(output.float(), train_label)
            loss_value = batch_loss.item()
            total_loss_train += loss_value
            batch_count += 1

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            if i % 20 == 0:
                print(f"loss: {loss_value}")

        # Guard against an empty dataloader instead of dividing by zero.
        epoch_losses.append(
            total_loss_train / batch_count if batch_count else float("nan")
        )

    return epoch_losses


In [None]:
def make_pred_for_eval(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect sigmoid probabilities.

    The model is moved to CUDA when available and left in eval mode.

    Args:
        model: ``nn.Module`` returning logits for a float input batch.
        dataloader: iterable yielding ``(inputs, labels)`` pairs; the labels
            are ignored.

    Returns:
        list[numpy.ndarray]: one array per batch holding
        ``sigmoid(model(inputs))``, in iteration order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    pred = []
    with torch.no_grad():
        # Labels are not needed for prediction, so they are never moved to the device.
        for inputs, _labels in tqdm(dataloader):
            output = torch.sigmoid(model(inputs.float().to(device)))
            pred.append(output.cpu().numpy())
    return pred

In [None]:
# Wrap the training frame and serve it in shuffled mini-batches of 16.
train_ds = ExperimentDataset(train_df)
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True)

In [None]:
# Instantiate the classifier with its default layer sizes.
model = Classifier()

In [None]:
# Show the module structure.
print(model)

In [None]:
# Total number of elements across all model parameters.
print(sum(p.numel() for p in model.parameters()))

In [None]:
# Fit the model on the training batches using the default number of epochs.
train(model, train_dataloader)

In [None]:
# Persist the trained weights; the seed is the file name, presumably so that
# runs with different seeds do not overwrite each other.
# NOTE(review): if training ran on CUDA the saved tensors are CUDA tensors;
# loading on a CPU-only machine would need map_location — confirm the loader.
torch.save(model.state_dict(), f"/content/drive/MyDrive/master_thesis/bert_embedding_nn/linear/{seed}")