In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project folder that is later added
# to sys.path is addressed through this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
# Put the project folder on Drive at the front of the module search path
# (presumably this is where the `src` package imported below lives).
sys.path.insert(0,'/content/drive/MyDrive/altegrad_datachallenge')
from src.dataloader import load_data
import torch
from torchtext import data
from src.model import build_model
from src.train_manager import TrainManager
import numpy as np
import random

# Doc2Vec

In [74]:
def set_seed(seed: int, random_seed: int = None):
    """
    Seed the torch, numpy and Python ``random`` generators.

    torch and numpy are both seeded with ``seed``. The standard-library
    ``random`` module is seeded on its own with ``random_seed``, so its state
    can be varied while the torch/numpy seed stays fixed.

    :param seed: seed passed to ``torch.manual_seed`` and ``np.random.seed``
    :param random_seed: seed passed to ``random.seed``; when omitted it
        defaults to ``seed`` so that all three generators share one seed
    """
    if random_seed is None:
        random_seed = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(random_seed)

class Config:
    """
    Settings for the runs of the notebook's Doc2Vec section (split files ``d2v.*``).

    Every setting is stored as a plain instance attribute. How each one is
    consumed is decided by ``load_data``, ``build_model`` and ``TrainManager``,
    which receive the instance.

    :param name_model: stored unchanged as ``name_model``; the notebook passes
        a run name such as ``'model_d2v_0'``, or ``None`` when the config is
        only used to load data
    :param overrides: optional keyword arguments that replace the default of
        any attribute defined below, e.g. ``Config('run', lr=1e-3)``
    :raises TypeError: if an override names an attribute this class does not define
    """
    def __init__(self, name_model, **overrides):
        self.model_dir = "/content/drive/MyDrive/altegrad_datachallenge/"
        self.data_path = "data"
        self.splitpaths = ['d2v.train', 'd2v.valid', 'd2v.test']
        self.seed = 42
        self.author_dim = 128
        self.paper_dim = 256
        self.use_cuda = True
        # Training Parameters
        self.epochs = 60
        self.batch_size = 64
        self.eval_batch_size = 64
        self.validation_freq = 150
        self.training_freq = 50
        # Learning Rate Tunable
        self.lr = 0.0007
        self.patience = 1
        self.factor = 0.8
        # Model
        self.num_heads = 8
        self.num_layers = 2
        self.dropout = 0.4
        self.name_model = name_model

        # Reject misspelled names instead of silently creating a new attribute
        # that nothing reads.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f"Config got unknown setting(s): {', '.join(unknown)}")
        vars(self).update(overrides)

In [72]:
# name_model is None here: this Config is only handed to load_data.
cfg_data = Config(None)
train_data, dev_data = load_data(cfg_data)
# Pool the examples of both splits into one dataset (reusing train_data's
# fields) so the labeled data can be re-split later via total_data.split(...).
total_data = data.Dataset(train_data.examples + dev_data.examples, train_data.fields)

**To build an ensemble model, we split the labeled data into train/dev sets in 5 different ways: the original split plus 4 random re-splits.
We can add more splits, but we need to ensure that all of them give approximately the same performance.**

In [None]:
# Baseline run: train one model on the train/dev split returned by load_data.
cfg = Config('model_d2v_normal')
# Seed the generators before the model is built, as the ensemble loop does for
# its runs, so this run starts from the same RNG state every time.
set_seed(cfg.seed, cfg.seed)
model = build_model(cfg)
trainer = TrainManager(model, cfg)
trainer.train_and_validate(train_data, dev_data)

In [None]:
# Train one ensemble member per entry of split_seeds, each on its own re-split
# of total_data.
split_seeds = [0, 5, 10, 15]
for split_seed in split_seeds:
    # The run name carries the split seed.
    cfg = Config(f'model_d2v_{split_seed}')
    # torch and numpy get cfg.seed (the same for every member); only Python's
    # `random` module is seeded with split_seed.
    set_seed(cfg.seed, split_seed)
    # Pass the freshly seeded `random` state to split(); presumably this is
    # what makes the partition depend on split_seed - confirm against the
    # installed torchtext version.
    train_data_, dev_data_ = total_data.split(split_ratio=0.9, random_state=random.getstate())
    model = build_model(cfg)
    trainer = TrainManager(model, cfg)
    trainer.train_and_validate(train_data_, dev_data_)

# SentenceTransformers

In [None]:
# NOTE: this re-declares set_seed; from here on it replaces the definition
# given earlier in the notebook.
def set_seed(seed: int, random_seed: int = None):
    """
    Seed the torch, numpy and Python ``random`` generators.

    torch and numpy are both seeded with ``seed``. The standard-library
    ``random`` module is seeded on its own with ``random_seed``, so its state
    can be varied while the torch/numpy seed stays fixed.

    :param seed: seed passed to ``torch.manual_seed`` and ``np.random.seed``
    :param random_seed: seed passed to ``random.seed``; when omitted it
        defaults to ``seed`` so that all three generators share one seed
    """
    if random_seed is None:
        random_seed = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(random_seed)

class Config:
    """
    Settings for the runs of the notebook's SentenceTransformers section
    (split files ``data.*``).

    NOTE: this re-declares ``Config``; from here on it replaces the class
    defined earlier in the notebook.

    Every setting is stored as a plain instance attribute. How each one is
    consumed is decided by ``load_data``, ``build_model`` and ``TrainManager``,
    which receive the instance.

    :param name_model: stored unchanged as ``name_model``; the notebook passes
        a run name such as ``'model_ST_0'``, or ``None`` when the config is
        only used to load data
    :param overrides: optional keyword arguments that replace the default of
        any attribute defined below, e.g. ``Config('run', lr=1e-3)``
    :raises TypeError: if an override names an attribute this class does not define
    """
    def __init__(self, name_model, **overrides):
        self.model_dir = "/content/drive/MyDrive/altegrad_datachallenge/"
        self.data_path = "data"
        self.splitpaths = ['data.train', 'data.valid', 'data.test']
        self.seed = 42
        self.author_dim = 128
        self.paper_dim = 256
        self.use_cuda = True
        # Training Parameters
        self.epochs = 60
        self.batch_size = 64
        self.eval_batch_size = 64
        self.validation_freq = 150
        self.training_freq = 50
        # Learning Rate Tunable
        self.lr = 0.0007
        self.patience = 1
        self.factor = 0.8
        # Model
        self.num_heads = 8
        self.num_layers = 2
        self.dropout = 0.4
        self.name_model = name_model

        # Reject misspelled names instead of silently creating a new attribute
        # that nothing reads.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f"Config got unknown setting(s): {', '.join(unknown)}")
        vars(self).update(overrides)

In [None]:
# name_model is None here: this Config is only handed to load_data.
cfg_data = Config(None)
train_data, dev_data = load_data(cfg_data)
# Pool the examples of both splits into one dataset (reusing train_data's
# fields) so the labeled data can be re-split later via total_data.split(...).
total_data = data.Dataset(train_data.examples + dev_data.examples, train_data.fields)

In [None]:
# Baseline run: train one model on the train/dev split returned by load_data.
cfg = Config('model_ST_normal')
# Seed the generators before the model is built, as the ensemble loop does for
# its runs, so this run starts from the same RNG state every time.
set_seed(cfg.seed, cfg.seed)
model = build_model(cfg)
trainer = TrainManager(model, cfg)
trainer.train_and_validate(train_data, dev_data)

In [None]:
# Train one ensemble member per entry of split_seeds, each on its own re-split
# of total_data.
split_seeds = [0, 5, 10, 15]
for split_seed in split_seeds:
    # The run name carries the split seed.
    cfg = Config(f'model_ST_{split_seed}')
    # torch and numpy get cfg.seed (the same for every member); only Python's
    # `random` module is seeded with split_seed.
    set_seed(cfg.seed, split_seed)
    # Pass the freshly seeded `random` state to split(); presumably this is
    # what makes the partition depend on split_seed - confirm against the
    # installed torchtext version.
    train_data_, dev_data_ = total_data.split(split_ratio=0.9, random_state=random.getstate())
    model = build_model(cfg)
    trainer = TrainManager(model, cfg)
    trainer.train_and_validate(train_data_, dev_data_)