In [1]:
import fasttext
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer

from config import PROBLEM_TEST, CHECKPOINTS_DIR, ENGLISH_TRAIN, ENGLISH_TEST
from data import TextDataset, HatefulTweets
from functools import partial
from nn import train_model, BinaryMLP

from experiment import (
    run_repeated_labse_single,
    run_repeated_labse_multi,
    test_inference_time,
    calculate_memory_usage,
)

In [2]:
# Repeated single-dataset LaBSE experiment, labelled "labse_single_polish".
# No train/test paths are passed, so the function's own defaults are used
# (presumably the Polish dataset, going by the run name — confirm in experiment.py).
run_repeated_labse_single(name="labse_single_polish")

Global seed set to 1
Global seed set to 2
Global seed set to 3
Global seed set to 4
Global seed set to 5
Global seed set to 6
Global seed set to 7
Global seed set to 8
Global seed set to 9
Global seed set to 10


{'test/loss': '0.3486 ± 0.0164',
 'test/f1': '0.4444 ± 0.0283',
 'test/acc': '0.8822 ± 0.0066',
 'test/precision': '0.6048 ± 0.0451',
 'test/recall': '0.3515 ± 0.0218',
 'train/loss': '0.1310 ± 0.0354',
 'train/f1': '0.9113 ± 0.0285',
 'train/acc': '0.9845 ± 0.0053',
 'train/precision': '0.8894 ± 0.0444',
 'train/recall': '0.9350 ± 0.0143',
 'train_time': '17.9384 ± 1.8442'}

In [3]:
# Same single-dataset experiment entry point, labelled "labse_single_english"
# and pointed explicitly at the English train/test files from config.
run_repeated_labse_single(
    name="labse_single_english",
    train_path=ENGLISH_TRAIN,
    test_path=ENGLISH_TEST,
)

Global seed set to 1
Global seed set to 2
Global seed set to 3
Global seed set to 4
Global seed set to 5
Global seed set to 6
Global seed set to 7
Global seed set to 8
Global seed set to 9
Global seed set to 10


{'test/loss': '0.6740 ± 0.0334',
 'test/f1': '0.6120 ± 0.0128',
 'test/acc': '0.5952 ± 0.0358',
 'test/precision': '0.5193 ± 0.0349',
 'test/recall': '0.7614 ± 0.1021',
 'train/loss': '0.6136 ± 0.0354',
 'train/f1': '0.6276 ± 0.0347',
 'train/acc': '0.6580 ± 0.0318',
 'train/precision': '0.5812 ± 0.0333',
 'train/recall': '0.6838 ± 0.0502',
 'train_time': '8.4854 ± 0.4961'}

In [4]:
# Repeated multi-dataset LaBSE experiment, labelled "labse_multi", run with the
# function's default arguments. The recorded output reports separate metric
# groups keyed english_*, polish_pre_training_test and polish_*.
run_repeated_labse_multi(name="labse_multi")

Global seed set to 1
Global seed set to 2
Global seed set to 3
Global seed set to 4
Global seed set to 5
Global seed set to 6
Global seed set to 7
Global seed set to 8
Global seed set to 9
Global seed set to 10


{'english_test/loss': '0.6740 ± 0.0334',
 'english_test/f1': '0.6120 ± 0.0128',
 'english_test/acc': '0.5952 ± 0.0358',
 'english_test/precision': '0.5193 ± 0.0349',
 'english_test/recall': '0.7614 ± 0.1021',
 'english_train/loss': '0.6136 ± 0.0354',
 'english_train/f1': '0.6276 ± 0.0347',
 'english_train/acc': '0.6580 ± 0.0318',
 'english_train/precision': '0.5812 ± 0.0333',
 'english_train/recall': '0.6838 ± 0.0502',
 'polish_pre_training_test/loss': '0.5710 ± 0.0540',
 'polish_pre_training_test/f1': '0.2984 ± 0.0284',
 'polish_pre_training_test/acc': '0.7462 ± 0.0712',
 'polish_pre_training_test/precision': '0.2598 ± 0.0468',
 'polish_pre_training_test/recall': '0.4157 ± 0.1552',
 'polish_test/loss': '0.3433 ± 0.0142',
 'polish_test/f1': '0.4422 ± 0.0241',
 'polish_test/acc': '0.8831 ± 0.0043',
 'polish_test/precision': '0.6143 ± 0.0326',
 'polish_test/recall': '0.3463 ± 0.0264',
 'polish_train/loss': '0.1188 ± 0.0363',
 'polish_train/f1': '0.9204 ± 0.0277',
 'polish_train/acc': '0.

In [5]:
# Benchmark settings, named so they can be tuned in one place.
LABSE_MODEL_NAME = "sentence-transformers/LaBSE"
ENCODE_BATCH_SIZE = 128  # batch size used by the sentence encoder
LOADER_BATCH_SIZE = 128  # batch size fed to the classifier during timing
# Classifier hyper-parameters handed to load_from_checkpoint.
# NOTE(review): presumably these must match the values the checkpoint was
# trained with — confirm against the training code.
EMB_DIM = 768
HIDDEN_DIMS = [256, 128]
LEARNING_RATE = 1e-4

embeddings_model = SentenceTransformer(LABSE_MODEL_NAME)


def get_embeddings(x):
    """Encode ``x`` with the LaBSE sentence encoder and return the result on the CPU.

    Args:
        x: Input forwarded unchanged to ``embeddings_model.encode``.

    Returns:
        The encoder output requested as a tensor (``convert_to_tensor=True``),
        moved to the CPU with ``.cpu()``.
    """
    return embeddings_model.encode(
        x,
        convert_to_numpy=False,
        convert_to_tensor=True,
        batch_size=ENCODE_BATCH_SIZE,
    ).cpu()


dataset = TextDataset(PROBLEM_TEST, get_embeddings)
loader = DataLoader(
    dataset,
    batch_size=LOADER_BATCH_SIZE,
    pin_memory=True,
    shuffle=False,
    num_workers=0,
    # Kept from the original: the trailing partial batch is dropped, so every
    # batch the benchmark sees has the same size.
    drop_last=True,
)

checkpoint_file = CHECKPOINTS_DIR / "labse_single_polish_1.ckpt"
model = BinaryMLP.load_from_checkpoint(
    checkpoint_file,
    emb_dim=EMB_DIM,
    hidden_dims=HIDDEN_DIMS,
    learning_rate=LEARNING_RATE,
).cuda()
# Put the network in evaluation mode before it is benchmarked, so any
# train-only layers (e.g. dropout, if BinaryMLP uses them) are disabled.
# Harmless if the timing helper already does this itself.
model.eval()

In [6]:
# Time the loaded checkpoint over the test loader built in the previous cell.
# The helper returns a formatted "value ± spread" string; the unit and what
# one measurement covers are not visible here — check experiment.test_inference_time.
test_inference_time(model, loader)

'0.0002 ± 0.0001'

In [7]:
# Report the memory footprint of the loaded classifier; the helper returns a
# formatted string in MB (see recorded output). How the size is computed is
# defined in experiment.calculate_memory_usage.
calculate_memory_usage(model)

'0.883 MB'