In [3]:
# nb_black: notebook extension that auto-formats each executed cell with black.
%load_ext nb_black
# autoreload mode 2: re-import modules before each cell runs, so edits to the
# local .py modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [4]:
import data
from collections import Counter
import retrieval as ret
import lstm
from trainer import Trainer
import pandas as pd
import numpy as np
import torch
import mlp
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split

# torch.multiprocessing.set_start_method("spawn", force=True)  # multiprocessing
# from sentence_transformers import SentenceTransformer, util
# from transformers import AutoTokenizer, AutoModelForMaskedLM
# import simpletransformers
# import spacy
# import pytextrank

<IPython.core.display.Javascript object>

In [5]:
# outdir = "../data/clean/"
# for file in tqdm(index.keys()):
#     wiki = data.get_wiki(file)
#     lines = wiki["lines"].apply(lambda l: "<SPLIT>".join(data.clean_article(l)))
#     wiki["text"] = lines
#     wiki = wiki.drop("lines", axis=1).reset_index()
#     new_file = outdir + file.split("/")[-1]
#     wiki.to_json(new_file, orient="records", lines=True)

<IPython.core.display.Javascript object>

In [6]:
# Fixed seed so the train/test partition is the same on every Restart & Run All;
# without it the held-out rows (and every metric below) change per run.
SPLIT_SEED = 42

# Load the training claims, then explode "evidence" so that each row carries a
# single evidence entry. reset_index() turns the pre-explode index into a column.
train = data.get_train("../data/train.jsonl")
train = train.explode("evidence").reset_index()
# train_test_split uses its default test size here.
# NOTE(review): the split happens after the explode, so rows that came from the
# same claim can land on both sides of the split — confirm this is acceptable.
train, test = train_test_split(train, random_state=SPLIT_SEED)

<IPython.core.display.Javascript object>

In [7]:
# Sentence embedder from the local `retrieval` module, built from the model name
# "distilroberta-base-msmarco-v2". The same instance is passed to every dataset
# constructed later in the notebook.
# NOTE(review): presumably a sentence-transformers checkpoint name — confirm in
# retrieval.SentEmbed.
embedder = ret.SentEmbed("distilroberta-base-msmarco-v2")

<IPython.core.display.Javascript object>

In [8]:
# Wrap both splits in a SentenceDataset sharing the embedder, the wiki database
# path and the trailing positional argument (4). The train split is built first,
# then the test split.
train_dataset, test_dataset = (
    data.SentenceDataset(split, embedder, "../data/wiki.db", 4)
    for split in (train, test)
)

<IPython.core.display.Javascript object>

In [9]:
# Training batches: reshuffled on every pass, collated by the training dataset.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=train_dataset.collate,
    num_workers=0,
)
# Held-out batches: kept in a fixed order, matching the other evaluation loaders
# in this notebook (small_test_loader, mlp_test_loader), so evaluation runs
# visit the data identically each time.
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=test_dataset.collate,
    num_workers=0,  # doesn't work with more than 1 and a sqlite connection
)

<IPython.core.display.Javascript object>

In [11]:
# --- Runtime -----------------------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

# --- BiLSTM architecture -----------------------------------------------------
# Input width is whatever the sentence embedder reports for its vectors.
EMBEDDING_DIM = embedder.model.get_sentence_embedding_dimension()
HIDDEN_DIM = 100
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.1
OUTPUT_DIM = 3  # refute, not enough info, support

# --- Optimisation ------------------------------------------------------------
N_EPOCHS = 3
LR = 0.001
LR_DECAY = 0.001
WEIGHT_DECAY = 0.0001

<IPython.core.display.Javascript object>

In [12]:
# Build the BiLSTM classifier from the hyper-parameters set in the previous
# cell; the input padding index is taken from the training dataset.
model = lstm.LSTMClassifier(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    bidirectional=BIDIRECTIONAL,
    pad_idx=train_dataset.input_pad_idx,
)
model.to(device)
# Restore previously trained weights. map_location remaps the stored tensors
# onto `device`, so a checkpoint written from a GPU session still loads when
# this notebook falls back to the CPU.
state_dict = torch.load("../models/bilstm-nli-model-2.pt", map_location=device)
model.load_state_dict(state_dict)
# Adam with L2 weight decay over all model parameters.
optimizer = optim.Adam(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LR)
# Cross-entropy summed (not averaged) over the batch; targets equal to the
# dataset's output padding index are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(
    ignore_index=train_dataset.output_pad_idx,
    reduction="sum",
)

<IPython.core.display.Javascript object>

In [13]:
# Class id -> class name lookup, passed to the trainer's evaluate/fit calls.
labels = dict(enumerate(("REFUTES", "NOT ENOUGH INFO", "SUPPORT")))
# Trainer for the BiLSTM: model, optimizer and loss on the selected device,
# with log_every_n set to 1.
trainer = Trainer(model, optimizer, loss_fn, device, log_every_n=1)

<IPython.core.display.Javascript object>

In [151]:
# trainer.fit(
#     train_loader=train_loader,
#     valid_loader=test_loader,
#     labels=labels,
#     n_epochs=N_EPOCHS,
# )

<IPython.core.display.Javascript object>

In [152]:
# torch.save(model.state_dict(), "../models/bilstm-nli-model-3.pt")

<IPython.core.display.Javascript object>

In [14]:
# Small test dataset/loader: first 1000 rows of the held-out split, for a quick
# evaluation pass.
small_test_dataset = data.SentenceDataset(test[:1000], embedder, "../data/wiki.db", 4)
# The loader collates with the dataset it actually wraps (as every other loader
# in this notebook does) instead of borrowing test_dataset's collate method.
small_test_loader = DataLoader(
    small_test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=small_test_dataset.collate,
    num_workers=0,  # doesn't work with more than 1 and a sqlite connection
)

<IPython.core.display.Javascript object>

In [15]:
# Evaluate the BiLSTM trainer on the 1000-row test subset. The call prints the
# evaluation report shown in this cell's output and returns two values, kept
# here as `loss` and `running_loss`.
loss, running_loss = trainer.evaluate(small_test_loader, labels)

100%|██████████| 215/215 [01:22<00:00,  2.61it/s]

Evaluation loss: 11.962440080420919
Classification report after epoch:
Evidence accuracy: 0.320139697322468
Number correct: 275 out of 859
Fever score: 0.36321303841676367
Number right: 312 out of 859
Label accuracy: 0.6507566938300349





<IPython.core.display.Javascript object>

In [21]:
# Toy example: one claim plus five candidate evidence sentences (a-e).
claim = "George Lucas and Mark Hamill have worked together"
a, b, c = (
    "Luke Skywalker is a protagonist in Star Wars",
    "George Lucas directs Star Wars",
    "Mark Hamill plays Luke Skywalker",
)
# d is the conjunction of the two partial facts b and c.
d = " and ".join((b, c))
e = "Mark Hamill and George Lucas were part of Star Wars"

<IPython.core.display.Javascript object>

In [22]:
# Score the claim against each candidate sentence. The displayed tensor holds
# one value per candidate, in the order the candidates are listed.
# NOTE(review): presumably an embedding similarity score — confirm in
# retrieval.SentEmbed.compare.
embedder.compare(claim, [a, b, c, d, e])

tensor([[0.1483, 0.3690, 0.4195, 0.4155, 0.7016]])

<IPython.core.display.Javascript object>

In [117]:
# MLP variant of the data pipeline: the full training split and the first 100
# rows of the held-out split, each wrapped with the shared embedder and wiki
# database.
mlp_train_dataset, mlp_test_dataset = (
    data.MLPSentenceDataset(rows, embedder, "../data/wiki.db", 4)
    for rows in (train, test[:100])
)

# Large shuffled batches for training.
mlp_train_loader = DataLoader(
    dataset=mlp_train_dataset,
    collate_fn=mlp_train_dataset.collate,
    batch_size=512,
    shuffle=True,
    num_workers=0,
)
# Small, fixed-order batches for validation.
mlp_test_loader = DataLoader(
    dataset=mlp_test_dataset,
    collate_fn=mlp_test_dataset.collate,
    batch_size=20,
    shuffle=False,
    num_workers=0,  # doesn't work with more than 1 and a sqlite connection
)


<IPython.core.display.Javascript object>

In [118]:
# General
# Use the GPU when torch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model params
# These rebind several names already set for the BiLSTM earlier in the notebook
# (EMBEDDING_DIM, OUTPUT_DIM, DROPOUT, ...), so re-running BiLSTM cells after
# this one picks up the MLP values.
EMBEDDING_DIM = embedder.model.get_sentence_embedding_dimension()
# NOTE(review): the MLPClassifier repr shown in a later cell's output lists only
# fc1 (-> 300) and fc2 (300 -> 3); the second entry (50) appears unused —
# confirm in mlp.MLPClassifier.
HIDDEN_DIMS = [300, 50]
OUTPUT_DIM = 3  # refute, not enough info, support
DROPOUT = 1e-1
# Loss fn params
WEIGHT_DECAY = 1e-4
N_EPOCHS = 3
LR = 1e-2
# LR_DECAY is not referenced in any cell visible in this notebook section.
LR_DECAY = 1e-3
# Class weights
# NOTE(review): `train` was exploded on "evidence" when it was loaded, so len()
# here measures a single evidence entry rather than the number of evidence
# entries per claim — confirm this matches the intent described below.
lens = train["evidence"].apply(len)
# Wrap each label in a one-element list so that `list * int` repeats it.
labs = train["label"].apply(lambda x: [x])
# Assuming we have 15 selected sentences per claim (and 100% recall),
# X of them would carry the claim's own label (X = `lens`) and the remaining
# 15 - X would be NOT ENOUGH INFO (class 1). The class frequencies are adjusted
# to account for that.
nei = train["label"].apply(lambda x: [1])
# Element-wise list arithmetic: (15 - lens) copies of class 1 concatenated with
# `lens` copies of the row's label. explode() flattens to one label per row,
# value_counts() counts each class and sort_index() orders the counts by class.
frequencies = ((15 - lens) * nei + (labs * lens)).explode().value_counts().sort_index()
# Weight per class = 1 + softmax(-log2(count)): classes with fewer counts get a
# larger weight, and every weight lies between 1 and 2.
class_weights = 1 + torch.softmax(-torch.log2(torch.Tensor(frequencies)), dim=0).to(
    device
)

<IPython.core.display.Javascript object>

In [119]:
# Instantiate the MLP classifier and move it to the selected device in a single
# expression (Module.to returns the module itself).
mlp_model = mlp.MLPClassifier(
    pad_idx=mlp_train_dataset.input_pad_idx,
    dropout=DROPOUT,
    output_dim=OUTPUT_DIM,
    hidden_dims=HIDDEN_DIMS,
    embedding_dim=EMBEDDING_DIM,
).to(device)
# Bare name as the last expression so the cell still displays the model repr.
mlp_model

MLPClassifier(
  (fc1): Linear(in_features=1537, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=3, bias=True)
  (activation): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)

<IPython.core.display.Javascript object>

In [120]:
# Adam over the MLP's parameters. This rebinds `optimizer`, which previously
# referred to the BiLSTM's optimizer.
optimizer = optim.Adam(mlp_model.parameters(), weight_decay=WEIGHT_DECAY, lr=LR)
# Summed cross-entropy, weighted per class by `class_weights`; targets equal to
# ignore_index are excluded from the loss. Rebinds `loss_fn` as well.
# NOTE(review): ignore_index is read from `train_dataset` (the BiLSTM dataset),
# not `mlp_train_dataset` — confirm both datasets use the same output_pad_idx.
loss_fn = torch.nn.CrossEntropyLoss(
    ignore_index=train_dataset.output_pad_idx, reduction="sum", weight=class_weights
)

<IPython.core.display.Javascript object>

In [1]:
# Class id -> class name lookup passed to the trainer.
labels = {
    0: "REFUTES",
    1: "NOT ENOUGH INFO",
    2: "SUPPORT",
}
# Rebind `trainer` to the MLP model together with the current optimizer and
# loss function, with log_every_n set to 1.
trainer = Trainer(mlp_model, optimizer, loss_fn, device, log_every_n=1)
# Train for N_EPOCHS, validating on the MLP test loader.
trainer.fit(
    n_epochs=N_EPOCHS,
    labels=labels,
    valid_loader=mlp_test_loader,
    train_loader=mlp_train_loader,
)