In [1]:
# Prerequisite: before running this notebook, run the following in a terminal:
# > make run-redis NS=train
# > make run-redis NS=test
# so that the notebook can access the train and test namespaces.

In [2]:
import os
import sys
import pandas as pd
import numpy as np

In [3]:
# Add the parent directory to the import path. The project imports in the next
# cell (misc, model, system) presumably live there — confirm the layout.
sys.path.append("..")
# NOTE(review): USER_PATH is presumably read by the project code; it is set
# here before any project module is imported — confirm which module consumes it.
os.environ["USER_PATH"] = "../userdata"

In [4]:
from misc.redis import set_redis_slow_mode
from misc.util import highest_number
from model.datagenerator import create_train_test
from system.namespace.store import get_namespace

In [5]:
import torch

# Probe once for a CUDA device. Later cells use this flag to pick the torch
# device, to scale batch/epoch sizes and to tag checkpoint file names.
is_cuda = torch.cuda.is_available()
is_cuda

True

In [6]:
# NOTE(review): presumably turns off the project's Redis "slow mode" handling
# for this session — confirm the exact semantics in misc.redis.
set_redis_slow_mode("never")
# Namespace handles for the held-out ("test") and training ("train") data;
# both are passed to create_train_test below.
ns_test = get_namespace("test")
ns_train = get_namespace("train")
# Fixed UTC reference timestamp handed to create_train_test as `now`.
now = pd.Timestamp("2022-12-17", tz="UTC")
# Learning plan for training, written as one compact row per entry and then
# expanded into the dict layout that create_train_test receives.
# Row layout: (left, right, min_text_length, skip_weak, first_epoch, weight),
# where left/right are (mode, flip_pc) pairs and left may be None.
# Every entry uses flip_lr=0.5 and last_epoch=None.
_TRAIN_PLAN_ROWS = [
    (("valid", 0.5), ("valid", 0.0), 20, False, 10, 100),
    (("valid", 0.5), ("valid", 0.0), None, True, 10, 100),
    (("random", 0.0), ("path", 0.0), 20, False, None, 60),
    (None, ("path", 0.0), 20, False, None, 40),
    (("random", 0.0), ("path", 0.0), None, True, 5, 60),
    (None, ("path", 0.0), None, True, 5, 40),
    (("random", 0.0), ("valid", 0.0), None, True, None, 60),
    (None, ("valid", 0.0), None, True, None, 40),
]
train_plan = [
    {
        "left": None if left is None else {"mode": left[0], "flip_pc": left[1]},
        "right": {"mode": right[0], "flip_pc": right[1]},
        "min_text_length": min_text_length,
        "skip_weak": skip_weak,
        "flip_lr": 0.5,
        "first_epoch": first_epoch,
        "last_epoch": None,
        "weight": weight,
    }
    for left, right, min_text_length, skip_weak, first_epoch, weight
    in _TRAIN_PLAN_ROWS
]
# Learning plan shared by all evaluation splits. It is the cross product of
# two filter settings (min_text_length=20 without skip_weak, then
# min_text_length=None with skip_weak) and two left-hand settings
# ("random" with weight 60, then no left entry with weight 40).
# The right-hand side is always mode "valid"; entries carry no epoch bounds.
eval_plan = [
    {
        "left": None if left_mode is None else {"mode": left_mode, "flip_pc": 0.0},
        "right": {"mode": "valid", "flip_pc": 0.0},
        "min_text_length": min_text_length,
        "skip_weak": skip_weak,
        "flip_lr": 0.5,
        "weight": weight,
    }
    for min_text_length, skip_weak in ((20, False), (None, True))
    for left_mode, weight in (("random", 60), (None, 40))
]
# Pick the hardware-dependent sizes once: smaller batches but far more of them
# on CUDA, and ten times larger evaluation splits.
if is_cuda:
    _batch_size, _epoch_batches, _eval_size = 4, 5000, 10000
else:
    _batch_size, _epoch_batches, _eval_size = 8, 500, 1000

# Build the train/test data generator. Training and train-validation both read
# from the "train" namespace, test and test-validation from the "test"
# namespace; all three evaluation splits share eval_plan.
ttgen = create_train_test(
    train_ns=ns_train,
    train_validation_ns=ns_train,
    test_ns=ns_test,
    test_validation_ns=ns_test,
    train_learning_plan=train_plan,
    train_val_learning_plan=eval_plan,
    test_learning_plan=eval_plan,
    test_val_learning_plan=eval_plan,
    batch_size=_batch_size,
    epoch_batches=_epoch_batches,
    train_val_size=_eval_size,
    test_size=_eval_size,
    test_val_size=_eval_size,
    compute_batch_size=100,  # identical on CPU and CUDA
    now=now,
)

In [7]:
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

In [8]:
# Run on the GPU when one was detected, otherwise fall back to the CPU.
device = torch.device("cuda" if is_cuda else "cpu")
device

device(type='cuda')

In [9]:
from typing import Literal, TypedDict

# Feature width used by the dense heads in Model.
# NOTE(review): presumably chosen to match the DistilBERT hidden size — confirm.
EMBED_SIZE = 768

# Keys under which the two sides of a pair are passed to Model.forward.
ProviderRole = Literal["child", "parent"]

# Tokenizer shared by all inputs; same pretrained name as the encoders in Model.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

class TokenizedInput(TypedDict):
    """Tokenizer output for one batch of texts: token ids plus attention mask."""

    input_ids: torch.Tensor
    attention_mask: torch.Tensor


def tokens(texts: list[str] | pd.Series) -> TokenizedInput:
    """Tokenize a batch of texts and move the resulting tensors to `device`.

    Args:
        texts: The texts to tokenize, as a pandas Series (what the callers in
            this notebook pass) or a plain list of strings.

    Returns:
        The tokenizer's output tensors keyed by name, padded to a common
        length, truncated, and placed on `device`.
    """
    # The original unconditionally called `.tolist()`, which fails with
    # AttributeError for the plain list the signature advertised. Keep
    # `.tolist()` for Series/array inputs and fall back to list() otherwise.
    batch = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    res = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    return {k: v.to(device) for k, v in res.items()}


class Noise(nn.Module):
    """Adds dropout-masked Gaussian noise to its input while training.

    In eval mode the module is the identity. In training mode it samples
    zero-mean Gaussian noise with standard deviation `std`, passes it through
    dropout with probability `p`, and adds the result to the input.
    """

    def __init__(self, std: float = 1.0, p: float = 0.5) -> None:
        super().__init__()
        self._std = std
        self._dropout = nn.Dropout(p)
        # One-element parameter that is never used in a computation; it only
        # follows the module across .to(device) so forward() can read the
        # device to sample the noise on.
        self._dhold = nn.Parameter(torch.Tensor([0.0]))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            noise = torch.normal(
                mean=0.0,
                std=self._std,
                size=x.shape,
                device=self._dhold.device)
            return x + self._dropout(noise)
        return x


class Model(nn.Module):
    """Scores a (parent, child) text pair with two separate DistilBERT encoders.

    Each side is encoded by its own encoder and reduced to the hidden state at
    the first token position. `version` selects the optional parts:

    * dense head on each side: versions 1 and >= 3
    * cosine similarity as the score: versions >= 2 (dot product otherwise)
    * training-time noise on the embeddings: versions >= 4
    """

    def __init__(self, version: int) -> None:
        super().__init__()
        self._bert_parent = DistilBertModel.from_pretrained(
            "distilbert-base-uncased")
        self._bert_child = DistilBertModel.from_pretrained(
            "distilbert-base-uncased")

        # Heads are built parent first, then child, so parameter
        # initialization consumes the RNG in a fixed order.
        has_dense = version == 1 or version >= 3
        self._pdense: nn.Sequential | None = (
            self._make_dense() if has_dense else None)
        self._cdense: nn.Sequential | None = (
            self._make_dense() if has_dense else None)

        # A single Noise module is shared by both sides.
        self._noise = Noise(std=0.01, p=0.5) if version >= 4 else None
        self._cos = torch.nn.CosineSimilarity() if version >= 2 else None
        self._version = version

    @staticmethod
    def _make_dense() -> nn.Sequential:
        """Build one Linear -> Dropout -> ReLU -> Linear head of width EMBED_SIZE."""
        return nn.Sequential(
            nn.Linear(EMBED_SIZE, EMBED_SIZE),
            nn.Dropout(p=0.5),
            nn.ReLU(),
            nn.Linear(EMBED_SIZE, EMBED_SIZE))

    def _embed(
            self,
            encoder: nn.Module,
            head: nn.Sequential | None,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor) -> torch.Tensor:
        """Encode one side and apply the optional dense head and noise."""
        encoded = encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the hidden state at the first token position.
        embedding = encoded.last_hidden_state[:, 0]
        if head is not None:
            embedding = head(embedding)
        if self._noise is not None:
            embedding = self._noise(embedding)
        return embedding

    def get_version(self) -> int:
        """Return the architecture version this model was built with."""
        return self._version

    def get_parent_embed(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor) -> torch.Tensor:
        """Embed tokenized parent texts with the parent encoder and head."""
        return self._embed(
            self._bert_parent, self._pdense, input_ids, attention_mask)

    def get_child_embed(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor) -> torch.Tensor:
        """Embed tokenized child texts with the child encoder and head."""
        return self._embed(
            self._bert_child, self._cdense, input_ids, attention_mask)

    def forward(self, x: dict[ProviderRole, TokenizedInput]) -> torch.Tensor:
        """Return one score per pair as a column of shape (batch, 1).

        Args:
            x: Tokenized inputs under the keys "parent" and "child".
        """
        parent_cls = self.get_parent_embed(
            input_ids=x["parent"]["input_ids"],
            attention_mask=x["parent"]["attention_mask"])
        child_cls = self.get_child_embed(
            input_ids=x["child"]["input_ids"],
            attention_mask=x["child"]["attention_mask"])
        if self._cos is None:
            # No cosine layer: per-row dot product via a batched
            # (1 x D) @ (D x 1) matrix product.
            batch_size = parent_cls.shape[0]
            dots = torch.bmm(
                parent_cls.reshape([batch_size, 1, -1]),
                child_cls.reshape([batch_size, -1, 1]))
            return dots.reshape([-1, 1])
        return self._cos(parent_cls, child_cls).reshape([-1, 1])


class TrainingHarness(nn.Module):
    """Wraps a Model for pairwise training: which of two pairs scores higher?

    The wrapped model scores a "left" and a "right" (parent, child) pair. The
    two scores are placed side by side, turned into probabilities with a
    softmax, and compared against the labels with binary cross-entropy.
    """

    def __init__(self, model: Model) -> None:
        super().__init__()
        self._model = model
        self._softmax = nn.Softmax(dim=1)
        self._loss = nn.BCELoss()

    def get_version(self) -> int:
        """Return the architecture version of the wrapped model."""
        return self._model.get_version()

    def forward(
            self,
            left: dict[ProviderRole, TokenizedInput],
            right: dict[ProviderRole, TokenizedInput],
            labels: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Score both pairs and compute the loss.

        Args:
            left: Tokenized "parent" and "child" inputs of the left pair. The
                annotation previously read `TokenizedInput`, but the value is
                passed unchanged to Model.forward, which takes the role-keyed
                dict.
            right: Same structure for the right pair.
            labels: Target tensor with the same shape as the predictions,
                i.e. two columns ordered (left, right).

        Returns:
            `(preds, loss)`: softmax probabilities with column 0 for the left
            pair and column 1 for the right pair, and the BCE loss of those
            probabilities against `labels`.
        """
        out_left = self._model(left)
        out_right = self._model(right)
        # Each model output is a (batch, 1) column; hstack yields (batch, 2).
        preds = self._softmax(torch.hstack((out_left, out_right)))
        return preds, self._loss(preds, labels)

In [10]:
from torch.optim import AdamW

model = Model(version=4)
model.to(device)
harness = TrainingHarness(model)
harness.to(device)

# When True, training starts from epoch 0 even if checkpoints exist; set to
# False to resume from the highest-numbered checkpoint.
FORCE_RESTART = True

folder = "checkpoints"
# Create the checkpoint directory up front. On a fresh checkout the
# os.listdir call below would otherwise raise FileNotFoundError.
os.makedirs(folder, exist_ok=True)
postfix = "_lg" if is_cuda else ""
version_tag = "" if harness.get_version() == 0 else f"_v{harness.get_version()}"
# Either None or a (file name, epoch number) pair for the latest checkpoint
# matching this version and hardware tag.
mprev = highest_number(os.listdir(folder), prefix=f"harness{version_tag}{postfix}_", postfix=".pkl")
if not FORCE_RESTART and mprev is not None:
    prev_fname, prev_epoch = mprev
    harness.load_state_dict(torch.load(os.path.join(folder, prev_fname), map_location=device))
    epoch_offset = prev_epoch + 1
else:
    epoch_offset = 0

# NOTE(review): only the harness weights are restored on resume. The optimizer
# is always created fresh, so its state restarts from scratch.
optimizer = AdamW(harness.parameters(), lr=5e-5)
mprev, epoch_offset

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- T

(('harness_v4_lg_8.pkl', 8), 0)

In [11]:
from transformers import get_scheduler
# from tqdm.notebook import tqdm
from tqdm.auto import tqdm
import evaluate

def compute(df):
    """Run the harness on one batch DataFrame and return `(preds, loss)`.

    Args:
        df: Batch with the text columns "parent_left", "child_left",
            "parent_right", "child_right" and the column "correct_is_right",
            which marks the rows where the right pair is the correct one.

    Returns:
        The `(preds, loss)` tuple produced by `harness`. Labels are one-hot
        with column 0 for "left is correct" and column 1 for "right is
        correct".
    """
    plefts = tokens(df["parent_left"])
    clefts = tokens(df["child_left"])
    prights = tokens(df["parent_right"])
    crights = tokens(df["child_right"])
    # Build the labels from a plain boolean array rather than handing pandas
    # Series to torch.tensor. `~` on a non-bool column would be a bitwise NOT
    # (1 -> -2) instead of a logical one, and converting Series through the
    # generic sequence path can depend on the index labels.
    is_right = df["correct_is_right"].to_numpy(dtype=bool)
    labels = torch.tensor(
        np.stack([~is_right, is_right], axis=1),
        dtype=torch.float32,
        device=device)
    return harness({"parent": plefts, "child": clefts}, {"parent": prights, "child": crights}, labels)

# Train up to the target epoch count, but always run at least 3 more epochs
# when resuming close to (or past) the target.
num_epochs = max((100 if is_cuda else 10) - epoch_offset, 3)
# NOTE(review): get_epoch_train_size() is also used as the tqdm total that the
# training loop advances by rows per batch, so it looks like a sample count,
# while the scheduler is stepped once per batch. If so, this overestimates the
# number of scheduler steps by the batch size — confirm and divide accordingly.
num_training_steps = num_epochs * ttgen.get_epoch_train_size()
warmup = 10000 if is_cuda else 10
# The scheduler expects the total number of steps including warmup. The
# original passed `num_training_steps - warmup`, which would bring the
# learning rate to zero `warmup` steps before the end of training.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=warmup,
    num_training_steps=num_training_steps)
ttgen.set_epoch(epoch_offset)

def _evaluate_split(desc, get_total, get_batches):
    """Score every batch of one evaluation split.

    Does not switch the model to eval mode or disable gradients; the caller
    is expected to have done both.

    Args:
        desc: Label for the progress bar.
        get_total: Zero-argument callable returning the progress bar total.
        get_batches: Zero-argument callable returning the batch DataFrames.

    Returns:
        `(metric, losses)`: the filled accuracy metric and the per-batch
        loss values.
    """
    metric = evaluate.load("accuracy")
    losses = []
    with tqdm(desc=desc, total=get_total()) as progress_bar:
        for batch_df in get_batches():
            preds, loss = compute(batch_df)
            losses.append(loss.item())
            metric.add_batch(
                predictions=torch.argmax(preds, dim=-1),
                references=batch_df["correct_is_right"].astype(int))
            progress_bar.update(batch_df.shape[0])
    return metric, losses


# Checkpoint naming does not change between epochs, so compute it once.
folder = "checkpoints"
postfix = "_lg" if is_cuda else ""
version_tag = "" if harness.get_version() == 0 else f"_v{harness.get_version()}"

for _ in range(num_epochs):
    epoch = ttgen.get_epoch()
    print(f"epoch {epoch}")

    # --- training pass: one optimizer and scheduler step per batch ---
    model.train()
    harness.train()
    metric_train = evaluate.load("accuracy")
    train_loss = []
    with tqdm(desc="train", total=ttgen.get_epoch_train_size()) as progress_bar:
        for train_df in ttgen.train_dfs():
            preds, loss = compute(train_df)
            train_loss.append(loss.item())
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(train_df.shape[0])

            predictions = torch.argmax(preds, dim=-1)
            metric_train.add_batch(predictions=predictions, references=train_df["correct_is_right"].astype(int))

    # Save the weights before evaluating, so a failure during evaluation
    # does not lose the epoch that was just trained.
    torch.save(harness.state_dict(), os.path.join(folder, f"harness{version_tag}{postfix}_{epoch}.pkl"))

    # --- evaluation passes: eval mode, no gradients ---
    model.eval()
    harness.eval()
    with torch.no_grad():
        metric_val_train, train_val_loss = _evaluate_split(
            "train val",
            ttgen.get_epoch_train_validation_size,
            ttgen.train_validation_dfs)
        metric_test, test_loss = _evaluate_split(
            "test",
            ttgen.get_epoch_test_size,
            ttgen.test_dfs)

        print(f"train: {metric_train.compute()} loss: {np.mean(train_loss)}")
        print(f"train val: {metric_val_train.compute()} loss: {np.mean(train_val_loss)}")
        print(f"test: {metric_test.compute()} loss: {np.mean(test_loss)}")
    ttgen.advance_epoch()

epoch 0


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

test:   0%|          | 0/10000 [00:00<?, ?it/s]

train: {'accuracy': 0.68185} loss: 0.5873909183979035
train val: {'accuracy': 0.6917} loss: 0.5480215743899345
test: {'accuracy': 0.6391} loss: 0.6578032352447509
epoch 1


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

test:   0%|          | 0/10000 [00:00<?, ?it/s]

train: {'accuracy': 0.74305} loss: 0.5166275749832392
train val: {'accuracy': 0.6932} loss: 0.5714148891210556
test: {'accuracy': 0.5848} loss: 0.6870836307406425
epoch 2


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

test:   0%|          | 0/10000 [00:00<?, ?it/s]

train: {'accuracy': 0.7264} loss: 0.5286102749675512
train val: {'accuracy': 0.652} loss: 0.5427525677800179
test: {'accuracy': 0.4834} loss: 0.6804437789916992
epoch 3


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

test:   0%|          | 0/10000 [00:00<?, ?it/s]

train: {'accuracy': 0.65845} loss: 0.5562302485615015
train val: {'accuracy': 0.6177} loss: 0.5642337084710598
test: {'accuracy': 0.5031} loss: 0.6838018469691276
epoch 4


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

test:   0%|          | 0/10000 [00:00<?, ?it/s]

train: {'accuracy': 0.62035} loss: 0.6001691032350064
train val: {'accuracy': 0.6592} loss: 0.6928291609048843
test: {'accuracy': 0.5237} loss: 0.693063754415512
epoch 5


train:   0%|          | 0/20000 [00:00<?, ?it/s]

train val:   0%|          | 0/10000 [00:00<?, ?it/s]

ValueError: error in compute thread

In [None]:
# Save the final model, harness and optimizer state dicts into the notebook
# directory, tagged with the model version and hardware postfix.
folder = "."
postfix = "_lg" if is_cuda else ""
if harness.get_version() == 0:
    version_tag = ""
else:
    version_tag = f"_v{harness.get_version()}"

artifacts = {
    f"model{version_tag}{postfix}.pkl": model,
    f"harness{version_tag}{postfix}.pkl": harness,
    f"optimizer{version_tag}{postfix}.pkl": optimizer,
}
for fname, stateful in artifacts.items():
    torch.save(stateful.state_dict(), os.path.join(folder, fname))

In [None]:
# Final pass over the test-validation split. Collects accuracy and loss, and
# keeps every batch with the model outputs attached for later inspection.
ttgen.reset()
model.eval()
harness.eval()

dfs = []
test_val_loss = []
metric_val_test = evaluate.load("accuracy")
with torch.no_grad(), tqdm(
        desc="test val",
        total=ttgen.get_epoch_test_validation_size()) as progress_bar:
    for test_val_df in ttgen.test_validation_dfs():
        truth = test_val_df["correct_is_right"].astype(int)

        preds, loss = compute(test_val_df)
        test_val_loss.append(loss.item())
        predictions = torch.argmax(preds, dim=-1)
        metric_val_test.add_batch(predictions=predictions, references=truth)

        # Annotate a copy so the generator's frame is left untouched.
        # Despite their names, the "logit_*" columns hold the harness's
        # softmax outputs for the left and right pair.
        cur_df = test_val_df.copy()
        cur_df["logit_left"] = preds[:, 0].cpu()
        cur_df["logit_right"] = preds[:, 1].cpu()
        cur_df["preds"] = predictions.cpu()
        cur_df["truth"] = truth
        dfs.append(cur_df)

        progress_bar.update(test_val_df.shape[0])

print(f"test val: {metric_val_test.compute()} loss: {np.mean(test_val_loss)}")
validation_df = pd.concat(dfs)

In [None]:
# Export the annotated validation frame as CSV, tagged like the checkpoints.
# NOTE(review): `folder` is whatever an earlier cell last assigned.
postfix = "_lg" if is_cuda else ""
if harness.get_version() == 0:
    version_tag = ""
else:
    version_tag = f"_v{harness.get_version()}"
validation_csv = os.path.join(folder, f"validation{version_tag}{postfix}.csv")
validation_df.to_csv(validation_csv)

In [None]:
# First few rows where the prediction matches the ground truth.
validation_df.loc[validation_df["preds"].eq(validation_df["truth"])].head()

In [None]:
# First few rows where the prediction differs from the ground truth.
validation_df.loc[validation_df["preds"].ne(validation_df["truth"])].head()

In [None]:
# Show the raw child embeddings for the first few test-validation batches.
ttgen.reset()
model.eval()
harness.eval()
with torch.no_grad():
    # zip with range(5) stops after five batches without requesting a sixth
    # from the generator, matching the previous counter-and-break logic.
    for _, test_val_df in zip(range(5), ttgen.test_validation_dfs()):
        # Only the child texts are embedded here. The parent texts used to be
        # tokenized as well but were never used, so that work is dropped.
        clefts = tokens(test_val_df["child_left"])
        crights = tokens(test_val_df["child_right"])
        display(model.get_child_embed(
            clefts["input_ids"],
            clefts["attention_mask"]))
        display(model.get_child_embed(
            crights["input_ids"],
            crights["attention_mask"]))