In [1]:
import gc
import math
import os
import random

import numpy as np
import pandas as pd
import torch
from einops import einsum, rearrange, reduce, repeat
from einops._torch_specific import allow_ops_in_compiled_graph  # requires einops>=0.6.1
from einops.layers.torch import Rearrange, Reduce
from fastai.vision.all import DataLoaders, GradientClip, Learner, Metric
from sklearn.model_selection import KFold
from torch import Tensor
from torch import functional as F
from torch import nn
from torch.utils.data import BatchSampler, DataLoader, TensorDataset

allow_ops_in_compiled_graph()

In [2]:
BATCH_SIZE = 128
SEED = 0
WORKERS = 0
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [3]:
input = pd.read_csv("cl_2_input.csv", sep="\t", header=None)
input.rename(columns={0: "book", 1: "chapter", 2: "verse", 3: "text"}, inplace=True)
input

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,B R>CJT BR> >LHJM >T H CMJM W >T H >RY
1,Genesis,1,2,W H >RY HJTH THW W BHW W XCK <L PNJ THWM W RWX >LHJM MRXPT <L PNJ H MJM
2,Genesis,1,3,W J>MR >LHJM JHJ >WR W JHJ >WR
3,Genesis,1,4,W JR> >LHJM >T H >WR KJ VWB W JBDL >LHJM BJN H >WR W BJN H XCK
4,Genesis,1,5,W JQR> >LHJM L >WR JWM W L XCK QR> LJLH W JHJ <RB W JHJ BQR JWM >XD
...,...,...,...,...
23208,2_Chronicles,36,19,W JFRPW >T BJT H >LHJM W JNTYW >T XWMT JRWCLM W KL >RMNWTJH FRPW B >C W KL KLJ MXMDJH L HCXJT
23209,2_Chronicles,36,20,W JGL H C>RJT MN H XRB >L BBL W JHJW LW W L BNJW L <BDJM <D MLK MLKWT PRS
23210,2_Chronicles,36,21,L ML>WT DBR JHWH B PJ JRMJHW <D RYTH H >RY >T CBTWTJH KL JMJ H CMH CBTH L ML>WT CB<JM CNH
23211,2_Chronicles,36,22,W B CNT >XT L KWRC MLK PRS L KLWT DBR JHWH B PJ JRMJHW H<JR JHWH >T RWX KWRC MLK PRS W J<BR QWL B KL MLKWTW W GM B MKTB L >MR


In [4]:
output = pd.read_csv("cl_2_output.csv", sep="\t", header=None)
output.rename(columns={0: "book", 1: "chapter", 2: "verse", 3: "text"}, inplace=True)
output

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,x xxxxx xxx xxxxx xx x xxxx x xx x xxxA
1,Genesis,1,2,x x xxx xxxx xxx x xxxA x xxx xx xxx xxxxA x xxx xxxxx xxxxx xx xxx x xxxA
2,Genesis,1,3,x xxxx xxxxxA xxx xxxA x xxx xxxA
3,Genesis,1,4,x xxx xxxxx xx x xxxA xx xxxA x xxxx xxxxx xxx x xxx x xxx x xxxA
4,Genesis,1,5,x xxxx xxxxx x xxx xxxA x x xxx xxx xxxxA x xxx xxxA x xxx xxxA xxx xxxA
...,...,...,...,...
23208,2_Chronicles,36,19,x xxxxx xx xxx x xxxxxA x xxxxx xx xxxx xxxxxxA x xx xxxxxxxx xxxx x xxA x xx xxx xxxxxxA x xxxxxA
23209,2_Chronicles,36,20,x xxx x xxxxx xx x xxx xx xxxA x xxxx xx x x xxxx x xxxxxA xx xxx xxxxx xxxA
23210,2_Chronicles,36,21,x xxxxx xxx xxxx x xx xxxxxxA xx xxxx x xxx xx xxxxxxxA xx xxx x xxx xxxxA x xxxxx xxxxx xxxA
23211,2_Chronicles,36,22,x x xxx xxx x xxxx xxx xxxA x xxxx xxx xxxx x xx xxxxxxA xxxx xxxx xx xxx xxxx xxx xxxA x xxxx xxx x xx xxxxxxA x xx x xxxxA x xxxA


In [5]:
def build_vocab(verses):
    vocab = set()
    for verse in verses:
        for word in verse.split():
            vocab.add(word)
    return vocab

In [6]:
vocab = build_vocab(input.text)
len(vocab)

24438

In [7]:
word_map = {word: i + 1 for i, word in enumerate(vocab)}

In [8]:
mapped_input = [[word_map[word] for word in verse.split()] for verse in input.text]

In [9]:
mapped_output = [[int("A" in word) for word in verse.split()] for verse in output.text]

In [10]:
wrong_len = []
for i, (input_verse, output_verse) in enumerate(zip(mapped_input, mapped_output)):
    if len(input_verse) != len(output_verse):
        wrong_len.append(i)
        print("wrong len", i, input_verse, output_verse)

n = 0
for i in wrong_len:
    mapped_input.pop(i - n)
    mapped_output.pop(i - n)
    n += 1

wrong len 8129 [14533, 551, 22503, 10967, 14435, 20339, 15727, 2785, 16208, 24391, 16391, 6639, 9937, 12614, 24391, 14890] [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]
wrong len 12244 [2590, 8159, 379, 19526, 14533, 8964, 12473, 23862, 6639, 19526, 18121, 3990, 21809, 20022, 12473, 4693] [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
wrong len 12994 [14533, 2590, 12473, 5000, 7915, 12473, 10751, 5344, 12473, 14415, 24391, 20840, 15307, 19989, 6639, 17750, 6080, 5116, 5344, 23547] [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1]
wrong len 19710 [14533, 12919, 22534, 15145, 5344, 7675, 15361] [0, 0, 1, 1, 0, 0, 1, 1]
wrong len 19722 [14533, 12919, 15520, 12473, 11688, 12473, 24421, 22371, 16171, 22610, 17750, 4716, 19670, 3803, 4716, 5211] [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]


In [11]:
max_words_per_verse = max(map(len, mapped_input))
max_words_per_verse

74

In [12]:
x = [verse + [0] * (max_words_per_verse - len(verse)) for verse in mapped_input]
x = Tensor(x).to(DEVICE).to(torch.int64)
x

tensor([[24391,  6756, 18013,  ...,     0,     0,     0],
        [14533, 12473, 18030,  ...,     0,     0,     0],
        [14533,  2479, 13548,  ...,     0,     0,     0],
        ...,
        [ 6639,  9525, 19989,  ...,     0,     0,     0],
        [14533, 24391,  9149,  ...,     0,     0,     0],
        [ 8163, 17750, 20108,  ...,     0,     0,     0]], device='cuda:0')

In [13]:
y = [verse + [np.nan] * (max_words_per_verse - len(verse)) for verse in mapped_output]
y = Tensor(y).to(DEVICE)
y

tensor([[0., 0., 0.,  ..., nan, nan, nan],
        [0., 0., 0.,  ..., nan, nan, nan],
        [0., 0., 1.,  ..., nan, nan, nan],
        ...,
        [0., 0., 0.,  ..., nan, nan, nan],
        [0., 0., 0.,  ..., nan, nan, nan],
        [0., 0., 0.,  ..., nan, nan, nan]], device='cuda:0')

In [14]:
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim=16, M=100):
        super().__init__()
        self.dim = dim
        self.M = M

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(self.M) / half_dim
        emb = torch.exp(torch.arange(half_dim, device=device) * (-emb))
        emb = x[..., None] * emb[None, ...]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

In [15]:
class ClauseFinder(nn.Module):
    def __init__(self, d_model, nhead, nlayer, vocab_size, nrepeat=1):
        super().__init__()
        self.nrepeat = nrepeat
        self.d_model = d_model
        self.nhead = nhead
        self.emb = nn.Embedding(vocab_size, d_model // 2)
        self.pos_enc = SinusoidalPosEmb(d_model // 2)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=4 * d_model,
                dropout=0.1,
                batch_first=True,
                norm_first=True,
            ),
            nlayer,
            nn.LayerNorm(d_model),
        )

        self.proj_out = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid(),
            Rearrange("verse word pred -> verse (word pred)"),
        )

    def forward(self, x):
        x = x.to(DEVICE)
        mask = x == 0
        x = rearrange(
            [self.emb(x), self.pos_enc(x)], "src verse word emb -> verse word (src emb)"
        )
        for _ in range(self.nrepeat):
            x = self.transformer(x, src_key_padding_mask=mask)
        x = self.proj_out(x)
        return x

In [16]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [17]:
def loss(pred, target, eps=0.0001):
    notna = ~target.isnan()
    target = target[notna]
    pred = pred[notna]
    return -torch.log(eps + (1 - target - pred).abs()).mean()

In [18]:
class DeviceDataLoader:
    def __init__(self, dataloader, device="cuda"):
        self.dataloader = dataloader
        self.device = device

    def __len__(self):
        return len(self.dataloader)

    def __iter__(self):
        for batch in self.dataloader:
            yield [el.to(self.device) for el in batch]

In [19]:
class Correctness(Metric):
    def __init__(self):
        self.reset()

    def reset(self):
        self.incorrect = 0
        self.total = 0

    def accumulate(self, learn):
        pred = learn.pred
        y = learn.y
        notna = ~y.isnan()
        pred = pred[notna]
        y = y[notna]
        self.incorrect += (y - pred.round()).abs().sum()
        self.total += len(y)

    @property
    def value(self):
        return (self.total - self.incorrect) / self.total


class Recall(Metric):
    def __init__(self):
        self.reset()

    def reset(self):
        self.correct = 0
        self.total = 0

    def accumulate(self, learn):
        pred = learn.pred
        y = learn.y
        events = y == 1
        self.correct += pred.round()[events].sum()
        self.total += events.sum()

    @property
    def value(self):
        return self.correct / self.total


class Precision(Metric):
    def __init__(self):
        self.reset()

    def reset(self):
        self.true_positives = 0
        self.positives = 0

    def accumulate(self, learn):
        pred = learn.pred
        y = learn.y
        notna = ~y.isnan()
        y = y[notna]
        pred = pred[notna]
        positives = pred.round() == 1.0
        self.true_positives += y[positives].sum()
        self.positives += positives.sum()

    @property
    def value(self):
        return self.true_positives / self.positives

In [20]:
gc.collect()
seed_everything(SEED)
split = list(KFold(n_splits=5, random_state=SEED, shuffle=True).split(x))
split_train = split[0][0]
split_test = split[0][1]

x_train = x[split_train]
x_test = x[split_test]
y_train = y[split_train]
y_test = y[split_test]
ds_train = TensorDataset(x_train, y_train)
ds_test = TensorDataset(x_test, y_test)

dl_train = DeviceDataLoader(
    DataLoader(
        ds_train,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=WORKERS,
        persistent_workers=WORKERS > 0,
    )
)
dl_val = DeviceDataLoader(
    DataLoader(
        ds_test,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=WORKERS,
        persistent_workers=WORKERS > 0,
    )
)

data = DataLoaders(dl_train, dl_val)
model = ClauseFinder(
    d_model=1024, nhead=16, nlayer=4, vocab_size=len(vocab) + 1, nrepeat=3
).to(DEVICE)

learn = Learner(
    data,
    model,
    loss_func=loss,
    cbs=[GradientClip(3.0)],
    metrics=[Correctness(), Precision(), Recall()],
)

learn.fit_one_cycle(16, lr_max=1e-4, wd=0.1, pct_start=0.02)
gc.collect()



epoch,train_loss,valid_loss,correctness,precision,recall,time
0,0.391996,0.403115,0.762181,0.466233,0.706386,02:12
1,0.367892,0.364325,0.80324,0.552087,0.466329,02:12
2,0.344914,0.359355,0.808494,0.563275,0.500054,02:13
3,0.31808,0.3558,0.813055,0.585342,0.457775,02:13
4,0.281999,0.375479,0.808035,0.551949,0.58554,02:13
5,0.230424,0.397065,0.814724,0.595015,0.442192,02:13
6,0.180331,0.438724,0.814618,0.581344,0.502888,02:13
7,0.136195,0.534446,0.816992,0.58708,0.511496,02:13
8,0.102612,0.611013,0.816569,0.584788,0.516509,02:13
9,0.081778,0.660713,0.811515,0.561942,0.573118,02:13


  return torch._transformer_encoder_layer_fwd(


0

### Model's metrics

**loss**: difference between the model's prediction and the training data, you need to choose how it is calculated
==> in this model, word by word and the model tries to predict a word with or without A
==> the model gives a probability that the word has an A, cost in bits of the prediction of the A
==> here, the loss is the **cross entropy loss** (entropy: cost of the prediction, related to information theory, information measured in bits and 
correspond to the cost needed to stock an information...)

**train loss**: median value of the loss on training data

**valid_loss**: median value of the loss on test data

**correctness**: percentage of words predicted correctly (word has an A or not)

**precision**: number of time A is predicted and there is actually an A (true positive divided by all the positives, true and false)

**recall**: percentage of true positives found / all relevant elements (elements that should have been found as positive)