In [1]:
import itertools
import logging
from tqdm import tqdm

import matplotlib.pyplot as plt
from datamaestro import prepare_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
from typing import List
import time
logging.basicConfig(level=logging.INFO)
from pathlib import Path

ds = prepare_dataset('org.universaldependencies.french.gsd')


# Format de sortie décrit dans
# https://pypi.org/project/conllu/

class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement s'il n'est pas connu
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    OOVID = 1
    PAD = 0

    def __init__(self, oov: bool):
        """ oov : autorise ou non les mots OOV """
        self.oov =  oov
        self.id2word = [ "PAD"]
        self.word2id = { "PAD" : Vocabulary.PAD}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self,idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self,idx: List[int]):
        return [self.getword(i) for i in idx]



class TaggingDataset():
    def __init__(self, data, words: Vocabulary, tags: Vocabulary, adding=True):
        self.sentences = []

        for s in data:
            self.sentences.append(([words.get(token["form"], adding) for token in s], [tags.get(token["upostag"], adding) for token in s]))
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, ix):
        return self.sentences[ix]


def collate_fn(batch):
    """Collate using pad_sequence"""
    return tuple(pad_sequence([torch.LongTensor(b[j]) for b in batch]) for j in range(2))

def collate_fn_ukn(batch, id_ukn=None, replace_ukn_rate = 0.0):
    """Collate using pad_sequence with ukn token replacement"""
    X, Y = [], []
    for b in batch:
        # replace tokens with id_unknown with Bernoulli distribution
        data_x = torch.LongTensor(b[0])
        p = replace_ukn_rate
        mask = torch.distributions.Bernoulli(probs=(1-p)).sample(data_x.size())
        data_x[~mask.bool()] = id_ukn
        # concatenate the data
        X.append(data_x)
        Y.append(torch.LongTensor(b[1]))
    return (pad_sequence(X), pad_sequence(Y))

logging.info("Loading datasets...")
words = Vocabulary(True)
tags = Vocabulary(False)
train_data = TaggingDataset(ds.train, words, tags, True)
dev_data = TaggingDataset(ds.validation, words, tags, True)
test_data = TaggingDataset(ds.test, words, tags, False)


logging.info("Vocabulary size: %d", len(words))


BATCH_SIZE=100

train_loader = DataLoader(train_data, collate_fn=collate_fn, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, collate_fn=collate_fn, batch_size=BATCH_SIZE)


# Only in train and dev 
ID_OOV = words.word2id['__OOV__']
P_OOV = 0.01
train_loader_ukn = DataLoader(train_data, collate_fn=lambda x : collate_fn_ukn(x, ID_OOV, P_OOV),\
                              batch_size=BATCH_SIZE, shuffle=True)
dev_loader_ukn = DataLoader(dev_data, collate_fn=lambda x : collate_fn_ukn(x, ID_OOV, P_OOV),\
                            batch_size=BATCH_SIZE)

#  TODO:  Implémenter le modèle et la boucle d'apprentissage (en utilisant les LSTMs de pytorch)


INFO:root:Loading datasets...
INFO:root:Vocabulary size: 42926


In [2]:
class model_tagging(nn.Module):
    def __init__(self, vocab_size, in_size, vocab_out_size, hidden_size, num_layers):
        super().__init__()
        # embedding
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=in_size)
        # encoder
        self.encoder = nn.LSTM(input_size=in_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, vocab_out_size)
        
    def forward(self, x):
        """
            x: tensor 3-d (Len, Batch)
        """
        # embedding
        x = self.embedding(x)
        # encode
        out, (h_n, c_n) = self.encoder(x)
        out_final = self.decoder(out)
        return out_final

In [3]:
@torch.no_grad()
def eval_model(dataloader, model, loss_fn, device):
    model.eval()
    loss_test = 0.
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        y_hat = tagger(x)
        loss = loss_fn(y_hat.permute(0, 2, 1), y)
        loss_test += loss.item()
    model.train()
    return loss_test/len(dataloader)

In [4]:
class State:
    def __init__(self, model, optim):
        self.model = model
        self.optim = optim
        self.epoch = 0
        self.iteration = 0

In [5]:
#@title seq2seq pour le tagging with ukn
savepath = Path("TAGGER.plt")

EPOCHS = 20
hidden_size = 200
VOCAB_SIZE_WORD = len(words)
VOCAB_SIZE_TAG = len(tags)
num_layers = 2
VECT_EMB_SIZE = 150
LR = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if savepath.is_file():
    with savepath.open('rb') as fp:
        state = torch.load(fp)
else:
    # RNN tagger
    tagger = model_tagging(vocab_size=VOCAB_SIZE_WORD,\
                                   in_size=VECT_EMB_SIZE,\
                                   vocab_out_size=VOCAB_SIZE_TAG,\
                                   hidden_size=hidden_size,\
                                   num_layers=num_layers)
    # optimizer
    tagger.to(device)
    optim = torch.optim.Adam(tagger.parameters(), lr=LR)
    state = State(model = tagger, optim = optim)
    
# loss function
loss_fn = nn.CrossEntropyLoss(ignore_index=tags.word2id['PAD'])

In [7]:
# use loader with UKN (cell 1)
state.model.train()
writer = SummaryWriter()
for epoch in tqdm(range(state.epoch, EPOCHS)):
    for x, y in train_loader_ukn:
        x, y = x.to(device), y.to(device)
        y_hat = state.model(x)
        state.optim.zero_grad()
        loss = loss_fn(y_hat.permute(0, 2, 1), y)
        loss.backward()
        state.optim.step()
        state.iteration += 1
        writer.add_scalar("Loss/train", loss, state.iteration)

    with savepath.open('wb') as fp:
        state.epoch = epoch + 1
        torch.save(state, fp)
        
    loss_eval = eval_model(test_loader, state.model, loss_fn, device)
    writer.add_scalar("Loss/test", loss_eval, state.epoch)
writer.close()

2023-11-13 13:01:01.986175: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-13 13:01:04.053898: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-13 13:01:08.718732: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /Infos/lmd/2021/licence/ue/LU3IN029-2021oct/kO6/bin/gcc/lib64::/usr/local/cuda-11.6/lib64:/usr/local/cuda-12.2/lib64:/opt/gurobi801/linux64/lib
2023-11-13 13:01:08.721637: W tensorflow/stream_executor/platform/de

In [19]:
with torch.no_grad():
    state.model.eval()
    nb_visualisation = 5
    x, y = next(iter(test_loader))
    # predict
    x= x.to(device)
    yhat = state.model(x)
    pred = torch.argmax(torch.softmax(yhat, dim=-1), dim=-1)
    # visualiser
    x, pred, y = x.permute(1, 0), pred.permute(1, 0), y.permute(1, 0) # batch, len
    idx_random = torch.randint(len(x),(nb_visualisation,))
    x_rand, pred_rand, y_rand = x[idx_random], pred[idx_random], y[idx_random]
    for i in range(len(x_rand)):
        print("Sentence")
        print(words.getwords(x_rand[i]))
        print("Prediction")
        print(tags.getwords(pred_rand[i]))
        print("Ground truth")
        print(tags.getwords(y_rand[i]))

Sentence
['Louis', 'Bastien', '(', 'né', 'le', '26', 'octobre', '1881', 'à', 'Paris', 'et', 'mort', 'le', '13', 'août', '1963', 'à', '__OOV__', ')', 'est', 'un', 'coureur', 'cycliste', 'et', 'escrimeur', 'français', 'du', 'de', 'le', 'début', 'du', 'de', 'le', 'XXe', 'siècle', ',', 'dont', 'la', 'spécialité', 'était', 'le', 'cyclisme', 'sur', 'piste', '.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Prediction
['PROPN', 'PROPN', 'PUNCT', 'VERB', 'DET', 'NUM', 'NOUN', 'NUM', 'ADP', 'PROPN', 'CCONJ', 'VERB', 'DET', 'NUM', 'NOUN', 'NUM', 'ADP', 'PROPN', 'PUNCT', 'AUX', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'VERB', 'ADJ', '_', 'ADP', 'DET', 'NOUN', '_', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'DET', 'NOUN', 'AUX', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'PROPN', 'DET', 'DET', 'DET', 'DET', 'DET', 'DET', 'DET

In [9]:
%load_ext tensorboard
%tensorboard --logdir runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
