# Transfer learning w NLP - dopasuj model do własnego problemu


## AI&NLP Day 2021

### Patryk Pilarski
```
1patryk.pilarski@gmail.com
p.pilarski@sages.com.pl
```

## Dane
### Allegro Reviews

**Pobranie zbioru**

In [None]:
!wget https://klejbenchmark.com/static/data/klej_ar.zip
!unzip klej_ar.zip -d klej_ar

**Wczytanie danych**

In [None]:
import os
os.listdir("klej_ar")

In [None]:
# Read the training split as a list of raw lines (header line included).
# The encoding is explicit so decoding does not depend on the platform default.
with open("klej_ar/train.tsv", "r", encoding="utf-8") as f:
    raw_train = f.readlines()

In [None]:
# Read the dev split as a list of raw lines (header line included).
# The encoding is explicit so decoding does not depend on the platform default.
with open("klej_ar/dev.tsv", "r", encoding="utf-8") as f:
    raw_dev = f.readlines()

In [None]:
len(raw_train), len(raw_dev)

**Zbadanie danych**

In [None]:
raw_train[0]

In [None]:
raw_train[1]

In [None]:
raw_train[-1]

In [None]:
# Collect the label column of every well-formed training line and keep the
# malformed lines (anything that is not exactly two tab-separated fields)
# aside for inspection. The first line of raw_train is skipped.
labels, problematic = [], []

for doc in raw_train[1:]:
    record = doc.strip().split("\t")
    if len(record) == 2:
        labels.append(record[1])
    else:
        problematic.append(doc)

In [None]:
from collections import Counter
Counter(labels)

In [None]:
len(problematic)

In [None]:
problematic[:5]

**Przygotowanie danych**

In [None]:
def prepare_data(raw_data):
    """Turn raw TSV lines into parallel lists of texts and binary labels.

    Each line is stripped and split on tabs; lines that do not yield exactly
    two fields are skipped. The second field is parsed as a number and
    truncated to an integer rating. Ratings 1 and 2 become label 0, rating 5
    becomes label 1, and every other rating is dropped, which reduces the
    task to binary classification.

    Returns:
        A ``(corpus, labels)`` tuple of two equally long lists.
    """
    rating_to_class = {1: 0, 2: 0, 5: 1}
    texts, classes = [], []
    for line in raw_data:
        fields = line.strip().split("\t")
        if len(fields) == 2:
            review, rating = fields
            binary = rating_to_class.get(int(float(rating)))
            if binary is not None:
                texts.append(review)
                classes.append(binary)
    return texts, classes

In [None]:
train_corpus, train_labels = prepare_data(raw_train[1:])

In [None]:
test_corpus, test_labels = prepare_data(raw_dev[1:])

In [None]:
# Pair every text with its label so both can be shuffled and sliced together
train_data = list(zip(train_corpus, train_labels))
test_data = list(zip(test_corpus, test_labels))

## herBERT

**KLEJ: Comprehensive Benchmark for Polish Language Understanding**   
Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik

https://www.aclweb.org/anthology/2020.acl-main.111.pdf

https://huggingface.co/transformers/master/model_doc/herbert.html

https://huggingface.co/transformers/master/model_doc/roberta.html?highlight=robertamodel#transformers.RobertaModel

- RobertaModel
- RobertaForCausalLM
- RobertaForMaskedLM
- RobertaForSequenceClassification
- RobertaForMultipleChoice
- RobertaForTokenClassification
- RobertaForQuestionAnswering

In [None]:
!pip install transformers --quiet

In [None]:
import random
import torch
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import RobertaForSequenceClassification, HerbertTokenizer

torch.manual_seed(42)
random.seed(42)

In [None]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

**Tokenizator**

In [None]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [None]:
train_corpus[0]

In [None]:
tokens = tokenizer.tokenize(train_corpus[0])

In [None]:
tokens

In [None]:
len(tokens)

In [None]:
tokenizer(train_corpus[0])

In [None]:
tokenizer(train_corpus[0], return_tensors="pt")

**Model**

In [None]:
# Load the pretrained HerBERT checkpoint into a sequence-classification model
# with two output labels; both dropout probabilities are overridden to 0.5.
model = RobertaForSequenceClassification.from_pretrained("allegro/herbert-klej-cased-v1", 
                                                         num_labels=2, hidden_dropout_prob=0.5, 
                                                         attention_probs_dropout_prob=0.5)

In [None]:
model

In [None]:
train_corpus[0]

[Parametry](https://huggingface.co/transformers/master/model_doc/roberta.html?highlight=robertamodel#transformers.RobertaModel.forward)

In [None]:
outputs = model(**tokenizer(train_corpus[0], return_tensors="pt"))

In [None]:
outputs

**Pomocnicze funkcje**

In [None]:
PAD_TOKEN_ID = tokenizer.pad_token_id

def documents_to_batch(docs, max_len):
    """Tokenize documents into fixed-length tensors placed on DEVICE.

    Padding and truncation are delegated to the tokenizer, so every row has
    exactly ``max_len`` token ids and a truncated document still ends with
    its closing special token (plain list slicing would cut it off).

    Args:
        docs: sequence of raw text documents.
        max_len: number of token ids per row after padding/truncation.

    Returns:
        ``(X, ATT)``: ``X`` holds the token ids, ``ATT`` is the matching
        attention mask as a float tensor (1 for real tokens, 0 for padding).
    """
    encoded = tokenizer(
        list(docs),
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    X = encoded["input_ids"].to(DEVICE)
    # Cast to float so the mask dtype stays what callers of this helper receive.
    ATT = encoded["attention_mask"].float().to(DEVICE)
    return X, ATT

In [None]:
def train_on_batch(model, optimizer, X, ATT, Y):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose a ``"loss"`` entry.
        optimizer: optimizer built over the model's parameters.
        X: token id tensor for the batch.
        ATT: attention mask tensor for the batch.
        Y: target label tensor for the batch.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    batch_loss = model(input_ids=X, attention_mask=ATT, labels=Y)["loss"]
    batch_loss.backward()
    # Clip the overall gradient norm to 1 before the parameters are updated.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()
    return batch_loss.item()

In [None]:
def predict_on_batch(model, X, ATT, Y):
    """Score one batch in evaluation mode.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``"logits"`` and ``"loss"``.
        X: token id tensor for the batch.
        ATT: attention mask tensor for the batch.
        Y: target label tensor for the batch.

    Returns:
        ``(correct, decision, loss)``: the number of predictions equal to
        ``Y``, the predicted class index per document (always 1-D, also for
        a batch of one), and the batch loss as a Python float.
    """
    model.eval()
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        output = model(input_ids=X, attention_mask=ATT, labels=Y)
    # squeeze(-1) removes only the top-k axis; a bare squeeze() would also
    # collapse a single-document batch to a 0-d tensor and break counting.
    decision = output["logits"].topk(1).indices.squeeze(-1)
    loss = output["loss"].item()
    correct = (decision == Y).sum().item()
    return correct, decision, loss

**Hiperparametry**

In [None]:
# Move the model to the selected device, then build the optimizer over its parameters.
model = model.to(DEVICE)
learning_rate = 0.000005  # i.e. 5e-6
epochs = 2  # number of passes over train_data in the training loop
batch_size = 10  # documents per batch
max_len = 120  # token length every document is padded or cut to
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Ceiling division: a trailing partial batch still counts as one batch.
num_train_batches = (len(train_data) + batch_size - 1) // batch_size
num_test_batches = (len(test_data) + batch_size - 1) // batch_size

# Best dev accuracy seen so far; the training loop saves the model when it is beaten.
best_acc = 0

**Trenowanie modelu**

In [None]:
# Each epoch: one training pass over reshuffled data, then an evaluation
# pass over test_data; the model is saved whenever dev accuracy improves.
for epoch in range(epochs):
    random.shuffle(train_data)
    total_loss = 0
    for n in tqdm(range(num_train_batches)):
        datapoints = train_data[n * batch_size:(n + 1) * batch_size]
        documents, labels = zip(*datapoints)
        Y = torch.LongTensor(labels).to(DEVICE)
        X, ATT = documents_to_batch(documents, max_len)
        loss = train_on_batch(model, optimizer, X, ATT, Y)
        total_loss += loss
    print(total_loss)
    with torch.no_grad():
        total = 0
        correct = 0
        dev_loss = 0
        for n in tqdm(range(num_test_batches)):
            datapoints = test_data[n * batch_size:(n + 1) * batch_size]
            documents, labels = zip(*datapoints)
            Y = torch.LongTensor(labels).to(DEVICE)
            X, ATT = documents_to_batch(documents, max_len)
            result, _, loss = predict_on_batch(model, X, ATT, Y)
            dev_loss += loss
            # Count the documents actually in this batch: the last batch may
            # be shorter than batch_size, which would deflate the accuracy.
            total += len(datapoints)
            correct += result
        acc = correct/total * 100
        print(f"acc: {acc}")
        print(f"loss: {dev_loss}")
        if acc > best_acc:
            best_acc = acc
            # Saves the whole pickled model object, not just a state dict.
            torch.save(model, "herbert_ar.model")

In [None]:
# The checkpoint is a fully pickled model object (written with torch.save(model, ...)),
# so it cannot be read under the weights_only=True default of recent PyTorch releases.
# NOTE(review): unpickling can execute arbitrary code; only load files produced
# by this notebook, never a checkpoint from an untrusted source.
model = torch.load("herbert_ar.model", map_location=DEVICE, weights_only=False)
model.eval()

In [None]:
# Collect per-batch predictions for the whole test set.
# Gradients are disabled, as in the evaluation part of the training loop:
# this is inference only, so no autograd graph needs to be built.
preds = []
with torch.no_grad():
    for n in tqdm(range(num_test_batches)):
        datapoints = test_data[n * batch_size:(n + 1) * batch_size]
        documents, labels = zip(*datapoints)
        Y = torch.LongTensor(labels).to(DEVICE)
        X, ATT = documents_to_batch(documents, max_len)
        _, pred, _ = predict_on_batch(model, X, ATT, Y)
        preds.append(pred)

In [None]:
preds[:2]

In [None]:
# Flatten the per-batch prediction tensors into one flat list of Python values
preds = [p for t in preds for p in t.tolist()]
# Split the (text, label) pairs of test_data back into two parallel tuples
documents, labels = list(zip(*test_data))

In [None]:
print(classification_report(labels, preds))

-------

## Zadanie

Przygotuj dane i wytrenuj model na danych **PolEmo2.0-IN**

In [None]:
!wget https://klejbenchmark.com/static/data/klej_polemo2.0-in.zip
!unzip klej_polemo2.0-in.zip -d klej_polemo2.0in