<a href="https://colab.research.google.com/github/patrycjalazna/transformers/blob/main/emotion_detection_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importy💅🏻💅🏻💅🏻

In [None]:
!pip install 'transformers==4.12.5' 'tokenizers==0.10.3' 'sentencepiece==0.1.96' 'datasets==1.16.1' 'accelerate==0.5.1' 'sacremoses==0.0.46' 'sacrebleu==2.0.0' 'torch';

Collecting transformers==4.12.5
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.2 MB/s 
[?25hCollecting tokenizers==0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.3 MB/s 
[?25hCollecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 15.3 MB/s 
[?25hCollecting datasets==1.16.1
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 39.7 MB/s 
[?25hCollecting accelerate==0.5.1
  Downloading accelerate-0.5.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 6.1 MB/s 
[?25hCollecting sacremoses==0.0.46
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |███████

In [None]:
import torch
from torch import nn
from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss
from transformers import RobertaForSequenceClassification, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput
import json
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
import os
import random

## 🤗 Dataset

Dataset *emotion* jest zbiorem danych angielskich wiadomości na Twitterze zawierających sześć podstawowych emocji: gniew, strach, radość, miłość, smutek i zaskoczenie.

Link do datasetu: [Hugging Face](https://huggingface.co/datasets/emotion)

Przykład:

```
{
    "label": 0,
    "text": "im feeling quite sad and sorry for myself but ill snap out of it soon"
}
```



In [None]:
# Load the `emotion` dataset through the Hugging Face `datasets` library
# (downloaded on first use, then reused from the local cache).
dataset = load_dataset('emotion')

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dane są automatycznie podzielone na train set, validation set i test set w stosunku 8:1:1.

In [None]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


Następnie tworzymy folder, w którym zapiszemy dane.

In [None]:
# Create the output directory for the exported JSON files. `exist_ok=True`
# keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs("./data", exist_ok=True)

In [None]:
# Destinations of the three JSON-lines files, one per dataset split.
train_path, valid_path, test_path = (
    Path('data') / file_name
    for file_name in ('train.json', 'valid.json', 'test.json')
)

In [None]:
# Convert every split into a plain list of records holding an integer 'label'
# and the original 'text'.
data_train_list, data_valid_list, data_test_list = (
    [{'label': int(row['label']), 'text': row['text']} for row in dataset[split_name]]
    for split_name in ('train', 'validation', 'test')
)

# Each caption now reports its own split (the "Test" and "Validation" counts
# used to be printed under each other's caption).
print(f'Train: {len(data_train_list)}')
print(f'Test: {len(data_test_list)}')
print(f'Validation: {len(data_valid_list)}')

Train: 16000
Test: 2000
Validation: 2000


In [None]:
# The mapping depends on whether the labels are collapsed to positive/negative
# or kept as the 6 classes present in the dataset.
def get_map_label_translation(num_classes = 6):
    '''
    Return the dict that maps the integer labels 0-5 to class names.

    num_classes: 2 -> 'negative' / 'positive' grouping of the six labels,
                 6 -> one name per label.

    Raises ValueError for any other value (the function used to fall through
    and silently return None, which only failed later at the first lookup).
    '''
    if num_classes == 2:
        return {
            0: 'negative',
            1: 'positive',
            2: 'positive',
            3: 'negative',
            4: 'negative',
            5: 'positive',
        }
    if num_classes == 6:
        return {
            0: 'sadness',
            1: 'joy',
            2: 'love',
            3: 'anger',
            4: 'fear',
            # Spelling kept as-is: this string is what ends up as the label
            # in the exported translation files.
            5: 'suprise',
        }
    raise ValueError(f'num_classes must be 2 or 6, got {num_classes!r}')

MAP_LABEL_TRANSLATION = get_map_label_translation(6)

In [None]:
# One bucket per class name for every split. Repeated names in the mapping
# collapse into a single key, kept in first-seen order.
data_class_test = {class_name: [] for class_name in MAP_LABEL_TRANSLATION.values()}
data_class_train = {class_name: [] for class_name in MAP_LABEL_TRANSLATION.values()}
data_class_validation = {class_name: [] for class_name in MAP_LABEL_TRANSLATION.values()}

# Route every record into the bucket named after its translated label.
for records, buckets in (
    (data_valid_list, data_class_validation),
    (data_train_list, data_class_train),
    (data_test_list, data_class_test),
):
    for data in records:
        buckets[MAP_LABEL_TRANSLATION[int(data['label'])]].append(data)

# Report the class distribution of each split.
for header, buckets in (
    ('-- Stats for train set --', data_class_train),
    ('-- Stats for test set --', data_class_test),
    ('-- Stats for validation set --', data_class_validation),
):
    print(header)
    for label, members in buckets.items():
        print(f'Label {label}: {len(members):6d}')


-- Stats for train set --
Label sadness:   4666
Label joy:   5362
Label love:   1304
Label anger:   2159
Label fear:   1937
Label suprise:    572
-- Stats for test set --
Label sadness:    581
Label joy:    695
Label love:    159
Label anger:    275
Label fear:    224
Label suprise:     66
-- Stats for validation set --
Label sadness:    550
Label joy:    704
Label love:    178
Label anger:    275
Label fear:    212
Label suprise:     81


In [None]:
   
def remove_if_exists(f):
    '''
    Delete the file at `f` if it exists; do nothing otherwise.

    f: path given as `str` or `pathlib.Path`. The argument is normalised to a
       Path once, because calling `.unlink()` on a plain string raised
       AttributeError.
    '''
    path = Path(f)
    if path.exists():
        path.unlink()

def save_unchanged(f, data):
    '''
    Write `data` to `f` as JSON lines: one `json.dumps` record per line.

    Any file already present at `f` is removed first, and the target path
    is printed before writing.
    '''
    remove_if_exists(f)
    print(f'Saving into: {f}')
    with open(f, 'wt') as out_file:
        out_file.writelines(f'{json.dumps(record)}\n' for record in data)

def save_as_translations(f, data_classes, num_entries, seed=None):
    '''
    Write a per-class sample to `translations-<f.name>` next to `f` as JSON lines.

    f:            pathlib.Path of the base file; only its name and parent are used.
    data_classes: dict mapping class name -> list of record dicts.
    num_entries:  number of records written per class. A class with fewer
                  records is written in full; otherwise `num_entries` records
                  are drawn at random.
    seed:         optional seed for a private random generator, which makes the
                  sample reproducible. None (default) uses the module-level
                  `random` state, as before.

    Every written record carries the class name as its 'label'. The relabelling
    is applied to a copy, so the caller's record dicts keep their original
    labels (they used to be overwritten in place, which broke re-running any
    code that still expected the integer labels).
    '''
    file_path = f.parent / ('translations-' + f.name)
    stats = {}
    # Drop a stale output file before writing the new one.
    if file_path.exists():
        file_path.unlink()
    print(f'Saving into: {file_path}')

    sampler = random if seed is None else random.Random(seed)

    with open(file_path, 'wt') as f_write:
        for class_name, records in data_classes.items():
            if num_entries > len(records):
                samples = records
            else:
                samples = sampler.sample(records, num_entries)

            stats[f'{class_name} entries'] = len(samples)

            for data_line in samples:
                # Copy keeps the key order and leaves the source dict untouched.
                relabelled = {**data_line, 'label': class_name}
                f_write.write(f'{json.dumps(relabelled)}\n')
        print(stats)

In [None]:
# Subset sizes: the value is the number of lines kept for each class; if a class
# has fewer lines than that, all of its lines are passed through.
def get_num_of_samples(set_name):
    '''Per-class sample size: 1000 for the 'train' set, 100 for any other set name.'''
    return 1000 if set_name == 'train' else 100

# Per split: target file, full record list, per-class buckets, and the set name
# that selects the per-class sample size.
export_plan = [
    (train_path, data_train_list, data_class_train, 'train'),
    (valid_path, data_valid_list, data_class_validation, 'valid'),
    (test_path, data_test_list, data_class_test, 'test'),
]

for file_path, data_to_save, data_classes, set_name in export_plan:
    num_entries = get_num_of_samples(set_name)
    # Full split first, then the per-class sample written next to it.
    save_unchanged(file_path, data_to_save)
    save_as_translations(file_path, data_classes, num_entries)

Saving into: data/train.json
Saving into: data/translations-train.json
{'sadness entries': 1000, 'joy entries': 1000, 'love entries': 1000, 'anger entries': 1000, 'fear entries': 1000, 'suprise entries': 572}
Saving into: data/valid.json
Saving into: data/translations-valid.json
{'sadness entries': 100, 'joy entries': 100, 'love entries': 100, 'anger entries': 100, 'fear entries': 100, 'suprise entries': 81}
Saving into: data/test.json
Saving into: data/translations-test.json
{'sadness entries': 100, 'joy entries': 100, 'love entries': 100, 'anger entries': 100, 'fear entries': 100, 'suprise entries': 66}


In [None]:
 # Download the v4.12.5 example script and store it as original_run_glue_no_trainer.py.
 !wget 'https://raw.githubusercontent.com/huggingface/transformers/v4.12.5/examples/pytorch/text-classification/run_glue_no_trainer.py' -O 'original_run_glue_no_trainer.py'

--2022-02-20 19:18:32--  https://raw.githubusercontent.com/huggingface/transformers/v4.12.5/examples/pytorch/text-classification/run_glue_no_trainer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21091 (21K) [text/plain]
Saving to: ‘original_run_glue_no_trainer.py’


2022-02-20 19:18:33 (16.6 MB/s) - ‘original_run_glue_no_trainer.py’ saved [21091/21091]



In [None]:
# Display the source of the downloaded script for reference.
%pycat original_run_glue_no_trainer.py

In [None]:
# Print the command-line options accepted by the downloaded script.
!python original_run_glue_no_trainer.py --help

usage: original_run_glue_no_trainer.py [-h]
                                       [--task_name {cola,mnli,mrpc,qnli,qqp,rte,sst2,stsb,wnli}]
                                       [--train_file TRAIN_FILE]
                                       [--validation_file VALIDATION_FILE]
                                       [--max_length MAX_LENGTH]
                                       [--pad_to_max_length]
                                       --model_name_or_path MODEL_NAME_OR_PATH
                                       [--use_slow_tokenizer]
                                       [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
                                       [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
                                       [--learning_rate LEARNING_RATE]
                                       [--weight_decay WEIGHT_DECAY]
                                       [--num_train_epochs NUM_TRAIN_EPOCHS]
                                      

# 💻 **Enkoder**

## RoBERTa

Model RoBERTa został zaproponowany w artykule *RoBERTa: A Robustly Optimized BERT Pretraining Approach*, którego autorami są Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer i Veselin Stoyanov. Jest on oparty na modelu BERT firmy Google wydanym w 2018 roku.

Kod poniżej odnosi się do oryginalnego modelu RoBERTa.



In [None]:
# Train roberta-base for 1 epoch (batch size 24) on the exported JSON files; output goes to out/emotion/roberta.
# NOTE(review): this runs run_glue_no_trainer.py, while the download cell only saves
# original_run_glue_no_trainer.py - presumably a modified copy is supplied separately; confirm.
! python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/emotion/roberta

02/20/2022 19:44:42 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False

100% 3/3 [00:00<00:00, 875.58it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2"

Kod poniżej odnosi się do customowego modelu zapisanego w pliku roberta.py. Zwiększona została liczba epok, a batch size na zbiorze treningowym i ewaluacyjnym został zmniejszony. Dodatkowo zamrożone zostały wagi modelu (z wyjątkiem głowy klasyfikacyjnej). Learning rate i maksymalna długość sekwencji pozostały takie same.

In [None]:
# Runs run_glue_no_trainer.py with --custom_model and --freeze_model:
# 8 epochs, batch size 16, output in out/emotion/roberta3.
! python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 16 \
  --max_length 128 \
  --freeze_model \
  --learning_rate 2e-5 \
  --num_train_epochs 8 \
  --custom_model \
  --output_dir out/emotion/roberta3

02/20/2022 20:28:36 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False

100% 3/3 [00:00<00:00, 895.64it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2"

Kod poniżej odnosi się do pliku robertaforward.py. Batch size został zmniejszony do 12, a maksymalna długość sekwencji (max_length) pozostaje taka sama. Dodatkowo dodana została warstwa poolingu (pooling layer). Wagi modelu (z wyjątkiem głowy klasyfikacyjnej) zostały zamrożone. Model był trenowany przez 12 epok.

In [None]:
# Runs run_glue_no_trainer2.py with --custom_model and --freeze_model:
# 12 epochs, batch size 12, output in out/emotion/roberta4.
! python run_glue_no_trainer2.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 12 \
  --per_device_eval_batch_size 12 \
  --max_length 128 \
  --freeze_model \
  --learning_rate 2e-5 \
  --num_train_epochs 12 \
  --custom_model \
  --output_dir out/emotion/roberta4
   
  # --lr_scheduler_type "linear" \


02/20/2022 21:01:01 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False

100% 3/3 [00:00<00:00, 834.69it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2"

Kod poniżej odnosi się do pliku robertasmallhead.py. Batch size został zwiększony do 32, a learning rate został zmieniony na $3 \cdot 10^{-5}$.

In [None]:
# Runs run_glue_no_trainer3.py with --custom_model and --freeze_model:
# 12 epochs, batch size 32, learning rate 3e-5, output in out/emotion/roberta5.
! python run_glue_no_trainer3.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --max_length 128 \
  --freeze_model \
  --learning_rate 3e-5 \
  --num_train_epochs 12 \
  --custom_model \
  --output_dir out/emotion/roberta5

02/20/2022 22:03:06 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False

100% 3/3 [00:00<00:00, 749.88it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2"

Kod poniżej opiera się o plik roberta.py. Zmieniona została liczba epok, batch size i learning rate. Użyte zostały hidden states.

In [47]:
# Runs run_glue_no_trainer.py with --custom_model and --return_hidden_states:
# 8 epochs, batch size 26, output in out/emotion/roberta6.
# NOTE(review): the learning rate here is 2e-7, while the other runs in this notebook
# use 2e-5 / 3e-5 - confirm this is intentional and not a typo.
! python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 26 \
  --per_device_eval_batch_size 26 \
  --max_length 128 \
  --return_hidden_states \
  --learning_rate 2e-7 \
  --num_train_epochs 8 \
  --custom_model \
  --output_dir out/emotion/roberta6

02/20/2022 22:47:13 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False

100% 3/3 [00:00<00:00, 871.39it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2"

Ewaluacja 

In [48]:
!python run_glue.py \
--model_name_or_path out/emotion/roberta \
--output_dir out/emotion/roberta-eval \
--return_hidden_states --custom_model \
--train_file data/train.json --validation_file data/valid.json \
--do_eval \
--per_device_eval_batch_size 24 --max_seq_length 128 \
--return_hidden_states --custom_model

02/20/2022 23:43:12 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=out/emotion/roberta-eval/runs/Feb20_23-

# 🔉 Seq2seq

### T5