<a href="https://colab.research.google.com/github/patrycjalazna/transformers/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importy💅🏻💅🏻💅🏻

In [276]:
!pip install 'transformers==4.12.5' 'tokenizers==0.10.3' 'sentencepiece==0.1.96' 'datasets==1.16.1' 'accelerate==0.5.1' 'sacremoses==0.0.46' 'sacrebleu==2.0.0' 'torch';

ERROR: Invalid requirement: "'transformers==4.12.5'"
You should consider upgrading via the 'c:\users\masob\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [277]:
import torch
from torch import nn
from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss
from transformers import RobertaForSequenceClassification, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput
import json
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
import os
import random

## 🤗 Dataset

Dataset *emotion* jest zbiorem danych angielskich wiadomości na Twitterze zawierających sześć podstawowych emocji: gniew, strach, radość, miłość, smutek i zaskoczenie.

Link do datasetu: [Hugging Face](https://huggingface.co/datasets/emotion)

Przykład:

```
{
    "label": 0,
    "text": "im feeling quite sad and sorry for myself but ill snap out of it soon"
}
```



In [278]:
# Load the 'emotion' dataset; the result is indexed by split name below
# ('train', 'validation', 'test').
dataset = load_dataset('emotion')

Using custom data configuration default
Reusing dataset emotion (C:\Users\masob\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
100%|██████████| 3/3 [00:00<00:00, 499.60it/s]


Dane mamy automatycznie podzielone na train set, validation set i test set w stosunku 8:1:1.

In [279]:
# Print the dataset object to inspect the available splits and their row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


Następnie tworzymy folder, w którym zapiszemy dane.

In [280]:
# Create the output directory for the exported splits. exist_ok=True makes the
# cell safe to re-run and removes the gap between checking and creating.
os.makedirs("./data", exist_ok=True)

In [281]:
# Output file for each exported split, all under the data/ directory.
train_path, valid_path, test_path = (
    Path('data') / f'{split}.json' for split in ('train', 'valid', 'test')
)

In [282]:
# Flatten every split into a plain list of {'label': int, 'text': ...} records.
data_train_list, data_valid_list, data_test_list = [], [], []

for split_rows, records in [
  (dataset['train'], data_train_list),
  (dataset['test'], data_test_list),
  (dataset['validation'], data_valid_list),
]:
  # extend() fills the list in place, so the three names above stay bound to it.
  records.extend(
    {'label': int(row['label']), 'text': row['text']} for row in split_rows
  )

# Each count is printed next to the split it actually describes; the test and
# validation lists were swapped in these two lines before.
print(f'Train: {len(data_train_list)}')
print(f'Test: {len(data_test_list)}')
print(f'Validation: {len(data_valid_list)}')

Train: 16000
Test: 2000
Validation: 2000


In [283]:
# Choose whether labels are collapsed to positive/negative or kept as the six
# classes present in the dataset.
def get_map_label_translation(num_classes = 6):
    '''
    Return a dict mapping the integer labels 0-5 to class names.

    num_classes == 2 collapses the labels into 'negative' / 'positive';
    num_classes == 6 keeps one name per label.

    Raises:
        ValueError: for any other num_classes. Previously None was returned
            silently and the failure only surfaced at the first lookup.
    '''
    if num_classes == 2:
        return {
            0: 'negative',
            1: 'positive',
            2: 'positive',
            3: 'negative',
            4: 'negative',
            5: 'positive',
        }
    if num_classes == 6:
        return {
            0: 'sadness',
            1: 'joy',
            2: 'love',
            3: 'anger',
            4: 'fear',
            5: 'surprise',  # was misspelled 'suprise'
        }
    raise ValueError(f'num_classes must be 2 or 6, got {num_classes!r}')

MAP_LABEL_TRANSLATION = get_map_label_translation(2)

In [284]:
# Distinct class names in first-seen order (dict.fromkeys de-duplicates while
# keeping order), used as the bucket keys for every split.
class_names = list(dict.fromkeys(MAP_LABEL_TRANSLATION.values()))

data_class_test = {name: [] for name in class_names}
data_class_train = {name: [] for name in class_names}
data_class_validation = {name: [] for name in class_names}

# Put every record into the bucket of its translated class name.
for records, buckets in (
  (data_valid_list, data_class_validation),
  (data_train_list, data_class_train),
  (data_test_list, data_class_test),
):
  for record in records:
    buckets[MAP_LABEL_TRANSLATION[int(record['label'])]].append(record)

# Report the number of records per class for each split.
for split_name, buckets in (
  ('train', data_class_train),
  ('test', data_class_test),
  ('validation', data_class_validation),
):
  print(f'-- Stats for {split_name} set --')
  for name, members in buckets.items():
    print(f'Label {name}: {len(members):6d}')


-- Stats for train set --
Label negative:   8762
Label positive:   7238
-- Stats for test set --
Label negative:   1080
Label positive:    920
-- Stats for validation set --
Label negative:   1037
Label positive:    963


In [285]:
   
def remove_if_exists(f):
    '''
    Delete the file at `f` if it exists; do nothing otherwise.

    `f` may be a str or a pathlib.Path. The existence check and the unlink both
    go through the same Path object, so a str argument no longer fails with
    AttributeError on `.unlink()`.
    '''
    path = Path(f)
    if path.exists():
        path.unlink()

def save_unchanged(f, data):
    '''
    Write `data` to `f`, one json.dumps()-encoded record per line.

    Args:
        f: destination path (str or pathlib.Path). Existing content is replaced.
        data: iterable of JSON-serialisable records.

    Opening in 'w' mode already truncates an existing file, so no separate
    delete step is needed. The encoding is fixed to UTF-8 instead of the
    platform default.
    '''
    print(f'Saving into: {f}')
    with open(f, 'wt', encoding='utf-8') as f_write:
        f_write.writelines(f'{json.dumps(data_line)}\n' for data_line in data)

def save_as_translations(f, data_classes, num_entries):
    '''
    Write a per-class sample of the records to 'translations-<name of f>',
    placed next to `f`, with each record's 'label' replaced by its class name.

    Args:
        f: path of the unchanged split file (str or pathlib.Path); only its
            parent directory and file name are used.
        data_classes: dict mapping class name -> list of record dicts.
        num_entries: records to draw per class with random.sample(). A class
            holding fewer records than this is written out in full.

    The input records are NOT modified: a relabelled copy is serialised.
    Overwriting 'label' in place would also change the dicts held by the
    caller's lists, so re-running earlier cells would see string labels.
    '''
    f = Path(f)
    file_path = f.parent / ('translations-' + f.name)
    stats = {}
    print(f'Saving into: {file_path}')

    # 'w' mode truncates an existing file, so no separate delete is required.
    with open(file_path, 'wt', encoding='utf-8') as f_write:
        for class_name, records in data_classes.items():
            if num_entries > len(records):
                samples = records
            else:
                samples = random.sample(records, num_entries)

            stats[f'{class_name} entries'] = len(samples)

            for record in samples:
                # Copy with the label swapped; key order matches the input.
                relabelled = {**record, 'label': class_name}
                f_write.write(f'{json.dumps(relabelled)}\n')
    print(stats)

In [286]:
# Size of the sampled sets: the value is the number of lines per class. If a
# class has fewer lines than that, all of its lines are passed through.
def get_num_of_samples(set_name):
    '''Return 1000 for the 'train' set and 100 for any other set name.'''
    return 1000 if set_name == 'train' else 100

# One row per split: (set name, output path, full record list, per-class buckets).
export_plan = [
  ('train', train_path, data_train_list, data_class_train),
  ('valid', valid_path, data_valid_list, data_class_validation),
  ('test', test_path, data_test_list, data_class_test),
]

# Write the unchanged split first, then its per-class sample.
for set_name, file_path, data_to_save, data_classes in export_plan:
  save_unchanged(file_path, data_to_save)
  save_as_translations(file_path, data_classes, get_num_of_samples(set_name))

Saving into: data\train.json
Saving into: data\translations-train.json
{'negative entries': 1000, 'positive entries': 1000}
Saving into: data\valid.json
Saving into: data\translations-valid.json
{'negative entries': 100, 'positive entries': 100}
Saving into: data\test.json
Saving into: data\translations-test.json
{'negative entries': 100, 'positive entries': 100}


## 🤗 Train

In [None]:
# Run run_glue_no_trainer.py with the gpt2 checkpoint on data/train.json and
# data/valid.json for one epoch; the script's output directory is out/gpt2.
!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/gpt2