In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import DataLoader, random_split, Dataset
from torch.utils.data import Subset
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


In [3]:
torch.cuda.empty_cache()

# Load Data

In [3]:
model_name = 'finiteautomata/bertweet-base-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

loading file vocab.txt from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7887717f245/vocab.txt
loading file bpe.codes from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7887717f245/bpe.codes
loading file added_tokens.json from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7887717f245/added_tokens.json
loading file special_tokens_map.json from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7887717f245/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7

In [4]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        words_to_delete = int(len(data['input_ids']) * 0.99)
        text_indecies, _ = random_split(
            range(len(data['input_ids'])), [words_to_delete, len(data['input_ids']) - words_to_delete]
        )
        return {'rating': data['rating'], 'input_ids': data['input_ids'][text_indecies], 'attention_mask': data['attention_mask'][text_indecies]}

In [5]:
review_dataset = load_dataset("csv", data_files="data/train_data.csv", split="train")
print(review_dataset)

Dataset({
    features: ['review', 'rating'],
    num_rows: 16392
})


In [6]:
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

In [7]:
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["review"])
tokenized_datasets.set_format("torch")

torch.manual_seed(73)
n_train_examples = int(len(tokenized_datasets) * 0.9)
train_indices, validation_indices = random_split(
    range(len(tokenized_datasets)), [n_train_examples, len(tokenized_datasets) - n_train_examples]
)

validation_dataset = Subset(tokenized_datasets, validation_indices)
train_dataset = CustomDataset(Subset(tokenized_datasets, train_indices))

In [20]:
for i in train_dataset:
    print(i.keys())
    print(len(i['input_ids']))
    print(i['input_ids'][0:8])
    print(i['input_ids'][[0, 1, 3]])
    print(len(i['attention_mask']))
    print(i['rating'])
    break

dict_keys(['rating', 'input_ids', 'attention_mask'])
486
tensor([    0,     0, 12271,  2204,     0,     0,  1010,     0])
tensor([   0,    0, 2204])
486
tensor(4)


In [8]:
review_train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
validation_dataloader = DataLoader(validation_dataset, batch_size=16)

# Model

In [8]:
result = model(**tokenizer("BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.", return_tensors="pt"))
result.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [18]:
result.last_hidden_state, result.last_hidden_state.size()

(tensor([[[ 0.0498, -0.3156, -0.3395,  ..., -1.6786, -0.4018, -0.0752],
          [ 0.3794, -0.0872, -0.4121,  ..., -1.6896, -0.3334,  0.0322],
          [ 0.5837, -0.7698, -0.4135,  ..., -1.5526, -0.4573,  0.1456],
          ...,
          [ 0.0703, -0.2677, -0.1546,  ..., -1.1223,  0.1195, -0.0484],
          [-0.0977, -0.2706,  0.0462,  ..., -1.3017, -0.3355, -0.1699],
          [ 0.0298, -0.4435, -0.3852,  ..., -1.2285, -0.2555, -0.4443]]],
        grad_fn=<NativeLayerNormBackward0>),
 torch.Size([1, 28, 768]))

In [9]:
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)

loading configuration file config.json from cache at /home/mikic202/.cache/huggingface/hub/models--finiteautomata--bertweet-base-sentiment-analysis/snapshots/924fc4c80bccb8003d21fe84dd92c7887717f245/config.json
Model config RobertaConfig {
  "_name_or_path": "finiteautomata/bertweet-base-sentiment-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12

# Training

In [10]:
optimizer = Adam(sentiment_model.parameters(), lr=6e-5)
sentiment_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
num_epochs = 3
loss_fun = nn.CrossEntropyLoss()
model.train()

for epoch in range(num_epochs):
    losses = []
    for batch in tqdm(review_train_loader):
        # print(batch)
        labels = batch["rating"].to(device)
        batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}
        outputs = sentiment_model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()

        optimizer.step()
#         lr_scheduler.step()
        optimizer.zero_grad()
#         progress_bar.update(1)
        losses.append(loss.item())
    print(np.mean(losses))

100%|██████████| 922/922 [07:53<00:00,  1.95it/s]


0.7082111597739952


100%|██████████| 922/922 [07:59<00:00,  1.92it/s]

0.5866529792304799





In [14]:
metric = evaluate.load("accuracy")
model.eval()
for batch in validation_dataloader:
    labels = batch["rating"].to(device)
    batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = sentiment_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.5768292682926829}

In [14]:
torch.save(sentiment_model.state_dict(),"sentiment_model_dict_emotion-english_65.model")

Model: distilbert-base-uncased 
Accuracy: 91,5 na zbiorze treningowym


Model: cross-encoder/ms-marco-MiniLM-L-12-v2 
Accuracy: 94,25 na zbiorze treningowym
Accuracy: 61 na validacyjnym


Model: bhadresh-savani/distilbert-base-uncased-emotion 
Accuracy: 97,7 na zbiorze treningowym


Model: joeddav/distilbert-base-uncased-go-emotions-student 
Accuracy: 97,3  na zbiorze treningowym
Accuracy: 66,5


Model: finiteautomata/bertweet-base-sentiment-analysis 
Accuracy: 92,2 na zbiorze treningowym, szybki 
Po douczeniu Accuracy 97,3


Model: bhadresh-savani/bert-base-uncased-emotion 
Accuracy: 96, 44 na zbiorze treningowym 
Bardziej skomplikowany więc może mniej się przeuczył


Model: michellejieli/emotion_text_classifier 
Accuracy 91,37 na zbiorze treningowym


Model: Falconsai/intent_classification 
Accuracy: 96,34 na zbiorze treningowym


Model: alperiox/autonlp-user-review-classification-536415182 
Accuracy: 93,67 na zbiorze treningowym


Model: Jorgeutd/sagemaker-roberta-base-emotion 
Accuracy: 44,186


Model: JungleLee/bert-toxic-comment-classification'
Accuracy: 96,59 na zbiorze treningowym


Model: nickwong64/bert-base-uncased-poems-sentiment
Accuracy: 63,53 na zbiorze walidacyjnym


Model: jitesh/emotion-english
Accuracy: 65 na walidacyjnym