In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import DataLoader, random_split, Dataset
from torch.utils.data import Subset
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


In [3]:
torch.cuda.empty_cache()

# Load Data

In [3]:
model_name = 'juliensimon/reviews-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

loading configuration file config.json from cache at /home/mikic202/.cache/huggingface/hub/models--juliensimon--reviews-sentiment-analysis/snapshots/7d147bc6fbf417d17abf65b4cefe3e04cbc4e7c5/config.json
Model config DistilBertConfig {
  "_name_or_path": "juliensimon/reviews-sentiment-analysis",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.2",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/mikic202/.cache/huggingface/hub/models--juliensimon--reviews-sentiment-analysis/snapshots/7d147bc6fbf417d17abf65b4cefe3e04

In [8]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        words_to_delete = int(len(data['input_ids']) * 0.98)
        text_indecies, _ = random_split(
            range(len(data['input_ids'])), [words_to_delete, len(data['input_ids']) - words_to_delete]
        )
        return {'rating': data['rating'], 'input_ids': data['input_ids'][text_indecies], 'attention_mask': data['attention_mask'][text_indecies]}

In [5]:
review_dataset = load_dataset("csv", data_files="data/train_data.csv", split="train")
print(review_dataset)

Dataset({
    features: ['review', 'rating'],
    num_rows: 16392
})


In [6]:
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

In [19]:
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["review"])
tokenized_datasets.set_format("torch")

torch.manual_seed(73)
n_train_examples = int(len(tokenized_datasets) * 0.9)
train_indices, validation_indices = random_split(
    range(len(tokenized_datasets)), [n_train_examples, len(tokenized_datasets) - n_train_examples]
)

validation_dataset = Subset(tokenized_datasets, validation_indices)
train_dataset = CustomDataset(Subset(tokenized_datasets, train_indices))

In [20]:
for i in train_dataset:
    print(i.keys())
    print(len(i['input_ids']))
    print(i['input_ids'][0:8])
    print(i['input_ids'][[0, 1, 3]])
    print(len(i['attention_mask']))
    print(i['rating'])
    break

dict_keys(['rating', 'input_ids', 'attention_mask'])
486
tensor([    0,     0, 12271,  2204,     0,     0,  1010,     0])
tensor([   0,    0, 2204])
486
tensor(4)


In [21]:
review_train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)

# Model

In [8]:
result = model(**tokenizer("BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.", return_tensors="pt"))
result.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [18]:
result.last_hidden_state, result.last_hidden_state.size()

(tensor([[[ 0.0498, -0.3156, -0.3395,  ..., -1.6786, -0.4018, -0.0752],
          [ 0.3794, -0.0872, -0.4121,  ..., -1.6896, -0.3334,  0.0322],
          [ 0.5837, -0.7698, -0.4135,  ..., -1.5526, -0.4573,  0.1456],
          ...,
          [ 0.0703, -0.2677, -0.1546,  ..., -1.1223,  0.1195, -0.0484],
          [-0.0977, -0.2706,  0.0462,  ..., -1.3017, -0.3355, -0.1699],
          [ 0.0298, -0.4435, -0.3852,  ..., -1.2285, -0.2555, -0.4443]]],
        grad_fn=<NativeLayerNormBackward0>),
 torch.Size([1, 28, 768]))

In [17]:
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)

loading configuration file config.json from cache at /home/mikic202/.cache/huggingface/hub/models--juliensimon--reviews-sentiment-analysis/snapshots/7d147bc6fbf417d17abf65b4cefe3e04cbc4e7c5/config.json
Model config DistilBertConfig {
  "_name_or_path": "juliensimon/reviews-sentiment-analysis",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": tr

# Training

In [18]:
optimizer = Adam(sentiment_model.parameters(), lr=3e-5)
sentiment_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [24]:
num_epochs = 2
loss_fun = nn.CrossEntropyLoss()
model.train()

for epoch in range(num_epochs):
    losses = []
    for batch in tqdm(review_train_loader):
        # print(batch)
        labels = batch["rating"].to(device)
        batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}
        outputs = sentiment_model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()

        optimizer.step()
#         lr_scheduler.step()
        optimizer.zero_grad()
#         progress_bar.update(1)
        losses.append(loss.item())
    print(np.mean(losses))

100%|██████████| 922/922 [14:31<00:00,  1.06it/s]


0.6121862951399447


100%|██████████| 922/922 [14:47<00:00,  1.04it/s]

0.4793394688767363





In [25]:
metric = evaluate.load("accuracy")
model.eval()
for batch in validation_dataloader:
    labels = batch["rating"].to(device)
    batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = sentiment_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.6201219512195122}

In [25]:
torch.save(sentiment_model.state_dict(),"sentiment_model_distilbert-base-uncased-go-emotions-student_67.model")

Wszystkie modele były testowane na lr = 5e-5 oraz przy 2-5 epokach


Model: distilbert-base-uncased 
Accuracy: 91,5 na zbiorze treningowym


Model: cross-encoder/ms-marco-MiniLM-L-12-v2 
Accuracy: 94,25 na zbiorze treningowym
Accuracy: 61 na validacyjnym


Model: bhadresh-savani/distilbert-base-uncased-emotion 
Accuracy: 97,7 na zbiorze treningowym


Model: joeddav/distilbert-base-uncased-go-emotions-student 
Accuracy: 97,3  na zbiorze treningowym
Accuracy: 67,5 na walidacyjnym


Model: finiteautomata/bertweet-base-sentiment-analysis 
Accuracy: 92,2 na zbiorze treningowym, szybki 
Po douczeniu Accuracy 97,3


Model: bhadresh-savani/bert-base-uncased-emotion 
Accuracy: 96, 44 na zbiorze treningowym 


Model: michellejieli/emotion_text_classifier 
Accuracy 91,37 na zbiorze treningowym


Model: Falconsai/intent_classification 
Accuracy: 96,34 na zbiorze treningowym


Model: alperiox/autonlp-user-review-classification-536415182 
Accuracy: 93,67 na zbiorze treningowym


Model: Jorgeutd/sagemaker-roberta-base-emotion 
Accuracy: 44,186


Model: JungleLee/bert-toxic-comment-classification'
Accuracy: 96,59 na zbiorze treningowym


Model: nickwong64/bert-base-uncased-poems-sentiment
Accuracy: 63,53 na zbiorze walidacyjnym


Model: jitesh/emotion-english
Accuracy: 65 na walidacyjnym

Model: juliensimon/reviews-sentiment-analysis 
Accuracy: 63 na walidacyjnym

# Save test data

In [27]:
def test_tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True)

In [33]:
test_review_dataset = load_dataset("csv", data_files="data/test_data.csv", split="train")
print(test_review_dataset)

test_tokenized_datasets = test_review_dataset.map(tokenize_function, batched=True)
test_tokenized_datasets.set_format("torch")
test_dataloader = DataLoader(test_tokenized_datasets, batch_size=16)

Dataset({
    features: ['review'],
    num_rows: 4099
})


Map: 100%|██████████| 4099/4099 [00:01<00:00, 3635.46 examples/s]


In [34]:
test_predictions = []

for batch in test_dataloader:
    batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = sentiment_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    test_predictions.extend(predictions.cpu().numpy())

In [36]:
test_predictions
print(test_predictions)
print(len(test_predictions))
print(np.unique(test_predictions, return_counts=True))

[3, 1, 3, 4, 4, 0, 3, 4, 1, 4, 3, 3, 4, 3, 4, 4, 1, 3, 3, 3, 4, 0, 4, 1, 4, 4, 4, 4, 2, 1, 4, 2, 4, 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 3, 1, 4, 2, 4, 4, 4, 3, 3, 4, 2, 4, 2, 4, 4, 2, 4, 4, 1, 4, 4, 4, 3, 3, 4, 4, 3, 4, 2, 2, 4, 3, 1, 4, 4, 2, 4, 4, 3, 1, 4, 4, 3, 4, 4, 3, 3, 0, 3, 4, 3, 0, 4, 4, 4, 0, 4, 4, 4, 3, 3, 0, 2, 4, 3, 2, 4, 3, 3, 3, 0, 4, 3, 4, 2, 4, 4, 4, 4, 3, 4, 4, 1, 2, 4, 4, 0, 2, 4, 3, 1, 1, 3, 3, 4, 4, 0, 4, 2, 4, 4, 3, 3, 4, 4, 4, 1, 3, 4, 4, 4, 3, 3, 4, 4, 3, 1, 2, 4, 3, 2, 1, 4, 4, 2, 4, 4, 1, 4, 3, 4, 0, 4, 3, 3, 4, 0, 3, 4, 4, 4, 4, 1, 4, 0, 4, 4, 2, 2, 3, 3, 4, 4, 3, 3, 1, 3, 1, 3, 3, 3, 0, 3, 1, 4, 1, 1, 1, 4, 1, 4, 3, 4, 3, 3, 4, 4, 4, 2, 4, 1, 2, 3, 4, 3, 4, 3, 3, 3, 3, 1, 1, 3, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 3, 3, 4, 1, 3, 3, 4, 4, 4, 1, 1, 1, 1, 4, 0, 4, 3, 3, 1, 4, 3, 3, 3, 0, 3, 2, 4, 4, 4, 4, 4, 3, 4, 1, 4, 3, 3, 3, 4, 3, 1, 0, 1, 4, 0, 3, 4, 4, 1, 0, 4, 4, 4, 1, 3, 3, 4, 0, 4, 4, 3, 3, 3, 4, 3, 3, 2, 2, 3, 3, 3, 4, 3, 4, 4, 0, 

In [38]:
pd.DataFrame(test_predictions).to_csv("ChomanskiMłynik_distilbert-base-uncased-go-emotions-student_67.csv", index=False, header=None)