In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


# Load Data

In [3]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

loading configuration file config.json from cache at /home/mikic202/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.2",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/mikic202/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file tokenizer.json from cache at /home/mikic202/.cache/huggingface/hub/models--dis

In [4]:
review_dataset = load_dataset("csv", data_files="data/train_data.csv", split="train")
print(review_dataset)

Dataset({
    features: ['review', 'rating'],
    num_rows: 16392
})


In [5]:
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

In [6]:
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["review"])
tokenized_datasets.set_format("torch")
small_train_dataset = tokenized_datasets#.shuffle(seed=42).select(range(10000))

In [7]:
review_train_loader = DataLoader(small_train_dataset, shuffle=True, batch_size=16)
# eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

# Model

In [8]:
result = model(**tokenizer("BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.", return_tensors="pt"))
result.keys()

odict_keys(['last_hidden_state'])

In [9]:
result.last_hidden_state, result.last_hidden_state.size()

(tensor([[[-0.4867, -0.1770, -0.1706,  ..., -0.0752, -0.0195,  0.6925],
          [ 0.0744, -0.2203,  0.1330,  ..., -0.0730, -0.1128,  0.3024],
          [-0.6035, -0.2432, -0.0209,  ..., -0.2963, -0.6380,  0.6463],
          ...,
          [ 0.2347,  0.1218,  0.1038,  ..., -0.2239, -0.1634,  0.2523],
          [ 0.7587,  0.1439, -0.5647,  ...,  0.4042, -0.7747, -0.2532],
          [ 0.3978,  0.4038, -0.3577,  ...,  0.3608, -0.9130, -0.0176]]],
        grad_fn=<NativeLayerNormBackward0>),
 torch.Size([1, 28, 768]))

In [10]:
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

loading configuration file config.json from cache at /home/mikic202/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.2",
  "vocab_size": 30522
}

loading weights file model.sa

# Training

In [11]:
optimizer = Adam(sentiment_model.parameters(), lr=5e-5)
sentiment_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
num_epochs = 3
loss_fun = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    losses = []
    for batch in tqdm(review_train_loader):
        # print(batch)
        labels = batch["rating"].to(device)
        batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}
        outputs = sentiment_model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()

        optimizer.step()
#         lr_scheduler.step()
        optimizer.zero_grad()
#         progress_bar.update(1)
        losses.append(loss.item())
    print(np.mean(losses))

  0%|          | 0/1025 [00:00<?, ?it/s]

100%|██████████| 1025/1025 [15:01<00:00,  1.14it/s]


0.8411979861085008


100%|██████████| 1025/1025 [14:59<00:00,  1.14it/s]


0.6296928964010099


100%|██████████| 1025/1025 [15:29<00:00,  1.10it/s]

0.4395878749867765





In [14]:
metric = evaluate.load("accuracy")
model.eval()
for batch in review_train_loader:
    labels = batch["rating"].to(device)
    batch = {"attention_mask": batch['attention_mask'].to(device), "input_ids": batch['input_ids'].to(device)}#, "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = sentiment_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.9158735968765251}

In [15]:
torch.save(sentiment_model.state_dict(),"sentiment_model_dict_first_model.model")