#### Налаштування середовища

In [1]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#### 1. Вибір датасету

In [3]:
from datasets import load_dataset, load_from_disk
import pandas as pd
import os

saveDatasetTo = 'datasets/dair-ai-emotion/dataset'

if os.path.exists(saveDatasetTo):
    dataset = load_from_disk(saveDatasetTo)
else:
    os.makedirs(saveDatasetTo, exist_ok=True)
    dataset = load_dataset("dair-ai/emotion")
    dataset.save_to_disk(saveDatasetTo)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

df = dataset["train"].to_pandas()
id2label = dataset["train"].features["label"].int2str
df["label_name"] = df["label"].apply(id2label)

profile = ProfileReport(df, title="Profiling Report on \"dair-ai/emotion\" dataset")
profile.to_file(saveDatasetTo + "/profileReport.html")
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 11.12it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
# dataset_small = dataset
# dataset_small["train"] = dataset["train"].select(range(100))
# dataset_small["validation"] = dataset["validation"].select(range(20))
# dataset_small["test"] = dataset["test"].select(range(20))

#### 3. Вибір моделі 

In [7]:
model_name = 'bert-base-uncased'

MAX_LEN = 256

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments, Trainer

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

In [11]:
import wandb

wandb.init(
    project = "iasa-nlp-project",
    entity = "oypio-kpi", 
    name= model_name
)

[34m[1mwandb[0m: Currently logged in as: [33moypio[0m ([33moypio-kpi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
training_args = TrainingArguments(
    output_dir="./models/" + model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics = compute_metrics
)

#### 4. Перевірка претрейн моделі

In [13]:
pretrained_eval = trainer.evaluate()

print("Evaluation of pretrained model on validation set:")
for k, v in pretrained_eval.items():
    print(f"{k}: {v:.4f}")

Evaluation of pretrained model on validation set:
eval_loss: 1.6927
eval_accuracy: 0.1430
eval_f1: 0.0783
eval_runtime: 18.0914
eval_samples_per_second: 110.5500
eval_steps_per_second: 1.7690


In [14]:
from transformers import pipeline

label_names = dataset["train"].features["label"].names
pretrain_pipe = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=False,
)



In [15]:
print("Predictions before training:")
for example in dataset["validation"].select(range(3))["text"]:
    pred = pretrain_pipe(example)[0]
    label_id = int(pred["label"].replace("LABEL_", ""))
    label_name = label_names[label_id]
    print(f"Text: {example}")
    print(f"Predicted: {label_name} (Score: {pred['score']:.3f})\n")

Predictions before training:
Text: im feeling quite sad and sorry for myself but ill snap out of it soon
Predicted: anger (Score: 0.317)

Text: i feel like i am still looking at a blank canvas blank pieces of paper
Predicted: anger (Score: 0.215)

Text: i feel like a faithful servant
Predicted: anger (Score: 0.212)



In [16]:
texts = ['UwU', 'What a handsom devil!', 'Hello, Kitty', 'I like the way it burns, the way people run']

print("Predictions before training:")
for example in texts:
    pred = pretrain_pipe(example)[0]
    label_id = int(pred["label"].replace("LABEL_", ""))
    label_name = label_names[label_id]
    print(f"Text: {example}")
    print(f"→ Predicted: {label_name} (Score: {pred['score']:.3f})\n")

Predictions before training:
Text: UwU
→ Predicted: anger (Score: 0.238)

Text: What a handsom devil!
→ Predicted: anger (Score: 0.213)

Text: Hello, Kitty
→ Predicted: anger (Score: 0.269)

Text: I like the way it burns, the way people run
→ Predicted: anger (Score: 0.273)



#### 5. Тренування моделі

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2531,0.220445,0.9245,0.925323
2,0.0804,0.15521,0.937,0.93618
3,0.1162,0.166851,0.9365,0.936886
4,0.0775,0.197407,0.939,0.939165
5,0.0337,0.257811,0.9365,0.936257
6,0.0885,0.2849,0.9365,0.936353


TrainOutput(global_step=6000, training_loss=0.15206692525961746, metrics={'train_runtime': 3478.5258, 'train_samples_per_second': 27.598, 'train_steps_per_second': 1.725, 'total_flos': 2.5259568463872e+16, 'train_loss': 0.15206692525961746, 'epoch': 6.0})

#### 6. Аналіз результатів

In [18]:
trained_eval = trainer.evaluate()

# Print results
print("\nEvaluation of trained model on validation set:")
for k, v in trained_eval.items():
    print(f"{k}: {v:.4f}")


Evaluation of trained model on validation set:
eval_loss: 0.1974
eval_accuracy: 0.9390
eval_f1: 0.9392
eval_runtime: 136.5681
eval_samples_per_second: 14.6450
eval_steps_per_second: 0.2340
epoch: 6.0000


In [19]:
label_names = dataset["train"].features["label"].names
pipe = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=False,
)



In [20]:
print("Comparison (Pretrained vs Fine-tuned):")
for metric in ["eval_loss", "eval_accuracy", "eval_f1"]:
    pre_val = pretrained_eval.get(metric, 0)
    post_val = trained_eval.get(metric, 0)
    print(f"{metric:<15}: {pre_val:.4f} → {post_val:.4f}")

Comparison (Pretrained vs Fine-tuned):
eval_loss      : 1.6927 → 0.1974
eval_accuracy  : 0.1430 → 0.9390
eval_f1        : 0.0783 → 0.9392


#### 7. Збереження моделі

In [21]:
best_model_path = trainer.state.best_model_checkpoint
print(f"Best checkpoint path: {best_model_path}")

Best checkpoint path: ./models/bert-base-uncased/checkpoint-4000


In [22]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./models/bert-base-uncased/checkpoint-14")