In [1]:
import time
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
)
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 🔹 Načítanie datasetu
dataset = load_dataset("csv", data_files={"train": "ISEAR_train.csv", "test": "ISEAR_test.csv"})

# 🔹 Konverzia emócií na číselné hodnoty
unique_emotions = sorted(set(dataset["train"]["emotion"]))
emotion2label = {emotion: idx for idx, emotion in enumerate(unique_emotions)}
num_labels = len(emotion2label)

def map_emotion_to_label(example):
    example["label"] = emotion2label[example["emotion"]]
    return example

dataset = dataset.map(map_emotion_to_label)

# 🔹 Inicializácia BERT tokenizeru a modelu
bert_model_name = "bert-base-uncased"
tokenizer_bert = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_labels)

# 🔹 Tokenizácia textu pre BERT
def tokenize_bert(examples):
    return tokenizer_bert(examples["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize_bert, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 🔹 Funkcia na vyhodnotenie modelu
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# 🔹 Tréningové nastavenia pre BERT
bert_training_args = TrainingArguments(
    output_dir="./bert_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  
    weight_decay=0.01,
)

# 🔹 Tréning BERT modelu
bert_trainer = Trainer(
    model=bert_model,
    args=bert_training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

print("Training BERT model...")

# ⏳ Start timing
start_time = time.time()

bert_trainer.train()

# ⏳ End timing
end_time = time.time()
training_time = end_time - start_time

print(f"BERT training complete. Training took {training_time:.2f} seconds.")

# 🔹 Vyhodnotenie modelu
print("Evaluating BERT model...")
bert_results = bert_trainer.evaluate()
print("📊 Výsledky vyhodnotenia BERT:")
print(bert_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training BERT model...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.904883,0.690601,0.702255,0.690601,0.691928
2,1.192800,0.839048,0.711488,0.713073,0.711488,0.709487
3,0.601400,0.901001,0.70953,0.721533,0.70953,0.709179
4,0.366800,0.979866,0.71671,0.719446,0.71671,0.716853
5,0.366800,1.162347,0.710183,0.720897,0.710183,0.712204
6,0.174400,1.326308,0.712141,0.717127,0.712141,0.712791
7,0.101100,1.433319,0.713446,0.717616,0.713446,0.714821
8,0.072900,1.532428,0.710183,0.712941,0.710183,0.710738
9,0.072900,1.585067,0.712141,0.717275,0.712141,0.713922
10,0.049600,1.578996,0.710183,0.712853,0.710183,0.711156


BERT training complete. Training took 442.20 seconds.
Evaluating BERT model...


📊 Výsledky vyhodnotenia BERT:
{'eval_loss': 1.5789964199066162, 'eval_accuracy': 0.7101827676240209, 'eval_precision': 0.712852847208321, 'eval_recall': 0.7101827676240209, 'eval_f1': 0.711155657755513, 'eval_runtime': 2.699, 'eval_samples_per_second': 567.623, 'eval_steps_per_second': 35.569, 'epoch': 10.0}
