# BERT models

In [6]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from utils.text_datasets import get_basic_tweet_sentiment_dataset, get_poem_sentiment_dataset, get_advanced_tweet_sentiment_dataset_only_text

In [7]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Notebook-wide constants.
SEED = 42  # intended as the seed for random number generators (reproducibility)
MODEL_NAME = 'all-MiniLM-L6-v2'  # sentence-transformers checkpoint name (same string the MiniLM cell loads)
TEST_SIZE = 0.2  # presumably the fraction of data held out for testing — TODO confirm

In [None]:
# Sentence-embedding model used by the gradient-boosting baseline.
# Loaded via MODEL_NAME so the checkpoint is configured in exactly one place.
minilm_model = SentenceTransformer(MODEL_NAME)


def tokenize_function_minilm(text):
    """Embed ``text`` with the MiniLM sentence-transformer.

    Despite the name, this does not return token ids: it returns whatever
    ``minilm_model.encode`` produces for the input, i.e. its embedding.

    Args:
        text: Input forwarded unchanged to ``SentenceTransformer.encode``.

    Returns:
        The encoder output as a NumPy array (``convert_to_numpy=True``).
    """
    return minilm_model.encode(text, convert_to_numpy=True)

In [None]:
# Load the poem-sentiment train/validation/test splits. The helper receives
# tokenize_function_minilm as its text-processing callback (presumably applied
# to every sample — see utils.text_datasets to confirm).
train_dataset, val_dataset, test_dataset = get_poem_sentiment_dataset(tokenize_function_minilm)

In [None]:
def _split_features_labels(dataset):
    """Return ``(features, labels)`` lists built in a single pass over ``dataset``.

    Each item of ``dataset`` must be an ``(x, y)`` pair where ``x`` exposes
    ``.numpy()`` and ``y`` exposes ``.item()``. Iterating once (instead of once
    for the features and once for the labels) avoids fetching every sample
    twice.
    """
    features, labels = [], []
    for x, y in dataset:
        features.append(x.numpy())
        labels.append(y.item())
    return features, labels


X_train, y_train = _split_features_labels(train_dataset)
X_val, y_val = _split_features_labels(val_dataset)
X_test, y_test = _split_features_labels(test_dataset)

# Train the classifier. The seed is fixed so that repeated runs of this cell
# produce the same model.
clf = GradientBoostingClassifier(random_state=SEED)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out test split.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Confusion-matrix heatmap. The axis labels and title are in Polish:
# "Prediction", "True label" and "Confusion matrix".
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Macierz pomyłek")
plt.xlabel("Predykcja")
plt.ylabel("Prawdziwa etykieta")
plt.show()

In [None]:
def _coerce_config_value(raw):
    """Convert raw config text to ``int`` if possible, else ``float``, else keep the string."""
    for cast in (int, float):
        try:
            return cast(raw)
        except ValueError:
            continue
    return raw


# Hyper-parameters and model settings read from a simple ``key=value`` file.
params = {}

with open("config_transformers.txt", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Blank lines and '#' comment lines carry no setting; a blank line
        # used to crash the key/value unpacking.
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            raise ValueError(
                f"config_transformers.txt:{line_no}: expected 'key=value', got {line!r}"
            )
        # Split on the first '=' only, so a value may itself contain '='.
        key, value = line.split("=", 1)
        # Strip around '=' so that "lr = 0.1" yields the key "lr", not "lr ".
        params[key.strip()] = _coerce_config_value(value.strip())

print(params)

In [None]:
# Load the pretrained tokenizer for the checkpoint named by params["model_bert"].
# NOTE(review): BertTokenizer is not imported in the visible cells — presumably
# it comes from `transformers`; confirm the import exists before running.
tokenizer = BertTokenizer.from_pretrained(params["model_bert"])

In [9]:
def tokenize_function_bert(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return its ids.

    No padding is applied here (``padding=False``); sequences are truncated to
    ``params["max_length_token"]``. Padding is left to the collate function.

    Args:
        text: Input forwarded unchanged to ``tokenizer``.

    Returns:
        A ``torch.Tensor`` built from the tokenizer's ``"input_ids"``.
    """
    tokenizer_kwargs = {
        "padding": False,
        "truncation": True,
        "max_length": params["max_length_token"],
        "return_tensors": None,
    }
    token_ids = tokenizer(text, **tokenizer_kwargs)["input_ids"]
    return torch.tensor(token_ids)


def pad_collate_bert(batch, *, pad_token_id=None):
    """Collate variable-length ``(input_ids, label)`` samples into one padded batch.

    Args:
        batch: Sequence of ``(input_ids, label)`` pairs; ``input_ids`` is a
            tensor of token ids such as the one returned by
            ``tokenize_function_bert``.
        pad_token_id: Value used to right-pad ``input_ids``. When ``None``
            (the default, and what ``DataLoader`` uses) the module-level
            ``tokenizer.pad_token_id`` is used, as before.

    Returns:
        A dict with:
            ``"input_ids"``: ids padded to the longest sequence in the batch,
            ``"attention_mask"``: 1 for real tokens and 0 for padding,
            ``"labels"``: ``torch.tensor`` of the batch labels.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    input_ids, labels = zip(*batch)

    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)

    # Build the mask from sequence lengths rather than by comparing against the
    # pad id, so a real token that equals the pad id is still attended to.
    attention_masks = [torch.ones(len(ids), dtype=torch.long) for ids in input_ids]
    padded_attention_mask = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return {
        "input_ids": padded_input_ids,
        "attention_mask": padded_attention_mask,
        "labels": torch.tensor(labels)
    }

In [None]:
# Dataset option 1: poem sentiment, processed with tokenize_function_bert.
# NOTE: the next two cells assign the same variable names from other datasets,
# so whichever of these dataset cells runs last determines what is trained on.
train_dataset, validation_dataset, test_dataset = get_poem_sentiment_dataset(tokenize_function_bert)

In [None]:
# Dataset option 2: basic tweet sentiment, processed with tokenize_function_bert.
# The helper returns only train/test splits, so a validation set is carved out
# of the training data.
train_dataset, test_dataset = get_basic_tweet_sentiment_dataset(tokenize_function_bert)
train_dataset, validation_dataset = train_test_split(
    train_dataset,
    train_size=params["train_size"],
    shuffle=True,
    random_state=SEED,  # fixed seed so the train/validation split is reproducible
)

In [None]:
# Dataset option 3: advanced tweet sentiment (text only), processed with
# tokenize_function_bert. The helper returns only train/test splits, so a
# validation set is carved out of the training data.
train_dataset, test_dataset = get_advanced_tweet_sentiment_dataset_only_text(tokenize_function_bert)
train_dataset, validation_dataset = train_test_split(
    train_dataset,
    train_size=params["train_size"],
    shuffle=True,
    random_state=SEED,  # fixed seed so the train/validation split is reproducible
)

In [None]:
# Number of target classes (from the config) and the label ids 0 .. class_num-1
# that are later passed to the metrics display.
class_num = params["class_num"]
dataset_labels = [*range(class_num)]

In [12]:
# Batch iterators for the three splits. Only the training loader shuffles;
# all of them pad each batch dynamically through pad_collate_bert.
train_loader = DataLoader(
    train_dataset,
    batch_size=params["batch_size"],
    shuffle=True,
    collate_fn=pad_collate_bert,
)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=params["batch_size"],
    shuffle=False,
    collate_fn=pad_collate_bert,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=params["batch_size"],
    shuffle=False,
    collate_fn=pad_collate_bert,
)

In [None]:
# Pretrained BERT checkpoint with a classification head sized to the number of
# classes, moved onto the selected device.
# NOTE(review): BertForSequenceClassification is not imported in the visible
# cells — presumably it comes from `transformers`; confirm.
model = BertForSequenceClassification.from_pretrained(
    params["model_bert"],
    num_labels=params["class_num"],
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])
# The training loop in the visible cells uses the loss returned by the model
# itself, so loss_fun is not referenced there.
loss_fun = CrossEntropyLoss()

In [None]:
# Fine-tune the model for params["epochs"] epochs. After every epoch the mean
# training loss and the mean validation loss are shown on the progress bar.
# NOTE(review): tqdm is not imported in the visible cells — confirm `import tqdm`.
model.train()
progress_bar = tqdm.tqdm(range(params["epochs"]), desc="Epoch")

for epoch in progress_bar:
    # Training pass: one optimizer step per batch.
    train_losses = []
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Validation pass: evaluation mode, no gradient tracking.
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_losses.append(outputs.loss.item())

    # Back to training mode for the next epoch.
    model.train()
    progress_bar.set_postfix({
        "Train loss": sum(train_losses) / len(train_losses),
        "Validation loss": sum(val_losses) / len(val_losses)
    })

In [None]:
def measure_model(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect labels and predictions.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled while iterating. The predicted class of each sample is the
    argmax of the model's logits along dimension 1.

    Args:
        model: Callable model accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with ``.logits``.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries.

    Returns:
        ``(all_labels, all_preds)``: two lists with one entry per sample, in
        the order the loader yielded them.
    """
    model.eval()
    collected_labels, collected_preds = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_preds = logits.argmax(dim=1)

            collected_preds.extend(batch_preds.cpu().numpy())
            collected_labels.extend(batch_labels.cpu().numpy())

    return collected_labels, collected_preds

In [None]:
# Report classification metrics on the training split, then on the test split.
# NOTE(review): display_clasification_metrics is not defined or imported in the
# visible cells — presumably a project helper; confirm where it comes from.
print("Train set metrics")
train_labels, train_preds = measure_model(model, train_loader)
train_labels_tensor = torch.tensor(train_labels)
train_preds_tensor = torch.tensor(train_preds)

display_clasification_metrics(
    train_preds_tensor,
    train_labels_tensor,
    labels=dataset_labels,
)


print("\nTest set metrics")
test_labels, test_preds = measure_model(model, test_loader)
test_labels_tensor = torch.tensor(test_labels)
test_preds_tensor = torch.tensor(test_preds)

display_clasification_metrics(
    test_preds_tensor,
    test_labels_tensor,
    labels=dataset_labels,
)