# BERT models

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import CrossEntropyLoss
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
from utils.text_datasets import get_basic_tweet_sentiment_dataset, get_poem_sentiment_dataset, get_advanced_tweet_sentiment_dataset_only_text
import tqdm
from sklearn.model_selection import train_test_split
from utils.metrics import get_metrics, display_clasification_metrics

In [7]:
# Select the compute device: CUDA when a GPU is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Hard-coded hyperparameters for the BERT fine-tuning run.
params = {
    "train_size": 0.9,  # fraction handed to train_test_split as train_size
    "batch_size": 16,  # batch size used by every DataLoader
    "epochs": 5,  # number of iterations of the training loop
    "learning_rate": 2e-5,  # lr passed to the optimizer
    "class_num": 4,  # num_labels of the classification head
    "max_length_token": 128,  # max_length used when truncating in the tokenizer
    "model_bert": "bert-base-uncased",  # checkpoint name for tokenizer and model
}

In [None]:
params = {}

# Read ``key=value`` pairs from the config file into ``params``.
# Each value is coerced to int when possible, then to float, and is otherwise
# kept as a whitespace-stripped string.
with open("config_transformers.txt", "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no setting.
            continue

        # Split on the first "=" only, so a value may itself contain "=".
        key, separator, value = line.partition("=")
        if not separator:
            raise ValueError(
                f"config_transformers.txt line {line_number}: "
                f"expected 'key=value', got {line!r}"
            )
        key = key.strip()
        value = value.strip()

        try:
            value = int(value)
        except ValueError:
            try:
                value = float(value)
            except ValueError:
                pass  # not numeric: keep the raw string
        params[key] = value

print(params)

In [None]:
# Load the tokenizer for the checkpoint named by params["model_bert"].
tokenizer = BertTokenizer.from_pretrained(params["model_bert"])

In [9]:
def tokenize_function_bert(text):
    """Convert ``text`` into a tensor of BERT token ids.

    The module-level ``tokenizer`` encodes the text without padding and with
    truncation to ``params["max_length_token"]``; padding is deferred to the
    batch collate function.

    Args:
        text: Input handed straight to the tokenizer (presumably a single
            string -- confirm against the dataset helpers that call this).

    Returns:
        A ``torch.Tensor`` built from the encoding's ``input_ids``.
    """
    tokenizer_options = {
        "padding": False,
        "truncation": True,
        "max_length": params["max_length_token"],
        "return_tensors": None,
    }
    token_ids = tokenizer(text, **tokenizer_options)["input_ids"]
    return torch.tensor(token_ids)


def pad_collate_bert(batch):
    """Collate ``(input_ids, label)`` pairs into one padded batch dictionary.

    Args:
        batch: Iterable of ``(input_ids, label)`` pairs, where ``input_ids``
            is a tensor of token ids (as accepted by ``pad_sequence``).

    Returns:
        Dict with three entries:
        ``"input_ids"`` -- sequences padded with ``tokenizer.pad_token_id``,
        ``"attention_mask"`` -- 1 for every original token, 0 for padding,
        ``"labels"`` -- the labels gathered into a single tensor.
    """
    sequences, targets = zip(*batch)

    # One run of ones per sequence; pad_sequence fills the remainder with 0.
    masks = [torch.ones(len(seq), dtype=torch.long) for seq in sequences]

    return {
        "input_ids": pad_sequence(
            sequences, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        "attention_mask": pad_sequence(masks, batch_first=True, padding_value=0),
        "labels": torch.tensor(targets),
    }

In [None]:
# Load the poem sentiment data, which the helper returns as three splits
# (train / validation / test). tokenize_function_bert is handed to the helper
# (presumably applied to each text -- confirm in utils.text_datasets).
train_dataset, validation_dataset, test_dataset = get_poem_sentiment_dataset(tokenize_function_bert)

In [None]:
# Load the basic tweet sentiment data (train + test splits), then carve a
# validation split out of the training data.
train_dataset, test_dataset = get_basic_tweet_sentiment_dataset(tokenize_function_bert)

# NOTE(review): no random_state is passed, so the shuffled split is not pinned
# to a seed here.
split_options = {"train_size": params["train_size"], "shuffle": True}
train_dataset, validation_dataset = train_test_split(train_dataset, **split_options)

In [None]:
# Load the advanced tweet sentiment data (text only; train + test splits), then
# carve a validation split out of the training data.
train_dataset, test_dataset = get_advanced_tweet_sentiment_dataset_only_text(tokenize_function_bert)

# NOTE(review): no random_state is passed, so the shuffled split is not pinned
# to a seed here.
split_options = {"train_size": params["train_size"], "shuffle": True}
train_dataset, validation_dataset = train_test_split(train_dataset, **split_options)

In [None]:
# Integer class ids 0 .. class_num - 1; passed as ``labels`` when displaying metrics.
class_num = params["class_num"]
dataset_labels = [*range(class_num)]

In [12]:
# Options common to all three loaders: configured batch size and the padding
# collate function.
shared_loader_kwargs = {
    "batch_size": params["batch_size"],
    "collate_fn": pad_collate_bert,
}

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
validation_loader = DataLoader(validation_dataset, shuffle=False, **shared_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_kwargs)

In [None]:
# Build the pretrained BERT classifier with one output per class, then move it
# onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    params["model_bert"],
    num_labels=params["class_num"],
)
model = model.to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=params["learning_rate"],
)
# NOTE(review): loss_fun is not referenced by the training loop in this file,
# which reads the loss from the model output instead.
loss_fun = CrossEntropyLoss()

In [None]:
# Fine-tune for the configured number of epochs; after every epoch the mean
# training loss and mean validation loss are shown on the progress bar.
model.train()
progress_bar = tqdm.tqdm(range(params["epochs"]), desc="Epoch")

for epoch in progress_bar:
    # ---- training pass ----
    total_loss = 0
    batches = 0
    for batches, batch in enumerate(train_loader, start=1):
        inputs = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        }

        optimizer.zero_grad()
        # Passing labels makes the model return its own loss on the output.
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # ---- validation pass (no gradient tracking, eval mode) ----
    val_loss = 0
    val_batches = 0
    model.eval()
    with torch.no_grad():
        for val_batches, batch in enumerate(validation_loader, start=1):
            inputs = {
                name: batch[name].to(device)
                for name in ("input_ids", "attention_mask", "labels")
            }
            val_loss += model(**inputs).loss.item()

    # Switch back to training mode before the next epoch.
    model.train()
    progress_bar.set_postfix({
        "Train loss": total_loss / batches,
        "Validation loss": val_loss / val_batches
    })

In [None]:
def measure_model(model, data_loader):
    """Run ``model`` over ``data_loader`` and gather labels and predictions.

    The model is switched to eval mode and run without gradient tracking.
    The predicted class of each example is the argmax of its logits along
    dimension 1.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.

    Returns:
        Tuple ``(all_labels, all_preds)``: two lists with one entry per
        example, in the order the loader yielded them.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for batch in data_loader:
            tensors = {
                name: batch[name].to(device)
                for name in ("input_ids", "attention_mask", "labels")
            }
            batch_labels = tensors.pop("labels")

            # Labels are deliberately withheld from the forward pass here.
            logits = model(**tensors).logits
            batch_preds = torch.argmax(logits, dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            gold.extend(batch_labels.cpu().numpy())

    return gold, predicted

In [None]:
# Report classification metrics on the training split.
print("Train set metrics")
train_labels, train_preds = measure_model(model, train_loader)
train_preds_tensor, train_labels_tensor = map(torch.tensor, (train_preds, train_labels))
display_clasification_metrics(
    train_preds_tensor,
    train_labels_tensor,
    labels=dataset_labels,
)

# Report the same metrics on the held-out test split.
print("\nTest set metrics")
test_labels, test_preds = measure_model(model, test_loader)
test_preds_tensor, test_labels_tensor = map(torch.tensor, (test_preds, test_labels))
display_clasification_metrics(
    test_preds_tensor,
    test_labels_tensor,
    labels=dataset_labels,
)