In [None]:
import os
from datetime import datetime

import torch
import wandb
import yaml
from torch import cuda
from transformers import BertTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback

from model.BERT import BertWithSentiment, compute_metrics_wandb, DataLoader
from utils.data_utils import read_data, filter_fallacies, encode_labels_sentiment, plot_training_curve, \
    plot_learning_curve

In [None]:
# Read the experiment settings (hyper-parameters etc.) from the YAML config.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Open a wandb run that tracks this notebook; the loaded config is attached to
# the run and may still be updated while the run is active.
wandb_settings = wandb.Settings(console="off")
run = wandb.init(
    project="Logical_Fallacies",
    config=config,
    allow_val_change=True,
    settings=wandb_settings,
)

# Wall-clock time at which this run was started.
current_time = datetime.now()

In [None]:
# Load the three splits from the combined CSV; `read_data` returns them in
# train / test / dev order.
dataset_csv = 'combined_lfud_huggingface_nonfallacies_sent.csv'
train_dataset, test_dataset, dev_dataset = read_data(dataset_csv)

# Preview the first rows of the dev split.
dev_dataset.head()

In [None]:
# Number of dev-split rows per value of the `logical_fallacies` column.
dev_dataset['logical_fallacies'].value_counts()

In [None]:
# Number of train-split rows per value of the `logical_fallacies` column.
train_dataset['logical_fallacies'].value_counts()

In [None]:
# Number of test-split rows per value of the `logical_fallacies` column.
test_dataset['logical_fallacies'].value_counts()

In [None]:
# Distinct labels present in the training split, in order of first appearance.
# `dict.fromkeys` de-duplicates while keeping a stable order, so the displayed
# list is identical on every kernel restart (iterating a `set` of strings is
# not, because string hashing is randomised per process).
logical_fallacies = list(dict.fromkeys(train_dataset['logical_fallacies']))
logical_fallacies

In [None]:
# Labels kept for this experiment. The position of each label in this list
# becomes its integer class id further down, so the order must not change.
# Smaller subsets tried in earlier experiments:
#   ['nonfallacy', 'faulty generalization', 'intentional']
#   ['nonfallacy', 'faulty generalization', 'intentional', 'ad hominem', 'false causality']
logical_fallacies_subset = [
    'faulty generalization',
    'false dilemma',
    'appeal to emotion',
    'deductive fallacy',
    'fallacy of extension',
    'false causality',
    'fallacy of relevance',
    'intentional',
    'ad hominem',
    'circular reasoning',
    'fallacy of credibility',
    'ad populum',
    'equivocation',
    'nonfallacy',
    'fallacy of logic',
]

# Apply the label subset to the train / test / dev splits.
fil_train_data, fil_test_data, fil_dev_data = filter_fallacies(
    train_dataset, test_dataset, dev_dataset, logical_fallacies_subset
)

# Preview the filtered training split.
fil_train_data.head()

In [None]:
# Display the label subset; list position is what the id mappings are built from.
logical_fallacies_subset

In [None]:
# Integer class id -> label name, numbered by position in the subset list.
id2label = dict(enumerate(logical_fallacies_subset))

# Inverse mapping: label name -> integer class id.
label2id = {label: idx for idx, label in id2label.items()}

print(label2id, id2label)

In [None]:
# Encode the filtered splits with the label -> id mapping. Later cells read the
# `logical_fallacies_id` and `sentiment_id` columns from these results
# (presumably added by this helper — confirm in utils.data_utils).
train_data, test_data, dev_data = encode_labels_sentiment(fil_train_data, fil_test_data, fil_dev_data, label2id)

In [None]:
# Romanian BERT (ro-bert) tokenizer.
# `model_max_length` is the tokenizer option that bounds sequence length; a bare
# `max_length` passed to `from_pretrained` is not interpreted as a length limit.
# NOTE(review): `hidden_dropout_prob` / `attention_probs_dropout_prob` used to be
# passed here, but they are model-config options and have no effect on a
# tokenizer. If 0.4 dropout is wanted, it has to be applied where the model is
# built — confirm what BertWithSentiment accepts.
tokenizer = BertTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-uncased-v1",
    model_max_length=512,
)

In [None]:
# `BertWithSentiment` is imported in the notebook's top import cell.

# Train on the GPU when one is visible, otherwise on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# One output class per label in the selected subset.
model = BertWithSentiment(
    "dumitrescustefan/bert-base-romanian-uncased-v1",
    num_labels=len(logical_fallacies_subset),
)

# Make model weights contiguous.
# NOTE: `name` is a notebook-level variable here and stays bound to the last
# parameter's name after the loop finishes.
for name, param in model.named_parameters():
    if param.is_contiguous():
        continue
    param.data = param.data.contiguous()

model.to(device)

In [None]:
# bert-base checkpoints have 512 position embeddings; truncating at a larger
# value (previously 1024) lets over-long texts through and they then fail with
# an out-of-range embedding lookup inside the model.
MAX_SEQ_LEN = 512

# Shared tokenizer options: pad every split to its longest sequence and cut
# anything longer than MAX_SEQ_LEN.
tokenize_kwargs = dict(padding=True, truncation=True, max_length=MAX_SEQ_LEN)

train_encodings = tokenizer(list(train_data['source_article_ro']), **tokenize_kwargs)
test_encodings = tokenizer(list(test_data['source_article_ro']), **tokenize_kwargs)
dev_encodings = tokenizer(list(dev_data['source_article_ro']), **tokenize_kwargs)

In [None]:
# Integer class ids for each split, as plain Python lists.
train_labels, test_labels, dev_labels = (
    list(split['logical_fallacies_id'])
    for split in (train_data, test_data, dev_data)
)

In [None]:
# Sentiment ids for each split, as plain Python lists.
train_sentiments, test_sentiments, dev_sentiments = (
    list(split['sentiment_id'])
    for split in (train_data, test_data, dev_data)
)

In [None]:
# Bundle encodings, labels and sentiment ids per split. `DataLoader` here is the
# project class from model.BERT (not torch's); its instances are handed to the
# Trainer as datasets further down.
train_dataloader, test_dataloader, dev_dataloader = (
    DataLoader(encodings, labels, sentiments)
    for encodings, labels, sentiments in (
        (train_encodings, train_labels, train_sentiments),
        (test_encodings, test_labels, test_sentiments),
        (dev_encodings, dev_labels, dev_sentiments),
    )
)

In [None]:
# Hyper-parameters come from the `model.params` section of the YAML config.
hparams = config['model']['params']

training_args = TrainingArguments(
    # Checkpoints and predictions are written here; logs go to a sub-directory.
    output_dir="./results",
    logging_dir="./results/logs",
    # Evaluate, log and checkpoint once per epoch.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # The learning rate is cast explicitly, so the config may hold it as a string.
    learning_rate=float(hparams['learning_rate']),
    per_device_train_batch_size=hparams['train_batch_size'],
    per_device_eval_batch_size=hparams['eval_batch_size'],
    num_train_epochs=hparams['epochs'],
    weight_decay=0.1,
    # Gradient clipping to prevent gradient explosion.
    max_grad_norm=1.0,
    log_level="warning",
    # Restore the best checkpoint once training ends.
    load_best_model_at_end=True,
)

# NOTE(review): this is set after the model was already moved to the device in
# an earlier cell — confirm it still takes effect at this point.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Stop early after 3 evaluations without improvement (per the original note,
# measured on evaluation loss).
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=dev_dataloader,
    compute_metrics=compute_metrics_wandb,
    callbacks=[early_stopping],
)

In [None]:
# Start training. Evaluation, logging and checkpointing run once per epoch, as
# configured in `training_args`; the early-stopping callback may end the run
# before all epochs have completed.
trainer.train()

In [None]:
# Plot the loss/accuracy curves under a timestamped name (the second argument
# is presumably the output filename — confirm in utils.data_utils).
# The earlier `name` prefix was never assigned on purpose: its only binding is
# the loop variable of the weight-contiguity loop, so the file ended up
# prefixed with a model parameter name. The run's start time is used instead.
plot = plot_training_curve(trainer, f"{current_time:%Y%m%d_%H%M%S}_loss_acc.png")

In [None]:
# Plot the learning curve under a timestamped name (the second argument is
# presumably the output filename — confirm in utils.data_utils).
# The earlier `name` prefix was never assigned on purpose: its only binding is
# the loop variable of the weight-contiguity loop, so the file ended up
# prefixed with a model parameter name. The run's start time is used instead.
plot = plot_learning_curve(trainer, f"{current_time:%Y%m%d_%H%M%S}_learning_curve.png")

In [None]:
# Close the wandb run that was opened with `wandb.init` near the top of the notebook.
wandb.finish()

In [None]:
# Destination file for the trained weights.
m_p = os.path.join("outputs", "model.pt")

# Create the parent directory on first use, then write only the state dict
# (the weights), not the full module object.
os.makedirs(os.path.dirname(m_p), exist_ok=True)
torch.save(obj=model.state_dict(), f=m_p)

In [None]:
# The tokenizer is saved as a directory of files next to the model weights.
tokenizer_path = os.path.join("outputs", "tokenizer")
tokenizer.save_pretrained(save_directory=tokenizer_path)