In [74]:
import torch
import pandas as pd
from torch.utils.data import Dataset

In [75]:
train_df = pd.read_csv('../data/huggingface/train.csv')
test_df = pd.read_csv('../data/huggingface/test.csv')
dev_df = pd.read_csv('../data/huggingface/dev.csv')

In [76]:
train_df.head()

Unnamed: 0,config,source_article,logical_fallacies,source_article_ro
0,edu,"company's slogan ""Expect More. Pay Less.""",appeal to emotion,sloganul companiei „Așteptați mai mult. Plătiț...
1,edu,"The bigger a child's shoe size, the better the...",false causality,Cu cât mărimea pantofilor unui copil este mai ...
2,edu,"Since many people believe this, then it must b...",ad populum,"Din moment ce mulți oameni cred asta, atunci t..."
3,edu,Senator Randall isn't lying when she says she ...,circular reasoning,Senatorul Randall nu minte când spune că îi pa...
4,edu,A mother is telling her daughter that she went...,fallacy of relevance,O mamă îi spune fiicei ei că și-a analizat dat...


In [77]:
train_dataset = train_df[['logical_fallacies', 'source_article_ro']]
test_dataset = test_df[['logical_fallacies', 'source_article_ro']]
dev_dataset = dev_df[['logical_fallacies', 'source_article_ro']]

In [78]:
train_dataset.head()

Unnamed: 0,logical_fallacies,source_article_ro
0,appeal to emotion,sloganul companiei „Așteptați mai mult. Plătiț...
1,false causality,Cu cât mărimea pantofilor unui copil este mai ...
2,ad populum,"Din moment ce mulți oameni cred asta, atunci t..."
3,circular reasoning,Senatorul Randall nu minte când spune că îi pa...
4,fallacy of relevance,O mamă îi spune fiicei ei că și-a analizat dat...


In [79]:
train_dataset['logical_fallacies'].value_counts()

logical_fallacies
faulty generalization     401
intentional               321
ad hominem                289
appeal to emotion         217
false causality           212
ad populum                209
fallacy of credibility    200
fallacy of logic          176
fallacy of relevance      175
false dilemma             143
circular reasoning        140
fallacy of extension      139
equivocation               58
Name: count, dtype: int64

In [80]:
test_dataset['logical_fallacies'].value_counts()

logical_fallacies
faulty generalization     89
intentional               60
ad hominem                57
appeal to emotion         49
fallacy of relevance      46
fallacy of credibility    37
ad populum                35
false causality           34
fallacy of logic          31
fallacy of extension      26
circular reasoning        20
false dilemma             18
equivocation               9
Name: count, dtype: int64

In [81]:
dev_dataset['logical_fallacies'].value_counts()

logical_fallacies
intentional               99
faulty generalization     84
ad hominem                52
ad populum                51
fallacy of relevance      44
false causality           43
appeal to emotion         42
fallacy of extension      40
fallacy of logic          33
false dilemma             29
fallacy of credibility    26
circular reasoning        18
equivocation               9
Name: count, dtype: int64

In [82]:
logical_fallacies = ['faulty generalization', 'intentional', 'ad hominem']
train_data = train_dataset[train_dataset.logical_fallacies.isin(logical_fallacies)]
test_data = test_dataset[test_dataset.logical_fallacies.isin(logical_fallacies)]
dev_data = dev_dataset[dev_dataset.logical_fallacies.isin(logical_fallacies)]

In [83]:
len(train_data), len(test_data), len(dev_data)

(1011, 206, 235)

In [84]:
num_labels = len(logical_fallacies)

id2label = {id: label for id, label in enumerate(logical_fallacies)}

label2id = {label: id for id, label in enumerate(logical_fallacies)}

In [85]:
label2id

{'faulty generalization': 0, 'intentional': 1, 'ad hominem': 2}

In [86]:
id2label

{0: 'faulty generalization', 1: 'intentional', 2: 'ad hominem'}

In [87]:
train_data.loc[:, 'logical_fallacies_id'] = train_data['logical_fallacies'].apply(lambda x: label2id[x.strip()])
test_data.loc[:, "logical_fallacies_id"] = test_data['logical_fallacies'].apply(lambda x: label2id[x.strip()])
dev_data.loc[:, "logical_fallacies_id"] = dev_data['logical_fallacies'].apply(lambda x: label2id[x.strip()])
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'logical_fallacies_id'] = train_data['logical_fallacies'].apply(lambda x: label2id[x.strip()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[:, "logical_fallacies_id"] = test_data['logical_fallacies'].apply(lambda x: label2id[x.strip()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

Unnamed: 0,logical_fallacies,source_article_ro,logical_fallacies_id
6,faulty generalization,Dacă interzicem Hummer-urile pentru că sunt dă...,0
8,faulty generalization,"""Ai întârziat în seara asta. Trebuie să înșeli!""",0
11,ad hominem,Vă opuneți propunerii unui senator de a extind...,2
27,faulty generalization,Toți jucătorii de fotbal sunt proști.,0
28,faulty generalization,"""Verișoara mea a spus că cursul ei de matemati...",0


In [88]:
from transformers import BertTokenizerFast, BertTokenizer

# ro-bert
tokenizer = BertTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", max_length=512)

In [89]:
from transformers import BertForSequenceClassification
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
model = BertForSequenceClassification.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1",
                                                      num_labels=num_labels,
                                                      id2label=id2label, label2id=label2id)

# Make model weights contiguous
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [90]:
train_encodings = tokenizer(list(train_data['source_article_ro']), padding=True, truncation=True, max_length=1024)
test_encodings = tokenizer(list(test_data['source_article_ro']), padding=True, truncation=True, max_length=1024)
dev_encodings = tokenizer(list(dev_data['source_article_ro']), padding=True, truncation=True, max_length=1024)

In [91]:
train_labels = list(train_data['logical_fallacies_id'])
test_labels = list(test_data['logical_fallacies_id'])
dev_labels = list(dev_data['logical_fallacies_id'])

In [92]:
class DataLoader(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """

    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels[idx])
        return item
        
        # item = {key: torch.tensor(val[idx]).contiguous() for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx]).contiguous()
        # return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)

In [93]:
train_dataloader = DataLoader(train_encodings, train_labels)
test_dataloader = DataLoader(test_encodings, test_labels)
dev_dataloader = DataLoader(dev_encodings, dev_labels)

In [94]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of
              that observation belonging to a certain class.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids

    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [95]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    # The pre-trained model that will be fine-tuned
    model=model,
    # Training arguments that we defined above
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=dev_dataloader,
    compute_metrics=compute_metrics
)

In [96]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.709409,0.710638,0.702066,0.703559,0.706756
2,No log,0.606049,0.757447,0.756657,0.764947,0.751045
3,No log,0.598689,0.778723,0.776729,0.780145,0.77701


TrainOutput(global_step=192, training_loss=0.6553902626037598, metrics={'train_runtime': 3188.0858, 'train_samples_per_second': 0.951, 'train_steps_per_second': 0.06, 'total_flos': 397452859320510.0, 'train_loss': 0.6553902626037598, 'epoch': 3.0})

In [97]:
q = [trainer.evaluate(eval_dataset=dataloader) for dataloader in [dev_dataloader, test_dataloader]]
print(q)
results_df = pd.DataFrame(q, index=["dev", "test"])

[{'eval_loss': 0.5986889004707336, 'eval_Accuracy': 0.7787234042553192, 'eval_F1': 0.7767288824174754, 'eval_Precision': 0.780145442245899, 'eval_Recall': 0.777010027010027, 'eval_runtime': 75.2596, 'eval_samples_per_second': 3.123, 'eval_steps_per_second': 0.199, 'epoch': 3.0}, {'eval_loss': 0.727419376373291, 'eval_Accuracy': 0.6504854368932039, 'eval_F1': 0.6522906407250133, 'eval_Precision': 0.6488706323096018, 'eval_Recall': 0.6637624022603325, 'eval_runtime': 65.4869, 'eval_samples_per_second': 3.146, 'eval_steps_per_second': 0.199, 'epoch': 3.0}]


In [98]:
print(results_df.iloc[:, :5])

      eval_loss  eval_Accuracy   eval_F1  eval_Precision  eval_Recall
dev    0.598689       0.778723  0.776729        0.780145     0.777010
test   0.727419       0.650485  0.652291        0.648871     0.663762


In [99]:
def predict(text):
    """
    Predicts the class label for a given input text

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text.
        pred_label_idx (torch.Tensor): The index of the predicted class label.
        pred_label (str): The predicted class label.
    """
    # Tokenize the input text and move tensors to the GPU if available
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")

    # Get model output (logits)
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    """ Explanation outputs: The BERT model returns a tuple containing the output logits (and possibly other elements depending on the model configuration). In this case, the output logits are the first element in the tuple, which is why we access it using outputs[0].

    outputs[0]: This is a tensor containing the raw output logits for each class. The shape of the tensor is (batch_size, num_classes) where batch_size is the number of input samples (in this case, 1, as we are predicting for a single input text) and num_classes is the number of target classes.

    softmax(1): The softmax function is applied along dimension 1 (the class dimension) to convert the raw logits into class probabilities. Softmax normalizes the logits so that they sum to 1, making them interpretable as probabilities. """

    # Get the index of the class with the highest probability
    # argmax() finds the index of the maximum value in the tensor along a specified dimension.
    # By default, if no dimension is specified, it returns the index of the maximum value in the flattened tensor.
    pred_label_idx = probs.argmax()

    # Now map the predicted class index to the actual class label
    # Since pred_label_idx is a tensor containing a single value (the predicted class index),
    # the .item() method is used to extract the value as a scalar
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [100]:
model_path = "logical-fallacies-bert-3-classes-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('logical-fallacies-bert-3-classes-model\\tokenizer_config.json',
 'logical-fallacies-bert-3-classes-model\\special_tokens_map.json',
 'logical-fallacies-bert-3-classes-model\\vocab.txt',
 'logical-fallacies-bert-3-classes-model\\added_tokens.json')

In [109]:
from transformers import pipeline

model_path = "logical-fallacies-bert-3-classes-model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [110]:
nlp("Toate florile nu rămân deschise pentru totdeauna. Trandafirii sunt un tip de plante. Prin urmare, toate plantele nu rămân deschise pentru totdeauna..")

# faulty generalization

[{'label': 'faulty generalization', 'score': 0.9594514966011047}]

In [111]:
nlp("Din moment ce nimeni nu-mi poate arăta o floare care rămâne deschisă pentru totdeauna, toate florile nu rămân deschise pentru totdeauna.")

# intentional

[{'label': 'faulty generalization', 'score': 0.9295602440834045}]

In [112]:
nlp("Toate produsele electronice au nevoie de electricitate.")
# faulty generalization

[{'label': 'faulty generalization', 'score': 0.879109263420105}]

In [105]:
nlp("Jake susține că toți cei din familia lui nu au fost niciodată în Europa. Dar Jake este un bărbat needucat, așa că afirmația lui trebuie să fie falsă.")
# ad hominem

[{'label': 'ad hominem', 'score': 0.7307600378990173}]

In [106]:
nlp("Nu există rapoarte care să ateste că cineva a avut probleme cu nerespectarea regulilor rutiere; prin urmare, toți oamenii ar trebui să respecte regulile de drum.")

# intentional

[{'label': 'faulty generalization', 'score': 0.7639155387878418}]