In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm

# install datasets
!pip install datasets
from datasets import load_dataset, load_metric, Dataset

# install transformers
!pip install transformers==4.28.0
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DefaultDataCollator



In [2]:
dataset = load_dataset("persiannlp/parsinlu_entailment")
dataset = dataset.filter(lambda example: example['label'] in ["c", "e", "n"])
dataset

DatasetDict({
    train: Dataset({
        features: ['sent1', 'sent2', 'category', 'label'],
        num_rows: 754
    })
    test: Dataset({
        features: ['sent1', 'sent2', 'category', 'label'],
        num_rows: 1673
    })
    validation: Dataset({
        features: ['sent1', 'sent2', 'category', 'label'],
        num_rows: 270
    })
})

# BERT

In [3]:
model_checkpoint = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

str_to_int = {"e": 0, "n": 1, "c": 2}
def tokenize_function(examples):
    tokenized_batch = tokenizer(examples["sent1"], examples["sent2"], truncation=True)
    tokenized_batch["label"] = [str_to_int[label] for label in examples["label"]]
    return tokenized_batch

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

Map:   0%|          | 0/1673 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DatasetDict({
    train: Dataset({
        features: ['sent1', 'sent2', 'category', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 754
    })
    test: Dataset({
        features: ['sent1', 'sent2', 'category', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1673
    })
    validation: Dataset({
        features: ['sent1', 'sent2', 'category', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 270
    })
})

In [None]:
training_args = TrainingArguments(
    output_dir="parsbert",
    evaluation_strategy="epoch",
    logging_steps = 20,
    learning_rate=5e-5,
    num_train_epochs=3,
    warmup_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy = "epoch",
    load_best_model_at_end=True,
    save_total_limit = 1,
    metric_for_best_model="accuracy",
    group_by_length = True,
    seed=0
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
trainer.evaluate(tokenized_datasets["test"])

# XLM-RoBERTa

In [None]:
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

In [None]:
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

str_to_int = {"e": 0, "n": 1, "c": 2}
def tokenize_function(examples):
    tokenized_batch = tokenizer(examples["sent1"], examples["sent2"], truncation=True, max_length=128)
    tokenized_batch["label"] = [str_to_int[label] for label in examples["label"]]
    return tokenized_batch

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

In [None]:
training_args = TrainingArguments(
    output_dir="xlm-r",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    logging_steps = 20,
    learning_rate=2e-5,
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    load_best_model_at_end=True,
    weight_decay=0.001,
    save_total_limit = 1,
    metric_for_best_model="accuracy",
    group_by_length = True,
    seed=0
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
trainer.evaluate(tokenized_datasets["test"])