In [1]:
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DefaultDataCollator
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from sklearn import metrics
from scipy.special import softmax

# Load Data

In [2]:
# Read the training spreadsheet into a DataFrame and show it as the cell output
# for a quick visual check of the columns.
train_xlsx_path = "Question2_Data/train.xlsx"
df = pd.read_excel(train_xlsx_path)
df

Unnamed: 0,source,targets,category
0,"When news is brought to one of them, of (the b...",و چون یکی از آنان را به [ولادت] دختر مژده دهند...,quran
1,After them repaired Zadok the son of Immer ove...,و چون دشمنان ما شنیدند که ما آگاه شده‌ایم و خد...,bible
2,And establish regular prayers at the two ends ...,و نماز را در دو طرف روز و ساعات نخستین شب برپا...,quran
3,"And it came to pass, that, when I was come aga...",و فرمود تا مدعیانش نزد تو حاضر شوند؛ و از او ب...,bible
4,"Ah woe, that Day, to the Rejecters of Truth!",وای در آن روز بر تکذیب کنندگان!,quran
...,...,...,...
12595,"Women impure are for men impure, and men impur...",زنان پلید برای مردان پلید و مردان پلید برای زن...,quran
12596,I don't want any silly dance given in my honour.',بنابراین حالا هم میل ندارم جشنی به افتخار من د...,mizan
12597,And the Earth will shine with the Glory of its...,و زمین به نور پروردگارش روشن می‌شود، و کتاب [ا...,quran
12598,"Then lifted I up mine eyes, and saw, and behol...",گفتم: «این چیست؟» او جواب داد: «این است آن ایف...,bible


In [3]:
# Build one Dataset per split from its spreadsheet and collect them in a
# DatasetDict keyed by split name ("train", "valid", "test"), in that order.
dataset = DatasetDict(
    {
        data_set: Dataset.from_pandas(pd.read_excel(f"Question2_Data/{data_set}.xlsx"))
        for data_set in ("train", "valid", "test")
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'targets', 'category'],
        num_rows: 12600
    })
    valid: Dataset({
        features: ['source', 'targets', 'category'],
        num_rows: 2700
    })
    test: Dataset({
        features: ['source', 'targets', 'category'],
        num_rows: 2700
    })
})

In [4]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_checkpoint = "xlm-roberta-base"

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sequence-classification model configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy rather than through
    ``datasets.load_metric``, which is deprecated in the ``datasets`` library,
    so no external metric script has to be loaded to evaluate the model.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the integer class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is a Python float equal
        to the fraction of rows whose arg-max class matches the label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=-1)
    # Mean of the element-wise match mask is the accuracy; cast to a plain
    # float so the value is a built-in type rather than a NumPy scalar.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# Category name -> integer class id; the position in the tuple is the id.
str_to_int = {name: idx for idx, name in enumerate(("quran", "bible", "mizan"))}
def tokenize_function(examples):
    """Tokenize the ``source`` texts of a batch and attach integer labels.

    Args:
        examples: A batch mapping with a ``"source"`` entry (texts passed to
            the tokenizer) and a ``"category"`` entry (category names).

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        per text, with an added ``"label"`` entry holding the ``str_to_int``
        id of each category.

    Raises:
        KeyError: If a category name is not a key of ``str_to_int``.
    """
    encoded = tokenizer(examples["source"], truncation=True, max_length=128)
    label_ids = []
    for category_name in examples["category"]:
        label_ids.append(str_to_int[category_name])
    encoded["label"] = label_ids
    return encoded

# Apply tokenize_function to every split, batch by batch.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Print one processed training record as a sanity check.
first_train_example = tokenized_datasets["train"][0]
print(first_train_example)
tokenized_datasets

# Fine-tuning configuration.
# NOTE(review): output_dir mentions "parsbert" while model_checkpoint in this
# cell is "xlm-roberta-base" - confirm the directory name is intentional.
# Restoring the best checkpoint at the end of training is left disabled; the
# settings previously sketched for it were load_best_model_at_end=True,
# save_strategy="epoch" and metric_for_best_model="accuracy".
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="q2_parsbert",
    save_total_limit=1,
    # Evaluation and logging cadence.
    evaluation_strategy="epoch",
    logging_steps=20,
    # Optimisation settings.
    learning_rate=3e-5,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    group_by_length=True,
    # Fixed seed for repeatable runs.
    seed=0,
)

# Train on the "train" split and evaluate on the "valid" split, reporting the
# values returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)
trainer.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

{'source': 'When news is brought to one of them, of (the birth of) a female (child), his face darkens, and he is filled with inward grief!', 'targets': 'و چون یکی از آنان را به [ولادت] دختر مژده دهند [از شدت خشم] چهره\u200cاش سیاه گردد، ودرونش از غصه واندوه لبریز و آکنده شود!!', 'category': 'quran', 'input_ids': [0, 14847, 7123, 83, 91048, 47, 1632, 111, 2856, 4, 111, 15, 2347, 127319, 111, 16, 10, 117776, 15, 206, 38472, 247, 1919, 2577, 43334, 1755, 4, 136, 764, 83, 152382, 678, 23, 19364, 10314, 4240, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}


The following columns in the training set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12600
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3940


Epoch,Training Loss,Validation Loss,Accuracy
1,0.268,0.150862,0.958889
2,0.1094,0.129471,0.971481
3,0.0813,0.173883,0.967037
4,0.0226,0.110631,0.98
5,0.0538,0.124389,0.98


Saving model checkpoint to q2_parsbert\checkpoint-500
Configuration saved in q2_parsbert\checkpoint-500\config.json
Model weights saved in q2_parsbert\checkpoint-500\pytorch_model.bin
tokenizer config file saved in q2_parsbert\checkpoint-500\tokenizer_config.json
Special tokens file saved in q2_parsbert\checkpoint-500\special_tokens_map.json
Deleting older checkpoint [q2_parsbert\checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2700
  Batch size = 16
Saving model checkpoint to q2_parsbert\checkpoint-1000
Configuration saved in q2_parsbert\checkpoint-1000\config.json
Model weights saved in q2_parsbert\checkpoint-1000\pytorch_model.bin


TrainOutput(global_step=3940, training_loss=0.15370832606405493, metrics={'train_runtime': 1128.6259, 'train_samples_per_second': 55.82, 'train_steps_per_second': 3.491, 'total_flos': 1082210153998176.0, 'train_loss': 0.15370832606405493, 'epoch': 5.0})

In [6]:
# Evaluate on the test split as currently held in tokenized_datasets and
# report the results under the "ID" heading.
test_split = tokenized_datasets["test"]
pred = trainer.predict(test_split)
print(trainer.evaluate(test_split))

# Predicted class = index of the largest logit in each row.
y_pred = pred.predictions.argmax(axis=-1)
print("ID")
y_true = test_split["label"]
print(classification_report(y_true, y_pred, target_names=str_to_int.keys()))

# Turn logits into per-class probabilities once and reuse them for both
# multi-class AUC variants (one-vs-rest, then one-vs-one).
class_probs = softmax(pred.predictions, axis=-1)
for auc_title, auc_scheme in (("AUC-ovr", "ovr"), ("AUC-ovo", "ovo")):
    print(auc_title, metrics.roc_auc_score(y_true, class_probs, multi_class=auc_scheme))

The following columns in the test set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2700
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2700
  Batch size = 16


{'eval_loss': 0.10795055329799652, 'eval_accuracy': 0.9825925925925926, 'eval_runtime': 11.3704, 'eval_samples_per_second': 237.46, 'eval_steps_per_second': 14.863, 'epoch': 5.0}
ID
              precision    recall  f1-score   support

       quran       0.99      0.97      0.98       900
       bible       0.97      0.99      0.98       900
       mizan       0.98      0.98      0.98       900

    accuracy                           0.98      2700
   macro avg       0.98      0.98      0.98      2700
weighted avg       0.98      0.98      0.98      2700

AUC-ovr 0.9987145061728396
AUC-ovo 0.9987145061728396


In [8]:
def tokenize_function(examples):
    """Tokenize the ``targets`` texts of a batch and attach integer labels.

    Args:
        examples: A batch mapping with a ``"targets"`` entry (texts passed to
            the tokenizer) and a ``"category"`` entry (category names).

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        per text, with an added ``"label"`` entry holding the ``str_to_int``
        id of each category.

    Raises:
        KeyError: If a category name is not a key of ``str_to_int``.
    """
    encoded = tokenizer(examples["targets"], truncation=True, max_length=128)
    label_ids = []
    for category_name in examples["category"]:
        label_ids.append(str_to_int[category_name])
    encoded["label"] = label_ids
    return encoded

# Re-tokenize every split with the tokenize_function defined in this cell;
# this rebinds tokenized_datasets, replacing the earlier tokenized splits.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Print one processed training record as a sanity check.
first_train_example = tokenized_datasets["train"][0]
print(first_train_example)
tokenized_datasets

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

{'source': 'When news is brought to one of them, of (the birth of) a female (child), his face darkens, and he is filled with inward grief!', 'targets': 'و چون یکی از آنان را به [ولادت] دختر مژده دهند [از شدت خشم] چهره\u200cاش سیاه گردد، ودرونش از غصه واندوه لبریز و آکنده شود!!', 'category': 'quran', 'input_ids': [0, 65, 17980, 8583, 270, 45903, 406, 178, 378, 3606, 159045, 268, 33514, 665, 20125, 6974, 29936, 378, 8428, 46687, 76127, 376, 268, 94699, 14524, 91080, 38803, 50, 141682, 900, 870, 270, 9475, 3092, 176, 3138, 13370, 176, 43720, 103347, 65, 4573, 1901, 12157, 1994, 1146, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}


DatasetDict({
    train: Dataset({
        features: ['source', 'targets', 'category', 'input_ids', 'attention_mask', 'label'],
        num_rows: 12600
    })
    valid: Dataset({
        features: ['source', 'targets', 'category', 'input_ids', 'attention_mask', 'label'],
        num_rows: 2700
    })
    test: Dataset({
        features: ['source', 'targets', 'category', 'input_ids', 'attention_mask', 'label'],
        num_rows: 2700
    })
})

In [9]:
# Evaluate on the test split as currently held in tokenized_datasets and
# report the results under the "OOD" heading.
test_split = tokenized_datasets["test"]
pred = trainer.predict(test_split)
print(trainer.evaluate(test_split))

# Predicted class = index of the largest logit in each row.
y_pred = pred.predictions.argmax(axis=-1)
print("OOD")
y_true = test_split["label"]
print(classification_report(y_true, y_pred, target_names=str_to_int.keys()))

# Turn logits into per-class probabilities once and reuse them for both
# multi-class AUC variants (one-vs-rest, then one-vs-one).
class_probs = softmax(pred.predictions, axis=-1)
for auc_title, auc_scheme in (("AUC-ovr", "ovr"), ("AUC-ovo", "ovo")):
    print(auc_title, metrics.roc_auc_score(y_true, class_probs, multi_class=auc_scheme))

The following columns in the test set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2700
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: category, targets, source. If category, targets, source are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2700
  Batch size = 16


{'eval_loss': 1.515942931175232, 'eval_accuracy': 0.7925925925925926, 'eval_runtime': 12.6424, 'eval_samples_per_second': 213.567, 'eval_steps_per_second': 13.368, 'epoch': 5.0}
OOD
              precision    recall  f1-score   support

       quran       0.79      0.76      0.78       900
       bible       0.82      0.64      0.72       900
       mizan       0.77      0.98      0.86       900

    accuracy                           0.79      2700
   macro avg       0.80      0.79      0.79      2700
weighted avg       0.80      0.79      0.79      2700

AUC-ovr 0.9203460905349795
AUC-ovo 0.9203460905349794


In [11]:
# Class balance of the test split: distinct label ids and how often each occurs.
test_labels = tokenized_datasets["test"]["label"]
np.unique(test_labels, return_counts=True)

(array([0, 1, 2]), array([900, 900, 900], dtype=int64))