In [3]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.2 MB 17.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 45.5 MB/s 
[K     |████████████████████████████████| 596 kB 53.0 MB/s 
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
[?25h

In [4]:
# Colab-only: mount Google Drive at /content/drive so that the dataset paths
# configured below (under /content/drive/MyDrive/...) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import torch
from torch import nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import time
from tqdm import tqdm
import string
from scipy.special import softmax

In [6]:
# All three splits are Excel workbooks stored in the same Google Drive folder.
_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Multilingual Classification'
train_path = _DATA_DIR + '/train.xlsx'
test_path = _DATA_DIR + '/test.xlsx'
valid_path = _DATA_DIR + '/valid.xlsx'

In [7]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

In [8]:
def source_preprocess(text):
  """Lower-case `text` and replace every ASCII punctuation character with a space.

  Each punctuation character becomes exactly one space, so the length of the
  string is preserved.
  """
  blank_out = {ord(symbol): ' ' for symbol in string.punctuation}
  return text.lower().translate(blank_out)

def target_preprocess(text):
  """Replace ASCII and Arabic-script punctuation in `text` with spaces.

  Unlike `source_preprocess`, the letter case of the input is left untouched.
  Each matched character becomes exactly one space.
  """
  to_blank = string.punctuation + "؟«»؛،"
  table = dict.fromkeys(map(ord, to_blank), ' ')
  return text.translate(table)

In [9]:
class CustomDataset(Dataset):
    """Builds a padded `TensorDataset` from the text columns of a DataFrame.

    The class does not implement ``__len__``/``__getitem__`` itself; call
    `get_dataset()` to obtain the tensors that are actually fed to a DataLoader.

    Args:
        dataframe: table with a ``'category'`` column plus every column named in `features`.
        labels: mapping from category name to integer class id. Rows whose category
            is not a key of this mapping are skipped.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)`` and
            ``sep_token_id``; ``pad_token_id`` is used for padding when available.
        features: column names whose text is encoded, in order.
        features_preprocess: one callable per entry of `features`, applied to the
            cell value before tokenization.
        max_len: fixed sequence length; longer sequences are truncated, shorter ones padded.
    """

    def __init__(self, dataframe, labels, tokenizer, features, features_preprocess, max_len):
        self.dataframe = dataframe
        self.labels = labels
        self.tokenizer = tokenizer
        self.features_preprocess = features_preprocess
        self.max_len = max_len
        self.features = features

    def _encode_row(self, row):
        """Tokenize the configured columns of one row, joined by the separator id, cut to `max_len`."""
        token_ids = []
        last_index = len(self.features) - 1
        for i, (feature, preprocess) in enumerate(zip(self.features, self.features_preprocess)):
            token_ids += self.tokenizer.encode(preprocess(row[feature]), add_special_tokens=False)
            if i < last_index:
                token_ids.append(self.tokenizer.sep_token_id)
        # NOTE(review): no leading CLS/<s> or trailing separator token is added, although
        # sequence-classification heads usually pool the first position — confirm this is intended.
        return token_ids[:self.max_len]

    def get_dataset(self):
        """Encode every labelled row and return ``TensorDataset(input_ids, attention_masks, targets)``."""
        # Pad with the tokenizer's own padding id; the value 1 used previously is only
        # correct for RoBERTa-style vocabularies. Fall back to 1 if the tokenizer has none.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 1

        input_ids = []
        attention_masks = []
        targets = []
        for _, row in self.dataframe.iterrows():
            if row['category'] not in self.labels:
                continue
            token_ids = self._encode_row(row)
            n_pad = self.max_len - len(token_ids)
            input_ids.append(token_ids + [pad_id] * n_pad)
            # 1 marks real tokens, 0 marks padding positions.
            attention_masks.append([1] * len(token_ids) + [0] * n_pad)
            targets.append(self.labels[row['category']])

        # An explicit integer dtype keeps an empty result from defaulting to float.
        return TensorDataset(
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_masks, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

def create_data_loader(dataframe, labels, tokenizer, features, features_preprocess, max_len, batch_size, shuffle=False):
    """Encode `dataframe` with `CustomDataset` and wrap the result in a `DataLoader`.

    Args:
        dataframe, labels, tokenizer, features, features_preprocess, max_len:
            forwarded unchanged to `CustomDataset`.
        batch_size: number of samples per batch.
        shuffle: when True the loader reshuffles the samples every epoch (useful for
            the training split). Defaults to False, which keeps the previous
            behaviour of iterating in DataFrame order.

    Returns:
        A `DataLoader` yielding ``(input_ids, attention_mask, targets)`` batches.
    """
    dataset = CustomDataset(dataframe, labels, tokenizer, features, features_preprocess, max_len)
    return DataLoader(dataset.get_dataset(), batch_size=batch_size, shuffle=shuffle)

In [10]:
def eval(model, dataloader, data_len, device):
  """Run `model` over `dataloader` without gradient tracking and collect metrics.

  NOTE: this name shadows Python's built-in ``eval``; it is kept because callers use it.

  Args:
    model: called as ``model(input_ids, attention_mask=..., labels=...)``; the result
      must expose ``.logits`` and ``.loss``.
    dataloader: iterable of ``(input_ids, attention_mask, targets)`` tensor batches.
    data_len: denominator of the accuracy. It must equal the number of samples the
      dataloader actually yields, otherwise the reported accuracy is skewed.
    device: device the batch tensors are moved to before the forward pass.

  Returns:
    ``(y_probs, y_preds, y_trues, acc, loss)`` where
    y_probs is the row-wise softmax of the logits (``None`` if no batch was seen),
    y_preds / y_trues are float64 arrays of predicted / true class ids,
    acc is a Python float (0.0 when `data_len` is 0), and
    loss is the mean of the per-batch losses (NaN if no batch was seen).
  """
  model.eval()
  prob_batches = []
  # Seeding with an empty float64 array keeps the historical float dtype of the
  # returned arrays and yields an empty array when the dataloader is empty.
  pred_batches = [np.array([])]
  true_batches = [np.array([])]
  losses = []
  correct_preds = 0
  with torch.no_grad():
    for d in tqdm(dataloader):
      input_ids = d[0].to(device)
      attention_mask = d[1].to(device)
      targets = d[2].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
      logits = outputs.logits
      preds = logits.argmax(dim=1)
      prob_batches.append(softmax(logits.cpu().tolist(), axis=1))
      pred_batches.append(preds.cpu().numpy())
      true_batches.append(targets.cpu().numpy())
      correct_preds += int((preds == targets).sum().item())
      losses.append(outputs.loss.item())
  # Concatenate once at the end instead of re-copying the accumulated arrays per batch.
  y_probs = np.concatenate(prob_batches, axis=0) if prob_batches else None
  y_preds = np.concatenate(pred_batches, axis=0)
  y_trues = np.concatenate(true_batches, axis=0)
  acc = correct_preds / data_len if data_len else 0.0
  loss = np.mean(losses) if losses else float("nan")
  return y_probs, y_preds, y_trues, acc, loss

In [11]:
def train(model, train_dataloader, eval_dataloader, train_data_len, eval_data_len, device, epochs):
  """Fine-tune `model` for `epochs` passes and print per-epoch train/validation metrics.

  Args:
    model: called as ``model(input_ids, attention_mask=..., labels=...)``; the result
      must expose ``.logits`` and ``.loss``. Its parameters are updated in place.
    train_dataloader: iterable of ``(input_ids, attention_mask, targets)`` training batches.
    eval_dataloader: validation batches, passed to `eval` after every epoch.
    train_data_len: denominator of the training accuracy; should equal the number of
      samples `train_dataloader` yields per epoch.
    eval_data_len: denominator of the validation accuracy, forwarded to `eval`.
    device: device the batch tensors are moved to.
    epochs: number of passes over `train_dataloader`.

  Returns:
    None; one summary line per epoch is printed.
  """
  # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
  # pinned to the defaults of the transformers implementation so that the optimisation
  # hyper-parameters stay the same.
  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)
  for epoch in range(epochs):
    start = time.time()
    model.train()
    losses = []
    correct_preds = 0
    for d in tqdm(train_dataloader):
      input_ids = d[0].to(device)
      attention_mask = d[1].to(device)
      targets = d[2].to(device)
      # Clear gradients first so nothing left over from earlier work leaks into this step.
      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      preds = outputs.logits.argmax(dim=1)
      correct_preds += int((preds == targets).sum().item())
      losses.append(loss.item())
    train_acc = correct_preds / train_data_len if train_data_len else 0.0
    train_loss = np.mean(losses) if losses else float("nan")
    _, _, _, eval_acc, eval_loss = eval(model, eval_dataloader, eval_data_len, device)
    iter_time = time.time() - start
    print(f"epoch {epoch + 1} -- train accuracy: {train_acc}, train loss: {train_loss}, validation accuracy: {eval_acc}, validation loss: {eval_loss}, epoch time: {int(iter_time)} (s)")

In [12]:
# Read the three dataset splits (train, test, validation) from their Excel workbooks.
train_df, test_df, valid_df = (
    pd.read_excel(split_path) for split_path in (train_path, test_path, valid_path)
)
# Category name -> integer class id used as the classification target.
labels = {
    "quran": 0,
    "bible": 1,
    "mizan": 2,
}

In [14]:
# Experiment: classify using the 'source' column only, fine-tuning distilroberta-base.
model_name = 'distilroberta-base'
features, preprocessors = ['source'], [source_preprocess]
max_len, batch_size, epochs = 128, 32, 10

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_loader = create_data_loader(train_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
eval_loader = create_data_loader(valid_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
test_loader = create_data_loader(test_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
model.to(device)

# Accuracy denominators come from the encoded datasets rather than the raw DataFrames:
# rows whose category is not in `labels` are dropped while the loaders are built.
train(model, train_loader, eval_loader, len(train_loader.dataset), len(eval_loader.dataset), device, epochs)
y_probs, y_preds, y_trues, test_acc, test_loss = eval(model, test_loader, len(test_loader.dataset), device)
print(f"test accuracy: {test_acc}, test loss: {test_loss}")
print()
print(classification_report(y_trues.tolist(), y_preds.tolist(), labels=list(labels.values()), target_names=list(labels.keys())))

# One-hot encode the true class ids for the multi-class ROC AUC.
y_trues_onehot = [[int(k == int(y_true)) for k in range(len(labels))] for y_true in y_trues.tolist()]
print(roc_auc_score(y_trues_onehot, y_probs.tolist()))

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 

epoch 1 -- train accuracy: 0.8817459940910339, train loss: 0.3027635833938715, validation accuracy: 0.9388889074325562, validation loss: 0.16611239746322526, epoch time: 134 (s)


100%|██████████| 394/394 [02:12<00:00,  2.98it/s]
100%|██████████| 85/85 [00:09<00:00,  9.10it/s]


epoch 2 -- train accuracy: 0.9562698006629944, train loss: 0.12782378796237487, validation accuracy: 0.9544444680213928, validation loss: 0.1387679290437304, epoch time: 141 (s)


100%|██████████| 394/394 [02:14<00:00,  2.93it/s]
100%|██████████| 85/85 [00:09<00:00,  9.01it/s]


epoch 3 -- train accuracy: 0.9742856621742249, train loss: 0.0773935271102018, validation accuracy: 0.9518518447875977, validation loss: 0.1598546664661947, epoch time: 144 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  8.97it/s]


epoch 4 -- train accuracy: 0.9825396537780762, train loss: 0.054035907578158196, validation accuracy: 0.9529629349708557, validation loss: 0.17840492739033995, epoch time: 145 (s)


100%|██████████| 394/394 [02:15<00:00,  2.90it/s]
100%|██████████| 85/85 [00:09<00:00,  9.02it/s]


epoch 5 -- train accuracy: 0.989444375038147, train loss: 0.0319886847938704, validation accuracy: 0.962592601776123, validation loss: 0.15189235407400273, epoch time: 145 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  9.01it/s]


epoch 6 -- train accuracy: 0.9913491606712341, train loss: 0.025349220020391004, validation accuracy: 0.9559259414672852, validation loss: 0.1594332077584969, epoch time: 144 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  9.03it/s]


epoch 7 -- train accuracy: 0.9919047355651855, train loss: 0.023930799382012504, validation accuracy: 0.9618518352508545, validation loss: 0.1551444893678897, epoch time: 144 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  9.03it/s]


epoch 8 -- train accuracy: 0.9946824908256531, train loss: 0.015960794676678505, validation accuracy: 0.9577777981758118, validation loss: 0.15086497230736046, epoch time: 144 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  9.03it/s]


epoch 9 -- train accuracy: 0.9932539463043213, train loss: 0.020318056168734937, validation accuracy: 0.9611111283302307, validation loss: 0.14836927055884563, epoch time: 144 (s)


100%|██████████| 394/394 [02:15<00:00,  2.91it/s]
100%|██████████| 85/85 [00:09<00:00,  9.01it/s]


epoch 10 -- train accuracy: 0.9962698221206665, train loss: 0.01232254984794506, validation accuracy: 0.9603703618049622, validation loss: 0.18139371232537088, epoch time: 144 (s)


100%|██████████| 85/85 [00:09<00:00,  9.03it/s]

test accuracy: 0.9685184955596924, test loss: 0.14640078320568564

              precision    recall  f1-score   support

       quran       0.96      0.97      0.97       900
       bible       0.96      0.98      0.97       900
       mizan       0.98      0.95      0.97       900

    accuracy                           0.97      2700
   macro avg       0.97      0.97      0.97      2700
weighted avg       0.97      0.97      0.97      2700

0.9965351851851851





In [16]:
# Experiment: classify using the 'targets' column only, fine-tuning ParsBERT.
model_name = 'HooshvareLab/bert-base-parsbert-uncased'
features, preprocessors = ['targets'], [target_preprocess]
max_len, batch_size, epochs = 128, 32, 10

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_loader = create_data_loader(train_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
eval_loader = create_data_loader(valid_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
test_loader = create_data_loader(test_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
model.to(device)

# Accuracy denominators come from the encoded datasets rather than the raw DataFrames:
# rows whose category is not in `labels` are dropped while the loaders are built.
train(model, train_loader, eval_loader, len(train_loader.dataset), len(eval_loader.dataset), device, epochs)
y_probs, y_preds, y_trues, test_acc, test_loss = eval(model, test_loader, len(test_loader.dataset), device)
print(f"test accuracy: {test_acc}, test loss: {test_loss}")
print()
print(classification_report(y_trues.tolist(), y_preds.tolist(), labels=list(labels.values()), target_names=list(labels.keys())))

# One-hot encode the true class ids for the multi-class ROC AUC.
y_trues_onehot = [[int(k == int(y_true)) for k in range(len(labels))] for y_true in y_trues.tolist()]
print(roc_auc_score(y_trues_onehot, y_probs.tolist()))

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

epoch 1 -- train accuracy: 0.9173809289932251, train loss: 0.23440470377002195, validation accuracy: 0.9496296048164368, validation loss: 0.14247339216663557, epoch time: 280 (s)


100%|██████████| 394/394 [04:33<00:00,  1.44it/s]
100%|██████████| 85/85 [00:21<00:00,  4.04it/s]


epoch 2 -- train accuracy: 0.9835714101791382, train loss: 0.05329522771434622, validation accuracy: 0.9548147916793823, validation loss: 0.15390689375344663, epoch time: 294 (s)


100%|██████████| 394/394 [04:34<00:00,  1.43it/s]
100%|██████████| 85/85 [00:21<00:00,  4.02it/s]


epoch 3 -- train accuracy: 0.9927777647972107, train loss: 0.02354339202987101, validation accuracy: 0.9507407546043396, validation loss: 0.2062645820544704, epoch time: 296 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.07it/s]


epoch 4 -- train accuracy: 0.9924602508544922, train loss: 0.024050804120541648, validation accuracy: 0.9559259414672852, validation loss: 0.20557230056660689, epoch time: 295 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.07it/s]


epoch 5 -- train accuracy: 0.9957935810089111, train loss: 0.013576697874613353, validation accuracy: 0.9522222280502319, validation loss: 0.19389825715505354, epoch time: 295 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.07it/s]


epoch 6 -- train accuracy: 0.9973809123039246, train loss: 0.008485944200083164, validation accuracy: 0.9585185050964355, validation loss: 0.1999495416155482, epoch time: 295 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.07it/s]


epoch 7 -- train accuracy: 0.9966666102409363, train loss: 0.010207535408858001, validation accuracy: 0.9618518352508545, validation loss: 0.154112584869076, epoch time: 294 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.05it/s]


epoch 8 -- train accuracy: 0.9983332753181458, train loss: 0.0050290050529270625, validation accuracy: 0.9633333086967468, validation loss: 0.18110515704765395, epoch time: 295 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.09it/s]


epoch 9 -- train accuracy: 0.99698406457901, train loss: 0.01084776659673263, validation accuracy: 0.9629629850387573, validation loss: 0.188694906098638, epoch time: 294 (s)


100%|██████████| 394/394 [04:34<00:00,  1.44it/s]
100%|██████████| 85/85 [00:20<00:00,  4.07it/s]


epoch 10 -- train accuracy: 0.9973809123039246, train loss: 0.007832258763956644, validation accuracy: 0.9592592716217041, validation loss: 0.18659333516124582, epoch time: 295 (s)


100%|██████████| 85/85 [00:20<00:00,  4.07it/s]

test accuracy: 0.9577777981758118, test loss: 0.19726977458665393

              precision    recall  f1-score   support

       quran       0.96      0.96      0.96       900
       bible       0.98      0.94      0.96       900
       mizan       0.93      0.97      0.95       900

    accuracy                           0.96      2700
   macro avg       0.96      0.96      0.96      2700
weighted avg       0.96      0.96      0.96      2700

0.9957053497942386





In [17]:
# Experiment: classify using both the 'source' and 'targets' columns (joined by the
# tokenizer's separator id), fine-tuning xlm-roberta-base.
model_name = 'xlm-roberta-base'
features, preprocessors = ['source', 'targets'], [source_preprocess, target_preprocess]
max_len, batch_size, epochs = 128, 32, 10

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_loader = create_data_loader(train_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
eval_loader = create_data_loader(valid_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
test_loader = create_data_loader(test_df, labels, tokenizer, features, preprocessors, max_len, batch_size)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
model.to(device)

# Accuracy denominators come from the encoded datasets rather than the raw DataFrames:
# rows whose category is not in `labels` are dropped while the loaders are built.
train(model, train_loader, eval_loader, len(train_loader.dataset), len(eval_loader.dataset), device, epochs)
y_probs, y_preds, y_trues, test_acc, test_loss = eval(model, test_loader, len(test_loader.dataset), device)
print(f"test accuracy: {test_acc}, test loss: {test_loss}")
print()
print(classification_report(y_trues.tolist(), y_preds.tolist(), labels=list(labels.values()), target_names=list(labels.keys())))

# One-hot encode the true class ids for the multi-class ROC AUC.
y_trues_onehot = [[int(k == int(y_true)) for k in range(len(labels))] for y_true in y_trues.tolist()]
print(roc_auc_score(y_trues_onehot, y_probs.tolist()))

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

epoch 1 -- train accuracy: 0.8938888311386108, train loss: 0.2884959740715508, validation accuracy: 0.9699999690055847, validation loss: 0.08951005666407154, epoch time: 318 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.28it/s]


epoch 2 -- train accuracy: 0.9746031165122986, train loss: 0.08028216300848966, validation accuracy: 0.9733333587646484, validation loss: 0.09048174690071713, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


epoch 3 -- train accuracy: 0.9834126830101013, train loss: 0.0534714408653793, validation accuracy: 0.9748148322105408, validation loss: 0.0871463113457567, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.25it/s]


epoch 4 -- train accuracy: 0.9892063140869141, train loss: 0.03419420129349479, validation accuracy: 0.9803703427314758, validation loss: 0.0844939000055413, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


epoch 5 -- train accuracy: 0.9924602508544922, train loss: 0.02441719312066872, validation accuracy: 0.9774073958396912, validation loss: 0.08037664095583537, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


epoch 6 -- train accuracy: 0.9934920072555542, train loss: 0.022255993803981088, validation accuracy: 0.9751851558685303, validation loss: 0.11264248963390641, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


epoch 7 -- train accuracy: 0.9888888597488403, train loss: 0.035879100975462326, validation accuracy: 0.9770370125770569, validation loss: 0.0798188859343474, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.25it/s]


epoch 8 -- train accuracy: 0.9949999451637268, train loss: 0.015557384288628694, validation accuracy: 0.9729629755020142, validation loss: 0.09980335860163905, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.25it/s]


epoch 9 -- train accuracy: 0.9971427917480469, train loss: 0.008731466723595057, validation accuracy: 0.978518545627594, validation loss: 0.09994845122804262, epoch time: 317 (s)


100%|██████████| 394/394 [04:57<00:00,  1.32it/s]
100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


epoch 10 -- train accuracy: 0.9937301278114319, train loss: 0.01918573248442995, validation accuracy: 0.9774073958396912, validation loss: 0.10189671605424819, epoch time: 317 (s)


100%|██████████| 85/85 [00:19<00:00,  4.26it/s]


test accuracy: 0.9740740656852722, test loss: 0.10600230747145628

              precision    recall  f1-score   support

       quran       0.95      0.99      0.97       900
       bible       0.99      0.99      0.99       900
       mizan       0.99      0.95      0.97       900

    accuracy                           0.97      2700
   macro avg       0.97      0.97      0.97      2700
weighted avg       0.97      0.97      0.97      2700

0.9984164609053497
