# Environment Setup

In [1]:
!pip install datasets
!pip install -U sentence-transformers
!pip install transformers
!pip install farasapy
!pip install arabert
# !pip install openai

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

# Load dataset

In [2]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# load dataset
dataset = load_dataset("NoraAlt/Mawqif_Stance-Detection")

# convert to pandas dataframe
df = pd.DataFrame({k: dataset['train'][k] for k, _ in dataset['train'].features.items()})
df['stance'] = df['stance'].apply(lambda x: "Neutral" if x is None else x)

# train test split
train_df, test_df = train_test_split(df, test_size=500, random_state=12345)

# train val split
train_df, val_df = train_test_split(train_df, test_size=300, random_state=12345)

# print sizes
print(f"train length: {len(train_df)}")
print(f"test length: {len(test_df)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3502 [00:00<?, ? examples/s]

train length: 2702
test length: 500


In [34]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModel
import transformers
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
# from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import classification_report
from arabert.preprocess import ArabertPreprocessor
import gc
import re


class MawqifDataset(torch.utils.data.Dataset):
  def __init__(self, df, tokenizer, model_name, task='stance'):
    self.labelsIds = {'Neutral': 0, 'Against': 1, 'Favor': 2}
    self.sent_labels = {'Neutral': 0, 'Negative': 1, 'Positive': 2}
    self.task = task
    if task == 'stance':
      self.labels = [self.labelsIds[label] for label in df['stance']]
    elif task == 'both':
      self.labels = [[self.labelsIds[label] for label in df['stance']], [self.sent_labels[label] for label in df['sentiment']]]
    self.targets = df['target'].tolist()
    if model_name in ["aubmindlab/bert-base-arabertv02-twitter", "aubmindlab/bert-base-arabertv2"]:
      arabert_prep = ArabertPreprocessor(model_name=model_name)
      texts = [arabert_prep.preprocess(t) for t in df['text'].tolist()]
    else:
      texts = df['text'].tolist()
    self.texts = [tokenizer(self.targets[i], text, padding='max_length', max_length = 128, truncation=True, return_tensors="pt") for i, text in enumerate(texts)]

  def classes(self):
    return self.labels

  def __len__(self):
    return len(self.texts)

  def get_batch_labels(self, idx):
    if self.task == 'both':
        return np.array([self.labels[0][idx], self.labels[1][idx]])
    return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
    return self.texts[idx]

  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)
    return batch_texts, batch_y

# Models

In [4]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModel
import transformers
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
class PSUMClassifier(nn.Module):
  def __init__(self, model_name= "UBC-NLP/MARBERT", n_layers=4, n_classes=3, max_length=128):
    super(PSUMClassifier, self).__init__()
    self.config = AutoConfig.from_pretrained(model_name)

    self.bert = AutoModel.from_pretrained(model_name)

    self.bertLayers = nn.ModuleList()
    self.linears = nn.ModuleList()
    self.n_layers = n_layers

    for i in range(n_layers):
      self.bertLayers.append(transformers.BertLayer(self.config))
      self.linears.append(nn.Linear(768, n_classes))


  def forward(self, input_id, mask):
    hidden_states = self.bert(input_ids= input_id, attention_mask=mask, return_dict=True, output_hidden_states=True)['hidden_states']
    final_outputs = []

    for i in range(self.n_layers):
      final_outputs.append(self.linears[i](self.bertLayers[i](hidden_states[-i-1])[0][:,0,:]))


    return tuple(final_outputs)

  def loss(self, output, labels, criterion):
    bloss = 0.0
    for i in range(self.n_layers):
      bloss += criterion(output[i], labels.long())
    return bloss

  def aggregate(self, output):
    agg = output[0]
    for i in range(1, len(output)):
      agg += output[i]
    return agg / len(output)

  def calcAcc(self, output, labels):
    return (self.aggregate(output).argmax(dim=1) == labels).sum().item()

  def evaluate(self, output, test_label, preds, golds):
    preds += self.aggregate(output).argmax(dim=1).to("cpu").tolist()
    golds += test_label.to("cpu").tolist()
    return preds, golds

  def evaluation_report(self, preds, golds, is_f1pn=False):
    print(classification_report(golds, preds, digits=4))
    if is_f1pn:
      r = classification_report(golds, preds, digits=4, output_dict=True)
      f1pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      print(f'F1PN: {f1pn:.4f}')

In [6]:
class HSUMClassifier(nn.Module):
  def __init__(self, model_name= "UBC-NLP/MARBERT",n_layers=4, n_classes=3, max_length=128):
    super(HSUMClassifier, self).__init__()
    self.config = AutoConfig.from_pretrained(model_name)

    self.bert = AutoModel.from_pretrained(model_name)

    self.bertLayers = nn.ModuleList()
    self.linears = nn.ModuleList()
    self.n_layers = n_layers

    for i in range(n_layers):
      self.bertLayers.append(transformers.BertLayer(self.config))
      self.linears.append(nn.Linear(768, n_classes))


  def forward(self, input_id, mask):
    hidden_states = self.bert(input_ids= input_id, attention_mask=mask, return_dict=True, output_hidden_states=True)['hidden_states']
    final_outputs = []
    berts_outputs = []

    berts_outputs.append(self.bertLayers[0](hidden_states[-1]))
    final_outputs.append(self.linears[0](berts_outputs[-1][0][:,0,:]))

    for i in range(1, self.n_layers):
      berts_outputs.append(self.bertLayers[i](hidden_states[-i-1] + berts_outputs[-1][0]))
      final_outputs.append(self.linears[i](berts_outputs[-1][0][:,0,:]))


    return tuple(final_outputs)

  def loss(self, output, labels, criterion):
    bloss = 0.0
    for i in range(self.n_layers):
      bloss += criterion(output[i], labels.long())
    return bloss

  def aggregate(self, output):
    agg = output[0]
    for i in range(1, len(output)):
      agg += output[i]
    return agg / len(output)

  def calcAcc(self, output, labels):
    return (self.aggregate(output).argmax(dim=1) == labels).sum().item()

  def evaluate(self, output, test_label, preds, golds):
    preds += self.aggregate(output).argmax(dim=1).to("cpu").tolist()
    golds += test_label.to("cpu").tolist()
    return preds, golds

  def evaluation_report(self, preds, golds, is_f1pn=False):
    print(classification_report(golds, preds, digits=4))
    if is_f1pn:
      r = classification_report(golds, preds, digits=4, output_dict=True)
      f1pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      print(f'F1PN: {f1pn:.4f}')

In [26]:
class PSUMTwoTasksClassifier(nn.Module):
  def __init__(self, model_name= "UBC-NLP/MARBERT",n_layers=4, n_classes_1=3, n_classes_2=3, max_length=128, separate_bert_layers_for_tasks=False, informed_by='', one_softmax_informing=False, detach=False):
    super(PSUMTwoTasksClassifier, self).__init__()
    self.config = AutoConfig.from_pretrained(model_name)

    self.bert = AutoModel.from_pretrained(model_name)

    self.bertLayers = nn.ModuleList()
    if separate_bert_layers_for_tasks:
      self.bertLayers2 = nn.ModuleList()
    self.linears_1 = nn.ModuleList()
    self.linears_2 = nn.ModuleList()
    self.n_layers = n_layers
    self.n_classes_1 = n_classes_1
    self.n_classes_2 = n_classes_2
    self.separate_bert_layers_for_tasks = separate_bert_layers_for_tasks
    self.all_informed = informed_by != ''
    self.informed_by = informed_by
    self.one_softmax_informing = one_softmax_informing
    self.detach = detach

    if self.all_informed:
      self.softmaxes_1 = nn.ModuleList()
      self.softmaxes_2 = nn.ModuleList()
      self.informed_linears_1 = nn.ModuleList()
      self.informed_linears_2 = nn.ModuleList()


    for i in range(n_layers):
      self.bertLayers.append(transformers.BertLayer(self.config))
      if separate_bert_layers_for_tasks:
        self.bertLayers2.append(transformers.BertLayer(self.config))
      self.linears_1.append(nn.Linear(768, n_classes_1))
      self.linears_2.append(nn.Linear(768, n_classes_2))

      if self.all_informed:
        self.softmaxes_1.append(nn.Softmax(dim=1))
        self.softmaxes_2.append(nn.Softmax(dim=1))
        self.informed_linears_1.append(nn.Linear(768 + n_classes_2, n_classes_1))
        self.informed_linears_2.append(nn.Linear(768 + n_classes_1, n_classes_2))


  def forward(self, input_id, mask):
    hidden_states = self.bert(input_ids= input_id, attention_mask=mask, return_dict=True, output_hidden_states=True)['hidden_states']
    final_outputs_1 = []
    final_outputs_2 = []

    if self.all_informed:
      informed_final_outputs_1 = []
      informed_final_outputs_2 = []

    for i in range(self.n_layers):
      final_outputs_1.append(self.linears_1[i](self.bertLayers[i](hidden_states[-i-1])[0][:,0,:]))
      if self.separate_bert_layers_for_tasks:
        final_outputs_2.append(self.linears_2[i](self.bertLayers2[i](hidden_states[-i-1])[0][:,0,:]))
      else:
        final_outputs_2.append(self.linears_2[i](self.bertLayers[i](hidden_states[-i-1])[0][:,0,:]))

      if self.all_informed and not self.one_softmax_informing:
        if self.detach:
          fo2i = torch.Tensor.detach(final_outputs_2[i])
          fo1i = torch.Tensor.detach(final_outputs_1[i])
        else:
          fo2i = final_outputs_2[i]
          fo1i = final_outputs_1[i]

        informed_final_outputs_1.append(self.informed_linears_1[i](torch.cat((self.bertLayers[i](hidden_states[-i-1])[0][:,0,:], self.softmaxes_2[i](fo2i)), dim=1)))
        informed_final_outputs_2.append(self.informed_linears_2[i](torch.cat((self.bertLayers[i](hidden_states[-i-1])[0][:,0,:], self.softmaxes_1[i](fo1i)), dim=1)))

    if self.all_informed and self.one_softmax_informing:
      sum1 = final_outputs_1[0]
      sum2 = final_outputs_2[0]
      for j in range(1, self.n_layers):
        sum1 += final_outputs_1[j]
        sum2 += final_outputs_2[j]
      sum1 /= self.n_layers
      sum2 /= self.n_layers

      o1 = self.softmaxes_1[0](sum1)
      o2 = self.softmaxes_2[0](sum2)

      for i in range(self.n_layers):
        informed_final_outputs_1.append(self.informed_linears_1[i](torch.cat((self.bertLayers[i](hidden_states[-i-1])[0][:,0,:], o2), dim=1)))
        informed_final_outputs_2.append(self.informed_linears_2[i](torch.cat((self.bertLayers[i](hidden_states[-i-1])[0][:,0,:], o1), dim=1)))

    if self.all_informed:
      return final_outputs_1, final_outputs_2, informed_final_outputs_1, informed_final_outputs_2

    return final_outputs_1, final_outputs_2

  def loss(self, output, labels, criterion):
    bloss = 0.0
    for i in range(self.n_layers):
      if self.informed_by in ['', 't1', 'all']:
        bloss += criterion(output[0][i], labels[:, 0].long())

      if self.informed_by in ['', 't2', 'all']:
        bloss += criterion(output[1][i], labels[:, 1].long())

      if self.informed_by in ['t2', 'all']:
        bloss += criterion(output[2][i], labels[:, 0].long())

      if self.informed_by in ['t1', 'all']:
        bloss += criterion(output[3][i], labels[:, 1].long())

    return bloss

  def aggregate(self, output):
    if self.informed_by == '':
      t1_idx = 0
      t2_idx = 1
    elif self.informed_by == 't1':
      t1_idx = 0
      t2_idx = 3
    elif self.informed_by == 't2':
      t1_idx = 2
      t2_idx = 1
    elif self.informed_by == 'all':
      t1_idx = 2
      t2_idx = 3

    agg1 = output[t1_idx][0]
    agg2 = output[t2_idx][0]
    for i in range(1, len(output[0])):
      agg1 += output[t1_idx][i]
      agg2 += output[t2_idx][i]
    return agg1 / len(output[t1_idx]), agg2 / len(output[t2_idx])

  def calcAcc(self, output, labels):
    return (self.aggregate(output)[0].argmax(dim=1) == labels[:, 0]).sum().item()

  def evaluate(self, output, test_label, preds, golds):
    if len(preds) == 0:
      preds = [[], []]
      golds = [[], []]

    agg1, agg2 = self.aggregate(output)
    preds[0] += agg1.argmax(dim=1).to("cpu").tolist()
    golds[0] += test_label[:, 0].to("cpu").tolist()
    preds[1] += agg2.argmax(dim=1).to("cpu").tolist()
    golds[1] += test_label[:, 1].to("cpu").tolist()

    return preds, golds

  def evaluation_report(self, preds, golds, is_f1pn=False):
    print("########## Task 1 Results ###############")
    print(classification_report(golds[0], preds[0], digits=4))
    if self.n_classes_1 == 3:
      r = classification_report(golds[0], preds[0], digits=4, output_dict=True)
      f1pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      print(f'F1PN: {f1pn:.4f}')

    print("\n\n########## Task 2 Results ###############")
    print(classification_report(golds[1], preds[1], digits=4))
    if self.n_classes_2 == 3:
      r = classification_report(golds[1], preds[1], digits=4, output_dict=True)
      f1pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      print(f'F1PN: {f1pn:.4f}')

# Train

In [37]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModel
import transformers
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import classification_report


def train(model, train_data, val_data, learning_rate, epochs, batch_size, model_path_save, task, informed_by):
  """
  model: the custom model class
  train_data: from the custom dataset class
  val_data: from the custom dataset class
  """

  train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
  val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  criterion = nn.CrossEntropyLoss()
  optimizer = Adam(model.parameters(), lr= learning_rate)

  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()
  val_loss_min = float("inf")
  my_metric = 0.0
  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      mask = train_input['attention_mask'].to(device)
      input_id = train_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      batch_loss = model.loss(output, train_label, criterion)
      total_loss_train += batch_loss.item()

      acc = model.calcAcc(output, train_label)
      total_acc_train += acc

      model.zero_grad()
      batch_loss.backward()
      optimizer.step()
      os.system('cls')


    total_acc_val = 0
    total_loss_val = 0
    preds = []
    golds = []

    with torch.no_grad():
      for val_input, val_label in val_dataloader:

        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = model.loss(output, val_label, criterion)
        total_loss_val += batch_loss.item()

        acc = model.calcAcc(output, val_label)
        total_acc_val += acc

        preds, golds = model.evaluate(output, val_label, preds, golds)


    if task == 'sentiment' or task == 'stance' or informed_by == 't2':
      if informed_by == 't2':
        golds, preds = golds[0], preds[0]
      r = classification_report(golds, preds, output_dict=True)
      f1_pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      printed_metric = f1_pn
      if f1_pn >= my_metric:
        my_metric = f1_pn
        torch.save(model, model_path_save)
      # if total_loss_val <= val_loss_min:
      #   val_loss_min = total_loss_val
      #   torch.save(model, model_path_save)
    elif task == 'sarcasm' or informed_by == 't1':
      if informed_by == 't1':
        golds, preds = golds[1], preds[1]
      r = classification_report(golds, preds, output_dict=True)
      printed_metric = r["1"]["f1-score"]
      if r["1"]["f1-score"] >= my_metric:
        my_metric = r["1"]["f1-score"]
        torch.save(model, model_path_save)
    elif task == "both" or task == 'all':
      r = classification_report(golds[0], preds[0], output_dict=True)
      f1_pn = (r["1"]["f1-score"] + r["2"]["f1-score"]) / 2.0
      printed_metric = f1_pn
      if f1_pn >= my_metric:
        my_metric = f1_pn
        torch.save(model, model_path_save)


    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data)} \
        | Train Accuracy: {total_acc_train / len(train_data)} \
        | Val Loss: {total_loss_val / len(val_data)} \
        | Val Accuracy: {total_acc_val / len(val_data)}')

    print(f"########### Target Metric: {printed_metric} ##############")

# Validate

In [77]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModel
import transformers
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import classification_report




def evaluate(model, test_data, batch_size=16, task='sentiment', show_eval=True):

  test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()

  preds = []
  golds = []
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      test_label = test_label.to(device)
      mask = test_input['attention_mask'].to(device)
      input_id = test_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)
      preds, golds = model.evaluate(output, test_label, preds, golds)

      os.system('cls')

  print()
  if show_eval:
    model.evaluation_report(preds, golds, task == 'sentiment' or task == 'stance' or task == 'both' or task == "all")
  return preds

# Running

In [27]:
task = 'both' # 'sentiment', 'sarcasm', 'both', 'all'
freeze = True
EPOCHS = 5
LR = 5e-4
batch_size = 32
model_path_save = "model.pth"
informed_by = '' # '', 't1', 't2', 'all'
n_classes = 3

In [49]:
model_name = "UBC-NLP/MARBERT"
# model_name = "aubmindlab/bert-base-arabertv02-twitter"

In [50]:
# model = PSUMClassifier(model_name,n_layers=4, n_classes=n_classes)
# model = HSUMClassifier(model_name,n_layers=4, n_classes=n_classes)
model = PSUMTwoTasksClassifier(model_name,n_layers=4, n_classes_1=3, n_classes_2=3)



In [51]:
if freeze:
  for param in model.bert.parameters():
    param.requires_grad = False

In [52]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
trainDataset, valDataset = MawqifDataset(train_df, tokenizer, model_name, task=task), MawqifDataset(val_df, tokenizer, model_name, task=task)

In [46]:
train(model, trainDataset, valDataset, LR, EPOCHS, batch_size, model_path_save, task, informed_by)

100%|██████████| 85/85 [00:48<00:00,  1.77it/s]


Epochs: 1 | Train Loss: 0.19455283928764563         | Train Accuracy: 0.7379718726868986         | Val Loss: 0.15324620405832926         | Val Accuracy: 0.8133333333333334
########### Target Metric: 0.8313492063492063 ##############


100%|██████████| 85/85 [00:47<00:00,  1.77it/s]


Epochs: 2 | Train Loss: 0.15026840922746015         | Train Accuracy: 0.7994078460399704         | Val Loss: 0.16226173957188925         | Val Accuracy: 0.79
########### Target Metric: 0.8170002007897731 ##############


100%|██████████| 85/85 [00:47<00:00,  1.77it/s]


Epochs: 3 | Train Loss: 0.129348949447197         | Train Accuracy: 0.8478904515173945         | Val Loss: 0.16632586399714153         | Val Accuracy: 0.8133333333333334
########### Target Metric: 0.8412667919919865 ##############


100%|██████████| 85/85 [00:48<00:00,  1.77it/s]


Epochs: 4 | Train Loss: 0.09725766385775156         | Train Accuracy: 0.9111769059955589         | Val Loss: 0.18954864343007405         | Val Accuracy: 0.8233333333333334
########### Target Metric: 0.8528825995807128 ##############


100%|██████████| 85/85 [00:48<00:00,  1.77it/s]


Epochs: 5 | Train Loss: 0.07236556045926121         | Train Accuracy: 0.957438934122872         | Val Loss: 0.1990172306696574         | Val Accuracy: 0.8266666666666667
########### Target Metric: 0.8431054199432452 ##############


In [47]:
testDataset = MawqifDataset(test_df, tokenizer, model_name, task=task)
loaded_model = torch.load(model_path_save)
loaded_model.eval()
model.eval()
evaluate(loaded_model, testDataset, batch_size, task)
print()
evaluate(model, testDataset, batch_size, task)

100%|██████████| 16/16 [00:05<00:00,  2.94it/s]



########## Task 1 Results ###############
              precision    recall  f1-score   support

           0     0.7647    0.2407    0.3662        54
           1     0.6011    0.8462    0.7029       130
           2     0.8767    0.8323    0.8539       316

    accuracy                         0.7720       500
   macro avg     0.7475    0.6397    0.6410       500
weighted avg     0.7929    0.7720    0.7620       500

F1PN: 0.7784


########## Task 2 Results ###############
              precision    recall  f1-score   support

           0     0.6972    0.4294    0.5315       177
           1     0.5333    0.7080    0.6084       113
           2     0.7303    0.8381    0.7805       210

    accuracy                         0.6640       500
   macro avg     0.6536    0.6585    0.6401       500
weighted avg     0.6741    0.6640    0.6534       500

F1PN: 0.6944



100%|██████████| 16/16 [00:05<00:00,  2.94it/s]


########## Task 1 Results ###############
              precision    recall  f1-score   support

           0     0.5366    0.4074    0.4632        54
           1     0.6531    0.7385    0.6931       130
           2     0.8622    0.8513    0.8567       316

    accuracy                         0.7740       500
   macro avg     0.6839    0.6657    0.6710       500
weighted avg     0.7726    0.7740    0.7717       500

F1PN: 0.7749


########## Task 2 Results ###############
              precision    recall  f1-score   support

           0     0.6358    0.5819    0.6077       177
           1     0.6147    0.5929    0.6036       113
           2     0.7598    0.8286    0.7927       210

    accuracy                         0.6880       500
   macro avg     0.6701    0.6678    0.6680       500
weighted avg     0.6831    0.6880    0.6845       500

F1PN: 0.6982





In [16]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [48]:
!cp "model.pth" "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/pth_models/arabertTwitter_PSUM_MTL_LR_5e-4_0.7784.pth"

In [40]:
torch.save(model, model_path_save)

# Load saved Model and Eval



In [75]:
from sklearn.metrics import f1_score, classification_report
def evaluate_official(test_df, y_pred):
    sum_f2_final = 0
    sum_f3_final = 0
    results = {}
    for target in test_df["target"].unique():
        target_indices = [i for i in range(len(test_df['target'].tolist())) if test_df['target'].tolist()[i] == target]
        filtered_test_labels = [test_df['stance_int'].tolist()[i] for i in target_indices]
        filtered_predictions = [y_pred[i] for i in target_indices]
        # print(classification_report(filtered_test_labels, filtered_predictions))
        f1_3class = f1_score(filtered_test_labels, filtered_predictions, average = None)
        sum_f2_final += (f1_3class[1] + f1_3class[2])/2
        sum_f3_final += sum(f1_3class)/3
        results[target] = {"F1_score_2class": (f1_3class[0] + f1_3class[1])/2, "F1_score_3class": sum(f1_3class)/3}
    results["All Targets"] = {"F1_score_2class": sum_f2_final/3, "F1_score_3class": sum_f3_final/3}
    return results

In [81]:
paths = [
    "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/pth_models/MARBERT_PSUM_MTL_LR_5e-4_0.7918.pth",
    "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/pth_models/arabertTwitter_PSUM_MTL_LR_5e-4_0.7784.pth",
    "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/pth_models/MARBERT_PSUM_LR_5e-4_0.7724.pth",
    "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/pth_models/arabertTwitter_PSUM_LR_5e-4_0.7793.pth"
]

tasks = [
    "both",
    "both",
    "stance",
    "stance"
]

model_names = [
    "UBC-NLP/MARBERT",
    "aubmindlab/bert-base-arabertv02-twitter",
    "UBC-NLP/MARBERT",
    "aubmindlab/bert-base-arabertv02-twitter"
]

stance_to_int = {
  "Against": 1,
  "Favor": 2,
  "Neutral": 0
}
int_to_stance = {value: key for key, value in stance_to_int.items()}

test_df['stance_int'] = test_df['stance'].map(stance_to_int)

for i in range(len(paths)):
  if i < 2:
    model = PSUMTwoTasksClassifier(model_names[i],n_layers=4, n_classes_1=3, n_classes_2=3)
  else:
    model = PSUMClassifier(model_names[i],n_layers=4, n_classes=3)
  model = torch.load(paths[i])
  tokenizer = AutoTokenizer.from_pretrained(model_names[i])
  testDataset = MawqifDataset(test_df, tokenizer, model_names[i], task=tasks[i])
  preds = evaluate(model, testDataset, batch_size, task, False)
  if i < 2:
    preds = preds[0]
  print(evaluate_official(test_df, preds))

100%|██████████| 16/16 [00:05<00:00,  2.93it/s]



{'Covid Vaccine': {'F1_score_2class': 0.625, 'F1_score_3class': 0.6718106995884773}, 'Digital Transformation': {'F1_score_2class': 0.5060606060606061, 'F1_score_3class': 0.6408384924513957}, 'Women empowerment': {'F1_score_2class': 0.44457013574660637, 'F1_score_3class': 0.5869783810960282}, 'All Targets': {'F1_score_2class': 0.7630616498158137, 'F1_score_3class': 0.6332091910453004}}


Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 16/16 [00:05<00:00,  2.82it/s]



{'Covid Vaccine': {'F1_score_2class': 0.5433047590895654, 'F1_score_3class': 0.5993396380507617}, 'Digital Transformation': {'F1_score_2class': 0.4817813765182186, 'F1_score_3class': 0.6281420207963184}, 'Women empowerment': {'F1_score_2class': 0.43766233766233764, 'F1_score_3class': 0.5774891774891775}, 'All Targets': {'F1_score_2class': 0.7462259913204666, 'F1_score_3class': 0.6016569454454191}}


100%|██████████| 16/16 [00:04<00:00,  3.54it/s]



{'Covid Vaccine': {'F1_score_2class': 0.6487274655355248, 'F1_score_3class': 0.6745231935841929}, 'Digital Transformation': {'F1_score_2class': 0.41050903119868637, 'F1_score_3class': 0.5787157378962952}, 'Women empowerment': {'F1_score_2class': 0.42840909090909096, 'F1_score_3class': 0.5702907452907454}, 'All Targets': {'F1_score_2class': 0.7345596494058272, 'F1_score_3class': 0.6078432255904112}}


Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 16/16 [00:04<00:00,  3.58it/s]



{'Covid Vaccine': {'F1_score_2class': 0.6757478632478633, 'F1_score_3class': 0.7121473210182888}, 'Digital Transformation': {'F1_score_2class': 0.34224598930481287, 'F1_score_3class': 0.5381873028931853}, 'Women empowerment': {'F1_score_2class': 0.5022321428571429, 'F1_score_3class': 0.6243837181337182}, 'All Targets': {'F1_score_2class': 0.7185485698590538, 'F1_score_3class': 0.6249061140150641}}


In [68]:
|

{'Covid Vaccine': {'F1_score_2class': 0.625, 'F1_score_3class': 0.6718106995884773}, 'Digital Transformation': {'F1_score_2class': 0.5060606060606061, 'F1_score_3class': 0.6408384924513957}, 'Women empowerment': {'F1_score_2class': 0.44457013574660637, 'F1_score_3class': 0.5869783810960282}, 'All Targets': {'F1_score_2class': 0.7630616498158137, 'F1_score_3class': 0.6332091910453004}}
