# Environment Setup

In [None]:
!pip install datasets
!pip install -U sentence-transformers
!pip install transformers
!pip install farasapy
!pip install arabert
# !pip install openai

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

# Load Dataset

In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# fetch the Mawqif stance-detection corpus
dataset = load_dataset("NoraAlt/Mawqif_Stance-Detection")

# build a DataFrame with one column per feature of the 'train' split
train_split = dataset['train']
df = pd.DataFrame({column: train_split[column] for column in train_split.features})
# rows whose stance is None are relabelled "Neutral"
df['stance'] = df['stance'].apply(lambda label: label if label is not None else "Neutral")

# hold out 500 rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(df, random_state=12345, test_size=500)

# report the size of each split
print(f"train length: {len(train_df)}")
print(f"test length: {len(test_df)}")

# Extract Embeddings for train and test

In [None]:
import gc
import os

import torch
from sentence_transformers import SentenceTransformer

# Sentence-Transformers checkpoints to embed the tweets with; each name is
# passed to SentenceTransformer(...) in the extraction loop below.
embedding_models = [
    "all-mpnet-base-v2",
    "multi-qa-mpnet-base-dot-v1",
    "paraphrase-multilingual-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-xlm-r-multilingual-v1",
    "quora-distilbert-multilingual",
    "stsb-xlm-r-multilingual",
    "use-cmlm-multilingual",
    "distiluse-base-multilingual-cased-v2",
    "distiluse-base-multilingual-cased-v1",
    "LaBSE"
]

# output directory for the "<model>_train.npy" / "<model>_test.npy" files
save_path = "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/embeddings"

# np.save does not create missing directories, so make sure the target exists
os.makedirs(save_path, exist_ok=True)

# the tweet texts are the same for every model; convert them to lists once
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()

for model_name in embedding_models:
  embedding_model = SentenceTransformer(model_name)

  # encode both splits with the current model
  train_embeddings = embedding_model.encode(train_texts)
  test_embeddings = embedding_model.encode(test_texts)

  # one file per (model, split)
  np.save(os.path.join(save_path, f"{model_name}_train.npy"), train_embeddings)
  np.save(os.path.join(save_path, f"{model_name}_test.npy"), test_embeddings)

  # release the model and arrays before loading the next checkpoint
  del embedding_model
  del train_embeddings
  del test_embeddings
  gc.collect()
  torch.cuda.empty_cache()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/709 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Some weights of the model checkpoint at sentence-transformers/use-cmlm-multilingual were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

In [None]:
# The extraction loop deletes its arrays after saving each model, so the names
# no longer exist here; reload the files written for the last model in
# `embedding_models` to inspect their shapes.
inspected_model = embedding_models[-1]
train_embeddings = np.load(os.path.join(save_path, f"{inspected_model}_train.npy"))
test_embeddings = np.load(os.path.join(save_path, f"{inspected_model}_test.npy"))

print(f"train embeddings shape: {train_embeddings.shape}")
print(f"test embeddings shape: {test_embeddings.shape}")

# Retrieve most similar tweets

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

def get_closest_tweets(train_df, test_df, train_embeddings, test_embeddings, k=5):
  """For every test tweet, return its most similar training tweets on the same target.

  Args:
    train_df: DataFrame with 'text', 'target' and 'stance' columns.
    test_df: DataFrame with a 'target' column; row i corresponds to test_embeddings[i].
    train_embeddings: array whose rows are aligned positionally with train_df.
    test_embeddings: array whose rows are aligned positionally with test_df.
    k: maximum number of neighbours returned per test tweet.

  Returns:
    A list with one entry per test row; each entry is a list of up to k dicts
    with keys "text", "target" and "stance", ordered from most to least similar
    by cosine similarity.
  """
  # Candidates depend only on the target, so build them once per distinct
  # target instead of re-filtering train_df for every test row.
  candidates_by_target = {}

  closest_tweets = []
  for i, target in enumerate(test_df['target'].tolist()):
    if target not in candidates_by_target:
      same_target = (train_df['target'] == target).to_numpy()
      subset = train_df[same_target]
      candidates_by_target[target] = (
          train_embeddings[same_target],
          subset['text'].tolist(),
          subset['target'].tolist(),
          subset['stance'].tolist(),
      )
    candidate_embeddings, texts, targets, stances = candidates_by_target[target]

    similarities = cosine_similarity([test_embeddings[i]], candidate_embeddings)[0]
    # argsort is ascending: reverse it, then keep the k best positions
    top_indices = np.argsort(similarities)[::-1][:k]
    closest_tweets.append([{"text": texts[idx], "target": targets[idx], "stance": stances[idx]} for idx in top_indices])
  return closest_tweets


In [None]:
# Read the train/test embedding matrices back from disk.
# NOTE(review): no visible cell writes "train_embeddings.npy" or
# "test_embeddings.npy"; the visible loops save "<model>_train.npy" under
# ".../embeddings". Confirm which model's embeddings these files hold.
save_path = "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data"
train_embeddings_file = os.path.join(save_path, "train_embeddings.npy")
test_embeddings_file = os.path.join(save_path, "test_embeddings.npy")
train_embeddings = np.load(train_embeddings_file)
test_embeddings = np.load(test_embeddings_file)

In [None]:
# for each test tweet, retrieve the 5 most similar training tweets that share its target
closest_tweets = get_closest_tweets(train_df, test_df, train_embeddings, test_embeddings, k=5)

# Get random tweets per topic

In [None]:
# distinct targets (topics) present in the training split
train_df['target'].unique()

array(['Covid Vaccine', 'Digital Transformation', 'Women empowerment'],
      dtype=object)

In [None]:
# Draw 5 seeded example tweets per topic to serve as fixed few-shot examples.
covid_shot_df, digital_shot_df, women_shot_df = (
    train_df[train_df['target'] == topic].sample(n=5, random_state=12345)
    for topic in ("Covid Vaccine", "Digital Transformation", "Women empowerment")
)

In [None]:
# Fixed few-shot pool per target; any target not listed here falls back to the
# "Women empowerment" pool.
shots_by_target = {
    "Covid Vaccine": covid_shot_df,
    "Digital Transformation": digital_shot_df,
}

closest_tweets_fixed = []
for target in test_df['target'].tolist():
  temp_df = shots_by_target.get(target, women_shot_df)
  shots = []
  for idx in range(5):
    row = temp_df.iloc[idx]
    shots.append({"text": row['text'], "target": row['target'], "stance": row['stance']})
  closest_tweets_fixed.append(shots)

# Construct Prompts

In [None]:
def construct_prompts(test_df, closest_tweets):
  """Build one few-shot stance-classification prompt per test tweet.

  Args:
    test_df: DataFrame with 'text' and 'target' columns; row i is the query
      for closest_tweets[i].
    closest_tweets: list with one entry per test row; each entry is a list of
      dicts with keys 'text', 'target' and 'stance' used as in-context examples.
      Every example in the entry is rendered, so any number of examples works.

  Returns:
    A list of prompt strings, one per entry of closest_tweets.
  """
  # The class list is spelled "Favor" to match the labels shown in the
  # examples; it previously read "Favour", a class that never appears.
  instruction = 'You are a text analyst whose target is to find the stance of an Arabic sentence towards a given topic. The stance output is one of three classes ["Favor", "Against", "Neutral"]. The output MUST be one word only from the given stance list.'

  prompts = []
  for i, examples in enumerate(closest_tweets):
    example_blocks = "".join(
        f"<example>\nSentence:\n{ex['text']}\nTopic: {ex['target']}\nOutput: {ex['stance']}\n</example>\n"
        for ex in examples
    )
    query = test_df.iloc[i]
    prompts.append(
        f"{instruction}\n\nExamples:\n{example_blocks}\n\n"
        f"Input Sentence:\n{query['text']}\nTopic: {query['target']}\nOutput:"
    )
  return prompts


## Embedding based few shot

In [None]:
# prompts whose in-context examples are the embedding-retrieved neighbours
test_prompts = construct_prompts(test_df, closest_tweets)

In [None]:
# save prompts as a JSON list of strings
import json

# ensure_ascii=False writes non-ASCII characters (the tweets are Arabic) as-is
# instead of \uXXXX escapes
with open(os.path.join(save_path, "test_prompts_embeddings.json"), 'w', encoding='utf8') as f:
  json.dump(test_prompts, f, ensure_ascii=False)

## Fixed few shot

In [None]:
# prompts whose in-context examples are the fixed per-target samples;
# this rebinds test_prompts, replacing the embedding-based prompts
test_prompts = construct_prompts(test_df, closest_tweets_fixed)

In [None]:
# save the fixed few-shot prompts as a JSON list of strings, keeping non-ASCII text unescaped
with open(os.path.join(save_path, "test_prompts_fixed.json"), 'w', encoding='utf8') as f:
  json.dump(test_prompts, f, ensure_ascii=False)

In [None]:
# preview a few prompts only; displaying the whole list floods the cell output
test_prompts[:3]

# Evaluation


In [None]:
from sklearn.metrics import classification_report

In [None]:
y_pred = ['Neutral', 'Neutral', 'Against', 'Against', 'Neutral', 'Favor', 'Favor', 'Against', 'Against', 'Favor', 'Neutral', 'Neutral', 'Favor', 'Favor', 'Neutral', 'Against', 'Against', 'Favor', 'Favor', 'Against', 'Against', 'Neutral', 'Favor', 'Neutral', 'Favor', 'Against', 'Favor', 'Neutral', 'Against', 'Favor', 'Neutral', 'Favor', 'Favor', 'Favor', 'Against', 'Favor', 'Against', 'Favor', 'Against', 'Against', 'Against', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Favor', 'Favor', 'Against', 'Against', 'Favor', 'Against', 'Against', 'Favor', 'Against', 'Against', 'Favor', 'Favor', 'Favor', 'Favor', 'Against', 'Against', 'Against', 'Against', 'Against', 'Favor', 'Against', 'Favor', 'Against', 'Against', 'Against', 'Favor', 'Neutral', 'Against', 'Neutral', 'Favor', 'Favor', 'Favor', 'Favor', 'Against', 'Against', 'Against', 'Against', 'Neutral', 'Neutral', 'Favor', 'Against', 'Favor', 'Against', 'Favor', 'Neutral', 'Favor', 'Against', 'Favor', 'Favor', 'Favor', 'Neutral', 'Neutral', 'Against', 'Favor', 'Favor']

# score exactly as many gold labels as there are predictions, so the two
# lists stay aligned if y_pred grows or shrinks
y_true = test_df['stance'].tolist()[:len(y_pred)]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

     Against       0.69      0.84      0.76        32
       Favor       0.95      0.61      0.75        62
     Neutral       0.14      0.50      0.22         6

    accuracy                           0.68       100
   macro avg       0.60      0.65      0.58       100
weighted avg       0.82      0.68      0.72       100



# Other models

## KNN

In [None]:
# stance label -> integer id, and the inverse lookup derived from it
label2id = dict(Neutral=0, Against=1, Favor=2)
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
from scipy import stats as st

def predict_KNN(closest_tweets, k=5):
  """Predict a stance per test tweet by majority vote over its retrieved neighbours.

  Args:
    closest_tweets: list with one entry per test tweet; each entry is a
      non-empty list of dicts that have a 'stance' key.
    k: kept for interface compatibility; the vote uses every neighbour present
      in each entry.

  Returns:
    A list of stance label strings, one per entry of closest_tweets.
  """
  from collections import Counter  # local import keeps this cell self-contained

  labels = []
  for tweets in closest_tweets:
    votes = Counter(label2id[t['stance']] for t in tweets)
    top_count = max(votes.values())
    # Ties go to the smallest label id, which is what scipy.stats.mode returns.
    # Counting by hand avoids depending on whether `.mode` is a scalar or an
    # array, which differs between SciPy versions.
    winner = min(label_id for label_id, count in votes.items() if count == top_count)
    labels.append(id2label[winner])
  return labels

In [None]:
# majority-vote baseline over the retrieved neighbours, scored on the first 100 test tweets
print(classification_report(test_df['stance'].tolist()[:100], predict_KNN(closest_tweets)[:100]))

# BERTs

## Dataset

In [None]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModel
import transformers
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
# from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import classification_report
from arabert.preprocess import ArabertPreprocessor
import gc
import re


class MawqifDataset(torch.utils.data.Dataset):
  """Torch dataset of pre-tokenized tweets paired with integer stance labels.

  Args:
    df: DataFrame with 'text', 'target' and 'stance' columns; every stance must
      be one of 'Neutral', 'Against' or 'Favor' (anything else raises KeyError).
    tokenizer: callable applied to one text at a time with padding='max_length',
      truncation=True and return_tensors="pt".
    model_name: checkpoint name; for the two AraBERT checkpoints listed in
      __init__ the texts are first run through ArabertPreprocessor.
    max_length: length every text is padded/truncated to (default 128).

  Each item is a (tokenizer_output, label) pair, where label is a numpy array
  wrapping the integer class id.
  """

  def __init__(self, df, tokenizer, model_name, max_length=128):
    self.labelsIds = {'Neutral': 0, 'Against': 1, 'Favor': 2}
    self.labels = [self.labelsIds[label] for label in df['stance']]
    self.targets = df['target'].tolist()
    self.max_length = max_length

    texts = df['text'].tolist()
    if model_name in ["aubmindlab/bert-base-arabertv02-twitter", "aubmindlab/bert-base-arabertv2"]:
      # these checkpoints get the AraBERT preprocessor applied to the raw text
      arabert_prep = ArabertPreprocessor(model_name=model_name)
      texts = [arabert_prep.preprocess(t) for t in texts]

    # tokenize one text at a time; items are stored exactly as the tokenizer returns them
    self.texts = [tokenizer(text, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt") for text in texts]

  def classes(self):
    """Return the list of integer class ids, one per row."""
    return self.labels

  def __len__(self):
    return len(self.texts)

  def get_batch_labels(self, idx):
    """Return the label(s) at idx wrapped in a numpy array."""
    return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
    """Return the stored tokenizer output at idx."""
    return self.texts[idx]

  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)
    return batch_texts, batch_y

## Models

In [None]:
class CLSClassifier(nn.Module):
  """Encoder wrapper that returns one [CLS]-based vector per input sequence.

  If the checkpoint ships pooler weights, the encoder's pooled output is
  returned. If the pooler had to be freshly initialised at load time (its
  weights are reported as missing from the checkpoint), the pooled output
  would be a random, untrained projection, so the raw hidden state of the
  first token is returned instead.

  Args:
    model_name: checkpoint name passed to AutoModel.from_pretrained.
  """

  def __init__(self, model_name= "UBC-NLP/MARBERT"):
    super().__init__()
    # output_loading_info=True also returns which weights were missing from the checkpoint
    self.bert, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
    missing_keys = loading_info.get("missing_keys", [])
    self.pooler_is_pretrained = not any("pooler" in key for key in missing_keys)

  def forward(self, input_id, mask):
    """Return a (batch, hidden) tensor of sequence embeddings."""
    last_hidden_state, pooled_output = self.bert(input_ids= input_id, attention_mask=mask, return_dict=False)
    if self.pooler_is_pretrained:
      return pooled_output
    # first position of every sequence, bypassing the untrained pooler
    return last_hidden_state[:, 0]

In [None]:
class AvgPoolerClassifier(nn.Module):
  """Encoder wrapper that returns the attention-masked mean of the token states.

  Args:
    model_name: checkpoint name passed to AutoModel.from_pretrained.
  """

  def __init__(self, model_name= "UBC-NLP/MARBERT"):
    super().__init__()
    self.bert = AutoModel.from_pretrained(model_name)

  def forward(self, input_id, mask):
    """Average the last hidden states over the positions where mask is 1.

    Args:
      input_id: (batch, seq_len) token ids.
      mask: attention mask holding batch * seq_len entries, e.g. shaped
        (batch, seq_len) or (batch, 1, seq_len).

    Returns:
      A (batch, hidden) tensor of mean-pooled embeddings.
    """
    last_hidden_state = self.bert(input_ids= input_id, attention_mask=mask, return_dict=False)[0]
    # Take the sizes from the encoder output rather than hard-coding
    # 128 tokens / 768 hidden units; broadcasting expands the last axis.
    token_mask = mask.reshape(last_hidden_state.shape[0], -1, 1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * token_mask).sum(dim=1)
    # clamp so a fully masked row cannot divide by zero
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return summed / token_counts

## Forward Pass

In [None]:
def getOutputEmbeddings(model, dataset, batch_size=1024):
  """Run `model` over `dataset` without gradients and stack the outputs.

  Args:
    model: module called as model(input_id, mask) that returns one tensor per
      batch. It is moved to the GPU when one is available and switched to
      eval mode; it is left in that state.
    dataset: dataset whose items are (encoding, label) pairs, where encoding
      has 'input_ids' and 'attention_mask' tensors. Labels are ignored.
    batch_size: number of items per forward pass.

  Returns:
    A numpy array with the batch outputs concatenated along axis 0.
  """
  dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  # inference only: make sure dropout and similar layers are disabled
  model.eval()

  embeddings = []
  with torch.no_grad():
    for encoded, _ in tqdm(dataloader):
      mask = encoded['attention_mask'].to(device)
      # each item carries a leading axis of size 1 from per-text tokenization; drop it for the ids
      input_id = encoded['input_ids'].squeeze(1).to(device)
      output = model(input_id, mask)
      embeddings.append(output.detach().to('cpu').numpy())
  return np.concatenate(embeddings, axis=0)

## Get embeddings

In [None]:
# Checkpoints to extract embeddings from; commented-out entries are skipped.
model_names = [
    # "UBC-NLP/MARBERT",
    # "qarib/bert-base-qarib",
    # "UBC-NLP/ARBERT",
    # "asafaya/bert-base-arabic",
    "aubmindlab/bert-base-arabertv02-twitter",
    "aubmindlab/bert-base-arabertv2"
]

# 'cls' selects CLSClassifier in the loop below; any other value selects AvgPoolerClassifier
pooling_types = ['cls', 'avg']

# output directory for the "<model>_<pooling>_train.npy" / "..._test.npy" files
save_path = "/content/drive/MyDrive/PhD/CU/Courses/NLP/Project_data/embeddings"

In [None]:
# np.save does not create missing directories, so make sure the target exists
os.makedirs(save_path, exist_ok=True)

for model_name in model_names:
  # Tokenization depends only on the checkpoint, not on the pooling type,
  # so build the tokenizer and both datasets once per model.
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  trainDataset = MawqifDataset(train_df, tokenizer, model_name)
  testDataset = MawqifDataset(test_df, tokenizer, model_name)

  # '/' in the checkpoint name would be read as a sub-directory in the file path
  file_stem = model_name.replace('/', '_')

  for pooling_type in pooling_types:
    # prepare model
    if pooling_type == 'cls':
      model = CLSClassifier(model_name)
    else:
      model = AvgPoolerClassifier(model_name)

    # get embeddings for both splits with the same model instance
    train_embeddings = getOutputEmbeddings(model, trainDataset)
    test_embeddings = getOutputEmbeddings(model, testDataset)

    # save embeddings
    np.save(os.path.join(save_path, f"{file_stem}_{pooling_type}_train.npy"), train_embeddings)
    np.save(os.path.join(save_path, f"{file_stem}_{pooling_type}_test.npy"), test_embeddings)

    # free the model and arrays before the next pooling type
    del model
    del train_embeddings
    del test_embeddings
    gc.collect()
    torch.cuda.empty_cache()

  # free the per-model tokenization before the next checkpoint
  del tokenizer
  del trainDataset
  del testDataset
  gc.collect()

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3/3 [00:19<00:00,  6.57s/it]
100%|██████████| 1/1 [00:03<00:00,  3.38s/it]
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3/3 [00:19<00:00,  6.51s/it]
100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



100%|██████████| 241M/241M [04:16<00:00, 940kiB/s]




model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

100%|██████████| 3/3 [00:20<00:00,  6.72s/it]
100%|██████████| 1/1 [00:03<00:00,  3.35s/it]
100%|██████████| 3/3 [00:19<00:00,  6.42s/it]
100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
