### define drive & paths

In [None]:
# Mount Google Drive at /content/drive; every project path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# All project artefacts live under one Drive folder; sub-folders are derived from it.
ROOT_PATH = "/content/drive/MyDrive/AraLlamaProject/"
PRETRAIND_MODEL = ROOT_PATH + "Models_1/"            # pre-trained checkpoints
TOKENIZERS = ROOT_PATH + "tokenizers_1/"             # tokenizer JSON files
FINTUNED_MODEL = ROOT_PATH + "Fine-tuning-models/"   # fine-tuned models and their logs
DATASETS_PATH = ROOT_PATH + "Datasets/"              # datasets saved to disk, one folder per task

# Experiment grid: task x tokenization method x vocabulary size x Farasa variant.
tasks = [
    "Arabic-Natural-Language-Inference",
    "Arabic-Hate-Speech",
    "Arabic-News-Articles",
    "Arabic-Sentiment-Analysis",
]
vocab_sizes = [16000, 28000, 44000]
token_methods = ["BPE", "WPC", "WLV"]
is_farasa = ["with-farasa", "nofarasa"]

# Fine-tuning arguments as (num_epochs, batch_size), looked up by task position.
# NOTE(review): there is one more entry here than there are tasks.
ARGS = [(3, 50), (5, 50), (10, 50), (10, 50), (10, 50)]

### load libraries

In [None]:
!pip install --quiet datasets sentencepiece argparse &> /dev/null
!pip install --quiet git+https://github.com/huggingface/transformers@v4.28.1 &> /dev/null
!pip install --quiet evaluate &> /dev/null

In [None]:
from datasets import load_dataset
import tempfile
from tqdm import tqdm
import io
import pandas as pd
import os
from google.colab import drive
import os
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, Trainer, PreTrainedTokenizerFast
from transformers import Trainer, TrainingArguments, pipeline
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
import os
import json
from datasets import load_dataset,load_from_disk
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from datasets import DatasetDict
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### helper functions

In [None]:
def load_datasets(task):
  """Load the dataset saved on disk for ``task``.

  The dataset is read from ``<DATASETS_PATH><task>/hugging-face`` with
  ``load_from_disk`` and returned unchanged.
  """
  dataset_dir = f"{DATASETS_PATH}{task}/hugging-face"
  return load_from_disk(dataset_dir)

In [None]:
def load_model_tokenizer(model_path, token_path,num_label):
  """Create the model, tokenizer and padding collator for one fine-tuning run.

  Args:
    model_path: location of the pre-trained checkpoint.
    token_path: path of the tokenizer JSON file.
    num_label: number of output labels for the classification head.

  Returns:
    A ``(model, tokenizer, data_collator)`` tuple.
  """
  # Wrap the raw `tokenizers` object so it can be used through the transformers API.
  fast_tokenizer = PreTrainedTokenizerFast(
      tokenizer_object=Tokenizer.from_file(token_path),
      pad_token="<pad>",
  )
  classifier = AutoModelForSequenceClassification.from_pretrained(
      model_path, num_labels=num_label
  )
  collator = DataCollatorWithPadding(tokenizer=fast_tokenizer)
  return classifier, fast_tokenizer, collator

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Sequences are padded to, and truncated at, 512 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)
def tokenize_function_2(examples):
  """Tokenize ``premise``/``hypothesis`` pairs with the module-level ``tokenizer``.

  Each pair is encoded together, padded to and truncated at 512 tokens.
  """
  premises = examples["premise"]
  hypotheses = examples["hypothesis"]
  return tokenizer(
      premises,
      hypotheses,
      truncation=True,
      padding="max_length",
      max_length=512,
  )
# ==============================================================================
def tokenization(dataset,task):
  """Tokenize ``dataset`` for ``task`` and shape it for training.

  The natural-language-inference task is encoded as premise/hypothesis pairs
  with ``tokenize_function_2``; every other task is encoded from its single
  ``text`` column with ``tokenize_function``. The raw text columns are then
  dropped and ``label`` is renamed to ``labels``.

  Args:
    dataset: dataset exposing ``map``, ``remove_columns``, ``rename_column``
      and ``with_format``.
    task: task name; selects which text columns are encoded.

  Returns:
    The tokenized dataset, formatted to return PyTorch tensors.
  """
  if task == "Arabic-Natural-Language-Inference":
    encode, text_columns = tokenize_function_2, ["premise", "hypothesis"]
  else:
    encode, text_columns = tokenize_function, ["text"]

  tokenized_dataset = dataset.map(encode, batched=True)
  tokenized_dataset = tokenized_dataset.remove_columns(text_columns)
  tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
  # with_format returns a new dataset instead of changing this one in place,
  # so its result has to be the value returned.
  return tokenized_dataset.with_format('pt')

In [None]:
def split_dataset(dataset, seed=None):
  """Split ``dataset`` into 80% train, 10% validation and 10% test.

  Args:
    dataset: object exposing ``train_test_split``.
    seed: optional seed forwarded to both splits. Passing the same value
      gives the same partition on every call, so different model
      configurations can be compared on identical data. ``None`` (the
      default) keeps the previous unseeded behaviour.

  Returns:
    A ``DatasetDict`` with the keys ``train``, ``test`` and ``valid``.
  """
  # First carve off 20% of the data, then halve that hold-out into valid/test.
  train_and_holdout = dataset.train_test_split(test_size=0.2, seed=seed)
  holdout = train_and_holdout['test'].train_test_split(test_size=0.5, seed=seed)
  return DatasetDict({
    'train': train_and_holdout['train'],
    'test': holdout['test'],
    'valid': holdout['train']})

In [None]:
def compute_metrics(pred):
  """Compute accuracy and weighted precision/recall/F1 for a prediction object.

  The predicted class is the argmax of ``pred.predictions`` along the last
  axis; the references are ``pred.label_ids``.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  # Result tuple is (precision, recall, f1, support).
  scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
  return {
      'accuracy': accuracy_score(y_true, y_pred),
      'f1': scores[2],
      'precision': scores[0],
      'recall': scores[1],
  }

In [None]:
def load_args(num_epochs , Batch_size):
  """Build the ``TrainingArguments`` for one fine-tuning run.

  Args:
    num_epochs: number of training epochs.
    Batch_size: per-device training batch size.

  Returns:
    ``TrainingArguments`` writing to ``./test_trainer`` with fp16 enabled and
    the fixed optimizer settings below.
  """
  optimizer_settings = dict(
      learning_rate=1e-5,
      adam_beta1=0.90,
      adam_beta2=0.98,
      adam_epsilon=1e-6,
  )
  return TrainingArguments(
      output_dir="./test_trainer",
      num_train_epochs=num_epochs,
      per_device_train_batch_size=Batch_size,
      gradient_accumulation_steps=1,
      fp16=True,
      **optimizer_settings,
  )

In [None]:
def load_trainer(model ,training_args ,data_collator , dataset ):
  """Assemble a ``Trainer`` from the model, arguments, collator and data splits.

  ``dataset['train']`` is used for training and ``dataset['valid']`` for
  evaluation; metrics come from ``compute_metrics``.
  """
  train_split = dataset['train']
  valid_split = dataset['valid']
  return Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_split,
      eval_dataset=valid_split,
      compute_metrics=compute_metrics,
  )

In [None]:
def predict(dataset,trainer):
  """Return the predicted class index for every row of ``dataset['test']``.

  The ``labels`` column is removed before calling ``trainer.predict``; the
  class is the argmax of the predictions along axis 1.
  """
  unlabeled_test = dataset['test'].remove_columns(['labels'])
  output = trainer.predict(unlabeled_test)
  return np.argmax(output.predictions, axis=1)

In [None]:
def save_fintuned_model(task,trainer,vocab,tok,farasa):
  """Save the trainer's model under the fine-tuned models folder and report where.

  Target directory: ``<FINTUNED_MODEL><task>/xlm-roberta-<farasa>-<vocab>/<tok>``.
  """
  target_dir = FINTUNED_MODEL + f"{task}/xlm-roberta-{farasa}-{vocab}/{tok}"
  trainer.save_model(target_dir)
  print(f"the model saved in {target_dir}")

In [None]:
def save_logs(task,trainer,vocab,tok,farasa):
  """Write the run settings, training log and final evaluation to an Excel file.

  The file is ``<FINTUNED_MODEL><task>/xlm-roberta-<farasa>-<vocab>/<tok>_paramaters.xlsx``.
  The training log is captured before ``trainer.evaluate()`` is called.

  Args:
    task: task name (first path component).
    trainer: trained ``Trainer`` whose ``state.log_history`` and evaluation
      results are exported.
    vocab: vocabulary size of the run.
    tok: tokenization method of the run.
    farasa: Farasa variant of the run.
  """
  run_dir = FINTUNED_MODEL + "{}/xlm-roberta-{}-{}/".format(task, farasa, vocab)
  log_path = run_dir + "{}_paramaters.xlsx".format(tok)

  run_info = pd.DataFrame({"vocab size": [vocab], "tokenization": [tok], "farasa": [farasa]})
  history = pd.DataFrame(trainer.state.log_history)
  final_eval = pd.DataFrame(trainer.evaluate(), index=["x"])
  report = pd.concat([run_info, history, final_eval], ignore_index=True, sort=False)

  # Do not rely on the model having been saved first to create the folder.
  os.makedirs(run_dir, exist_ok=True)
  report.to_excel(log_path)
  # Report the file that was actually written, not the model directory.
  print("the logs saved in {}".format(log_path))

### fine-tuning loop

In [None]:
# Fine-tune every pre-trained checkpoint on every task. Runs whose fine-tuned
# model already exists, or whose pre-trained checkpoint is missing, are skipped.
for farasa in is_farasa:
  # i is the task position; it selects that task's (num_epochs, batch_size) in ARGS.
  for i, task in enumerate(tasks):
    dataset = load_datasets(task)
    num_label = len(dataset.features['label'].names)
    for tok in token_methods:
      for vocab in vocab_sizes:
        finetune_model = FINTUNED_MODEL + f"{task}/xlm-roberta-{farasa}-{vocab}/{tok}"
        if os.path.exists(finetune_model):
          print("FINETUNED-Model", finetune_model, " already exists. Skipping...")
          print("------------------------------------------------")
          continue

        model_path = PRETRAIND_MODEL + f"xlm-roberta-{farasa}-{vocab}/{tok}"
        token_path = TOKENIZERS + f"xlm-roberta-{tok}-tokenizer-{vocab}-{farasa}.json"
        if not os.path.exists(model_path):
          print(f"Model using {tok} NOT exists. Skipping...")
          print("------------------------------------------------")
          continue

        print(model_path , token_path)
        # `tokenizer` has to be a module-level name: the tokenize functions read it.
        model, tokenizer, data_collator = load_model_tokenizer(model_path, token_path, num_label)
        tokenized_dataset = tokenization(dataset, task)
        train_test_eval = split_dataset(tokenized_dataset)
        args = load_args(*ARGS[i])
        trainer = load_trainer(model, args, data_collator, train_test_eval)
        trainer.train()
        save_fintuned_model(task, trainer, vocab, tok, farasa)
        save_logs(task, trainer, vocab, tok, farasa)
        print("================================================================")

In [None]:
# Run Python garbage collection, then ask PyTorch to empty its CUDA cache.
import torch, gc
gc.collect()
torch.cuda.empty_cache()

# push to hub

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [None]:
# Configuration of the single checkpoint handled by the following cells.
farasa, vocab, tok = "nofarasa", 16000, "BPE"

In [None]:
# Locations of the selected pre-trained checkpoint and its tokenizer file.
model_path = PRETRAIND_MODEL + f"xlm-roberta-{farasa}-{vocab}/{tok}"
token_path = TOKENIZERS + f"xlm-roberta-{tok}-tokenizer-{vocab}-{farasa}.json"

In [None]:
# ===================================================
tokenizer = Tokenizer.from_file(token_path)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer,pad_token = "<pad>")
# ===================================================
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Push the model and tokenizer to Hugging Face
model.push_to_hub("nourmorsy/arabic-llm-tokenizers_1")
tokenizer.push_to_hub("nourmorsy/arabic-llm-tokenizers_1")

pytorch_model.bin:   0%|          | 0.00/136M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nourmorsy/arabic-llm-tokenizers_1/commit/e6d6df891f2544c12165469e91e4de5146689363', commit_message='Upload tokenizer', commit_description='', oid='e6d6df891f2544c12165469e91e4de5146689363', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import os

# Destination of the model summary report.
output_file_path = "/content/output.txt"  # Replace with your desired file path

# Write one summary section per pre-trained checkpoint found on disk;
# configurations without a checkpoint are skipped silently.
with open(output_file_path, "w") as output_file:
    for farasa in is_farasa:
        for tok in token_methods:
            for vocab in vocab_sizes:
                model_path = PRETRAIND_MODEL + f"xlm-roberta-{farasa}-{vocab}/{tok}"
                token_path = TOKENIZERS + f"xlm-roberta-{tok}-tokenizer-{vocab}-{farasa}.json"
                if os.path.exists(model_path):
                    model = AutoModelForSequenceClassification.from_pretrained(model_path)
                    config = model.config
                    output_file.writelines([
                        "-------------------------------start-------------------------------------\n",
                        f"xlm-roberta-{farasa}-{vocab}/{tok}\n",
                        "--------------------------------------------------------------------\n",
                        f"Layers: {config.num_hidden_layers}\n",
                        f"Heads: {config.num_attention_heads}\n",
                        f"Max Length: {config.max_position_embeddings}\n",
                        f"No of parameters: {model.num_parameters()}\n",
                        "Hardware: A100 GPU\n",
                        "Train data: 9.60GB\n",
                        f"vocab_size: {vocab}\n",
                        f"farsa: {farasa}\n",
                        "--------------------------------------------------------------------\n",
                        "--------------------------------end------------------------------------\n",
                    ])

# Tell the user where the report went.
print("Output written to:", output_file_path)


In [None]:
# Print a short architecture summary for every pre-trained checkpoint on disk.
# Nothing is uploaded here; this cell only inspects the checkpoints.
for farasa in is_farasa:
  for tok in token_methods:
    for vocab in vocab_sizes:
      model_path = PRETRAIND_MODEL + f"xlm-roberta-{farasa}-{vocab}/{tok}"
      token_path = TOKENIZERS + f"xlm-roberta-{tok}-tokenizer-{vocab}-{farasa}.json"
      if not os.path.exists(model_path):
        print(f"Model using {tok} NOT exists. Skipping...")
        print("------------------------------------------------")
        continue

      model = AutoModelForSequenceClassification.from_pretrained(model_path)
      config = model.config

      print("-------------------------------start-------------------------------------")
      print(model_path)
      print("--------------------------------------------------------------------")
      print(f"Layers: {config.num_hidden_layers}")
      print(f"Heads: {config.num_attention_heads}")
      print(f"Max Length: {config.max_position_embeddings}")
      print(f"No of parameters:  {model.num_parameters()}")
      print("--------------------------------------------------------------------")
      print("--------------------------------end------------------------------------")

In [None]:
# Upload every fine-tuned model found on disk, together with its tokenizer,
# to a Hugging Face repository named after the task and configuration.
for farasa in is_farasa:
  for task in tasks:
    for tok in token_methods:
      for vocab in vocab_sizes:
        finetune_model = FINTUNED_MODEL + f"{task}/xlm-roberta-{farasa}-{vocab}/{tok}"
        token_path = TOKENIZERS + f"xlm-roberta-{tok}-tokenizer-{vocab}-{farasa}.json"
        if not os.path.exists(finetune_model):
          print(f"Model using {tok} NOT exists. Skipping...")
          print("------------------------------------------------")
          continue

        tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=Tokenizer.from_file(token_path),
            pad_token="<pad>",
        )
        model = AutoModelForSequenceClassification.from_pretrained(finetune_model)

        # Repository names use a capitalised tag for the Farasa variant.
        name = "Farasa" if farasa == "with-farasa" else "NoFarasa"
        repo_id = f"nourmorsy/PermoBERT-{task}-{name}-{tok}-{vocab}Token"
        model.push_to_hub(repo_id)
        tokenizer.push_to_hub(repo_id)

In [None]:
# Load the public "asafaya/bert-medium-arabic" checkpoint and print its parameter count.
model = AutoModelForCausalLM.from_pretrained("asafaya/bert-medium-arabic")
print('No of parameters: ', model.num_parameters())
# NOTE(review): the checkpoint is loaded through the causal-LM auto class only to
# inspect it; confirm this is the intended class if the model is used further.

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


No of parameters:  42162944


In [None]:
# Keep the loaded model's configuration for the inspection cells that follow.
config = model.config

In [None]:
# Show the full configuration of the loaded model.
print(config)

BertConfig {
  "_name_or_path": "asafaya/bert-medium-arabic",
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



In [None]:
# Headline architecture numbers taken from the model configuration.
print(f"Layers: {config.num_hidden_layers}")
print(f"Heads: {config.num_attention_heads}")
print(f"Max Length: {config.max_position_embeddings}")

Layers: 8
Heads: 8
Max Length: 512


# Calculate the ratio of unknown to known words

In [None]:
!pip install farasapy
!pip install arabert
from arabert import ArabertPreprocessor

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14
Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyArabic (from arabert)
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl 

In [None]:
# AraBERT text preprocessor with Farasa segmentation switched off.
model_name = "bert-base-arabert"
arabert_prep_0 = ArabertPreprocessor(
    model_name=model_name,
    apply_farasa_segmentation=False,
    # Clean-up options.
    keep_emojis=False,
    remove_html_markup=True,
    replace_urls_emails_mentions=True,
    strip_tashkeel=True,
    strip_tatweel=True,
    insert_white_spaces=True,
    remove_non_digit_repetition=True,
    map_hindi_numbers_to_arabic=True,
    # Left as None, i.e. no explicit choice is made here.
    replace_slash_with_dash=None,
)



In [None]:
def preprocessing(data):
  """Run every item of ``data`` through ``arabert_prep_0.preprocess`` in place.

  The items are overwritten inside ``data`` itself, and that same object is
  returned.
  """
  for position, raw_text in enumerate(data):
    data[position] = arabert_prep_0.preprocess(raw_text)
  return data

In [None]:
from collections import Counter
import os

# For every task and every tokenizer found on disk, count how many unique
# (preprocessed) words of the task's dataset are tokenized with an "<unk>"
# token, and write two reports: the counts/ratio and the unknown words.
# NOTE(review): output_file_path is the same path the model summary cell
# writes to, so that earlier report is overwritten here.
output_file_path = "/content/output.txt"  # Replace with your desired file path
ratio_unknow_to_know_path = "/content/stats.txt"

# The reports contain Arabic words, so the encoding is fixed to UTF-8 rather
# than left to the platform default.
with open(ratio_unknow_to_know_path, "w", encoding="utf-8") as ratio_unknow_to_know, \
     open(output_file_path, "w", encoding="utf-8") as output_file:
  for farasa in is_farasa:
    for task in tasks:
      dataset = load_datasets(task)
      output_file.write(f'LOADING DATASET :{task}\n')
      ratio_unknow_to_know.write(f'LOADING DATASET :{task}\n')

      # Gather the text of every column except the label, then clean it.
      text_columns = [column for column in dataset.column_names if column != 'label']
      all_text = [str(text) for column in text_columns for text in dataset[column]]
      all_text = preprocessing(all_text)
      unique_words = set(word for text in all_text for word in text.split())
      # Sorted so the reported unknown-word list is the same on every run
      # (set iteration order of strings changes between interpreter runs).
      unique_words_list = sorted(unique_words)
      word_counts = Counter(word for text in all_text for word in text.split())

      for tok in token_methods:
        for vocab in vocab_sizes:
          token_path = TOKENIZERS + "xlm-roberta-{}-tokenizer-{}-{}.json".format(tok, vocab, farasa)
          if not os.path.exists(token_path):
            # No tokenizer for this configuration: nothing to report.
            continue

          tokenizer = Tokenizer.from_file(token_path)
          tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, pad_token="<pad>")

          # A word is "unknown" when any of its tokens is the "<unk>" token.
          unknow_list = [
              word for word in unique_words_list
              if "<unk>" in tokenizer.tokenize(word)
          ]
          unknown_token_count = len(unknow_list)
          know_token_count = len(unique_words_list) - unknown_token_count
          # Guard the division: with no known word the ratio is undefined.
          if know_token_count:
            ratio = unknown_token_count / know_token_count
          else:
            ratio = "n/a"

          ratio_unknow_to_know.write("================================================\n")
          ratio_unknow_to_know.write(f"token:{tok} , vocab:{vocab} , farasa:{farasa}\n")
          ratio_unknow_to_know.write(f"unknown_token_count:{unknown_token_count}\nknow_token_count:{know_token_count}\n")
          ratio_unknow_to_know.write(f"ratio unknow to know tokens:{ratio}\n")

          output_file.write(f"token:{tok} , vocab:{vocab} , farasa:{farasa}\n")
          output_file.write(f"unknow_list:{unknow_list}\n")
          output_file.write("===================================================================\n")
