In [None]:
!pip install --upgrade datasets
!pip install --upgrade evaluate
!pip install accelerate -U
!pip install --upgrade transformers[torch]

In [None]:
from transformers import AutoModel , AutoTokenizer
from transformers import AutoModelForSequenceClassification
import pandas as pd

In [None]:
import torch

class DatasetPrep(torch.utils.data.Dataset):
    """Torch dataset that tokenizes all texts up front and serves them as tensors.

    Args:
        texts: Batch of input strings, passed to the tokenizer in one call.
        labels: Per-example labels aligned with ``texts``, or ``None`` for
            unlabeled data.
        tokenizer: Callable used to encode ``texts``. When omitted, the
            module-level ``tokenizer`` variable is used, so existing
            two-argument calls keep working.
        max_length: Truncation length forwarded to the tokenizer (default 50).

    Raises:
        NameError: If no tokenizer is passed and no module-level ``tokenizer``
            is defined.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=50):
        if tokenizer is None:
            # Backward-compatible fallback to the notebook-global tokenizer.
            tokenizer = globals().get("tokenizer")
            if tokenizer is None:
                raise NameError(
                    "name 'tokenizer' is not defined: pass tokenizer=... or "
                    "define a module-level `tokenizer` before building DatasetPrep"
                )
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded example at ``idx`` as a dict of tensors.

        A ``"label"`` entry is added only when a label exists for ``idx``.
        """
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        if self.labels is not None and idx < len(self.labels):
            label = self.labels[idx]
            if label is not None:
                item["label"] = torch.tensor(label)
        return item

    def __len__(self):
        """Number of encoded examples."""
        return len(self.encodings["input_ids"])
    
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with classes on axis 1; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    # `pos_label` only applies to average='binary'; with weighted averaging it
    # is ignored and just triggers a warning, so it is not passed.
    # zero_division=0 keeps the 0.0 result for classes that are never
    # predicted without emitting a warning on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred, **weighted),
        "recall": recall_score(y_true=labels, y_pred=pred, **weighted),
        "f1": f1_score(y_true=labels, y_pred=pred, **weighted),
    }

In [None]:
def generate_external_datasets():
    """Load the external sentiment corpora and stack them into one DataFrame.

    Sources, in concatenation order: the Arabic 100k reviews TSV, the company
    reviews CSV (ratings 1 / -1 only), the AJGT Excel sheet, and the
    ``asas-ai/Arabic_Sentiment_Twitter_Corpus`` Hub dataset (test then train).

    Returns:
        pandas.DataFrame with ``text`` and ``label`` columns; the original row
        indices of each source are kept.
    """
    from datasets import load_dataset

    # Used as-is. NOTE(review): assumes this file already exposes `text` and
    # `label` columns, since nothing here renames them — confirm.
    revs = pd.read_csv("/kaggle/input/arabic-100k-reviews/ar_reviews_100k.tsv", delimiter="\t")

    company_rates = pd.read_csv(
        "/kaggle/input/d/fahdseddik/arabic-company-reviews/CompanyReviews.csv"
    )[["review_description", "rating"]].rename(
        columns={"review_description": "text", "rating": "label"}
    )
    # Keep only the clearly positive / negative ratings and name them.
    company_rates = company_rates[company_rates["label"].isin([1, -1])]
    company_rates = company_rates.assign(
        label=company_rates["label"].map({1: "Positive", -1: "Negative"})
    )

    twitter = pd.read_excel("/kaggle/input/arabictwittercorpusajgtmaster/AJGT.xlsx").drop("ID", axis=1)
    # NOTE(review): positional rename — assumes the two remaining columns are
    # ordered (text, label). Verify against the sheet's header.
    twitter.columns = ["text", "label"]

    # Load the Hub corpus once and reuse it for both splits.
    corpus = load_dataset("asas-ai/Arabic_Sentiment_Twitter_Corpus")
    twitter_dataset = pd.concat((pd.DataFrame(corpus["test"]), pd.DataFrame(corpus["train"])))
    # NOTE(review): positional rename — assumes the column order is (label, text).
    twitter_dataset.columns = ["label", "text"]
    twitter_dataset = twitter_dataset.assign(
        label=twitter_dataset["label"].map({"neg": "Negative", "pos": "Positive"})
    )

    return pd.concat((revs, company_rates, twitter, twitter_dataset))

In [None]:
def prepare_dataset(random_state=42):
    """Build the training and evaluation datasets plus the held-out test frames.

    The in-house CSV is cleaned and split 80/20 into train / eval, then the
    train part is split 80/20 again into train / left-out test. Labels are
    mapped to contiguous integer ids taken from the training split. The
    external corpora are split 80/20 as well; their training part is mapped
    with the same label ids and merged into the training data.

    Relies on ``post_process_summary``, ``generate_external_datasets`` and
    ``DatasetPrep`` being defined before this is called.
    NOTE(review): ``post_process_summary`` is not defined in the visible cells.

    Args:
        random_state: Seed for every split and shuffle (default 42, the value
            the splits have always used).

    Returns:
        Tuple ``(train_dataset, test_dataset, left_out_test_set, reversedDict,
        left_out_test_set_for_online)``: two ``DatasetPrep`` objects, the
        in-house held-out frame (original labels), the ``{id: label}`` mapping,
        and the external held-out frame (original labels).
    """
    from sklearn.model_selection import train_test_split

    dataset = pd.read_csv("/kaggle/input/semngr/sentiment task - Sheet1.csv").dropna()
    dataset.columns = ['text', 'label']
    # .copy() so the column assignment below writes to an independent frame.
    dataset = dataset[~dataset.text.str.contains("ارين")].copy()
    dataset['text'] = dataset['text'].apply(post_process_summary)

    dataset, eval_dataset = train_test_split(
        dataset, test_size=0.2, stratify=dataset['label'], random_state=random_state
    )
    dataset, left_out_test_set = train_test_split(
        dataset, test_size=0.2, stratify=dataset['label'], random_state=random_state
    )

    # Contiguous label ids, in order of first appearance in the training split.
    contiguous_mapping = {label: idx for idx, label in enumerate(dataset['label'].unique())}
    reversedDict = {idx: label for label, idx in contiguous_mapping.items()}

    dataset = dataset.assign(label=dataset['label'].map(contiguous_mapping)).sample(
        frac=1, random_state=random_state
    )
    eval_dataset = eval_dataset.assign(label=eval_dataset['label'].map(contiguous_mapping)).sample(
        frac=1, random_state=random_state
    )

    dataset2 = generate_external_datasets()
    dataset2, left_out_test_set_for_online = train_test_split(
        dataset2, test_size=0.2, stratify=dataset2['label'], random_state=random_state
    )
    # External labels that are absent from the in-house label set map to NaN
    # and are removed by the dropna() below.
    dataset2 = dataset2.assign(label=dataset2['label'].map(contiguous_mapping))

    dataset = (
        pd.concat((dataset, dataset2))
        .sample(frac=1, random_state=random_state)
        .dropna()
        .drop_duplicates(subset=["text"])
    )

    # Unmapped labels turn the column into float; cast back so the label
    # tensors are integer class ids rather than floats.
    dataset = dataset.astype({"label": int})
    eval_dataset = eval_dataset.astype({"label": int})

    # Plain Python lists go straight to the tokenizer.
    train_dataset = DatasetPrep(dataset["text"].tolist(), dataset["label"].tolist())
    test_dataset = DatasetPrep(eval_dataset["text"].tolist(), eval_dataset["label"].tolist())

    return train_dataset, test_dataset, left_out_test_set, reversedDict, left_out_test_set_for_online

In [None]:
import os

import wandb
from huggingface_hub import login

# Credentials are read from the environment so no secret is stored in the
# notebook. Set WANDB_API_KEY and HF_TOKEN before running this cell; when a
# variable is unset, None is passed and each library uses its own default
# credential handling.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
login(os.environ.get("HF_TOKEN"))

In [None]:
from transformers import pipeline

In [None]:
# Checkpoints to fine-tune, one experiment per entry. Only uncommented entries
# are run; the commented ones are other candidates, kept for reference.
models = [
    # "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
    # "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
    # "CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment",
    # "CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment",
    # "CAMeL-Lab/bert-base-arabic-camelbert-ca-sentiment",
    # "nourmorsy/PermoBERT-Arabic-Sentiment-Analysis-NoFarasa-WLV-44000Token",
    # "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    # "ssary/XLM-RoBERTa-German-sentiment",
    # "Ammar-alhaj-ali/arabic-MARBERT-sentiment",
    # "sentence-transformers/distiluse-base-multilingual-cased-v2",
    # "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    # "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    # "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
    # "aubmindlab/bert-base-arabertv02",
    # "UBC-NLP/MARBERTv2",
    # "intfloat/multilingual-e5-large",
    # "intfloat/multilingual-e5-small",
    "omarelsayeed/setfit_ammar",
]


import gc

import torch
from transformers import Trainer, TrainingArguments

# Fine-tune every checkpoint in `models`, score it on the in-house held-out
# set, and collect one summary row per model in `experiment_tracker`.

# Token budget at inference time; mirrors the max_length=50 that DatasetPrep
# tokenizes the training data with.
MAX_LENGTH = 50
# Use the GPU when present, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

experiment_tracker = []

for MODEL_NAME in models:
    # DatasetPrep reads the module-level `tokenizer`, so it has to exist
    # before prepare_dataset() is called.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset, test_dataset, left_out_test_set, reversedDict, left_out_test_set_for_online = prepare_dataset()

    # Size the classification head from the label map actually found in the
    # data and store the label names in the config from the start.
    label2id = {label: idx for idx, label in reversedDict.items()}
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(reversedDict),
        id2label=reversedDict,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    ).to(DEVICE)

    batch_size = 250

    args = TrainingArguments(
        output_dir="my_awesome_model",
        eval_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=128,
        num_train_epochs=2,
        learning_rate=5e-5,
        logging_steps=12,
        warmup_ratio=0.2,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    results = trainer.train()
    eval_results = trainer.evaluate()

    export_dir = f"exp_{MODEL_NAME}"
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    pipe = pipeline("text-classification", export_dir, device=DEVICE)
    # Truncate exactly as during training so long texts cannot exceed the
    # model's maximum input length.
    predictions = pipe(left_out_test_set.text.tolist(), truncation=True, max_length=MAX_LENGTH)

    # assign() returns a new frame instead of writing onto a split slice.
    left_out_test_set = left_out_test_set.assign(
        predictions=[p["label"] for p in predictions],
        score=[p["score"] for p in predictions],
    )

    test_accuracy = float((left_out_test_set["label"] == left_out_test_set["predictions"]).mean())
    print("Test set accuracy = ", test_accuracy)

    experiment_tracker.append(
        {
            "test_accuracy": test_accuracy,
            "model_name": MODEL_NAME,
            "training_loss": results.training_loss,
            "eval_loss": eval_results["eval_loss"],
            "eval_f1": eval_results["eval_f1"],
        }
    )

    # The trainer also references the model; drop it too so the memory can be
    # reclaimed before the next checkpoint is loaded.
    del model, tokenizer, pipe, trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Reload the fine-tuned checkpoint that is going to be published.
# Defaults to the directory the training loop saved for the last entry in
# `models`; point CHECKPOINT_DIR elsewhere to publish a different run.
CHECKPOINT_DIR = f"exp_{models[-1]}"

# Model and tokenizer must come from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
tokz = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

In [None]:
# Target repository on the Hugging Face Hub, e.g. "<user-or-org>/<repo-name>".
# Defined once so the model and the tokenizer always go to the same repo.
HUB_REPO_ID = ""  # TODO: fill in before running this cell

if not HUB_REPO_ID:
    # Fail before any upload starts, with a message that says what to fix.
    raise ValueError("Set HUB_REPO_ID to the target Hub repository before pushing.")

model.push_to_hub(HUB_REPO_ID)
tokz.push_to_hub(HUB_REPO_ID)