### Pipeline to Fine-Tune CAMELBERT and Evaluate It Using Three Datasets

This document outlines the pipeline to fine-tune the MARBERT model and evaluate its performance on three distinct datasets. After fine-tuning, we will compare the results across the datasets to assess performance consistency and accuracy across varying input types and data structures.

---

#### Overview of Datasets:
1. **Dataset 1:** GPT-4o-generated samples
2. **Dataset 2:** Samples classified using 18 binary classifiers on random samples
3. **Dataset 3:** Samples classified using 18 binary classifiers on equivalent samples

---

In [2]:
import torch
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    TrainerCallback,
)
from sklearn.metrics import accuracy_score, hamming_loss, precision_recall_fscore_support
from preprocess import final_eliminations
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
class BertTrainer:
    def __init__(self, training_dataset_path, labels, exp_num, threshold=0.5, model_name="CAMeL-Lab/bert-base-arabic-camelbert-ca"):
        self.labels = labels
        self.label2id = {label: idx for idx, label in enumerate(labels)}
        self.id2label = {idx: label for label, idx in self.label2id.items()}
        self.model_name = model_name
        self.exp_num = exp_num
        training_dataset = pd.read_csv(training_dataset_path)
        self.training_dataset_processed = pd.DataFrame({
            'text': training_dataset['tweet'],
            'label': training_dataset[self.labels].values.tolist()
        })
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.train_df, self.val_df = train_test_split(self.training_dataset_processed, test_size=0.1, random_state=42)
        self.train_df['text'] = self.train_df['text'].astype(str)
        self.val_df['text'] = self.val_df['text'].astype(str)
        self.train_dataset = self.create_dataset(self.train_df)
        self.val_dataset = self.create_dataset(self.val_df)
        self.threshold = threshold
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.load_model(dropout_rate=0.3)  # Adding dropout rate

    def create_dataset(self, df):
        encodings = self.tokenizer(
            df['text'].tolist(), truncation=True, padding=True, max_length=128
        )
        return TweetDataset(encodings, df['label'].values)

    def load_model(self, dropout_rate=0.3):
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.labels),
            id2label=self.id2label,
            label2id=self.label2id,
            problem_type="multi_label_classification"
        )
        # Adjust dropout if supported
        model.config.hidden_dropout_prob = dropout_rate
        model.config.attention_probs_dropout_prob = dropout_rate
        
        # Freeze the lower layers of the model to prevent overfitting
        for param in model.bert.encoder.layer[:8].parameters():
            param.requires_grad = False

        model.to(self.device)
        return model

    
    def predict(self, texts):
        encodings = self.tokenizer(
            texts, 
            truncation=True, 
            padding=True, 
            max_length=128, 
            return_tensors="pt"
        )
        input_ids = encodings['input_ids'].to(self.device)
        attention_mask = encodings['attention_mask'].to(self.device)
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
        probabilities = torch.sigmoid(logits).cpu().numpy()
        predictions = (probabilities >= self.threshold).astype(int)
        variation_score = 1 - (np.sum(probabilities)/18)
        return predictions, probabilities, variation_score

    
    def evaluate(self, dev_path):
        if '.tsv' in dev_path:
            dev = pd.read_csv(dev_path, sep='\t')
        else:
            dev = pd.read_csv(dev_path)
        
        dev = final_eliminations(dev, column_name="sentence")

        df_replaced = dev.replace({'y': 1, 'n': 0})
        country_columns = df_replaced.columns.difference(['sentence'])
        df_replaced['label'] = df_replaced[country_columns].values.tolist()
        df_final = df_replaced[['sentence', 'label']]
        
        predictions, probabilities, _ = self.predict(df_final['sentence'].tolist())
        output_dir = f'./exp_{self.exp_num}'
        output_file = os.path.join(self.save_dir, f"{self.model_name.replace('/', '-')}-experiment-{self.exp_num}_predictions.txt")
        os.makedirs(output_dir, exist_ok=True)
        with open(output_file, 'w') as f:
            for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
                pred_str = ','.join(map(str, pred))
                f.write(f'{pred_str}\n')
        # with open(output_file, 'w') as f:
        #     for pred in predictions:
        #         pred_str = ','.join(map(str, pred))
        #         f.write(f'{pred_str}\n')
        
        indexes = [0, 2, 4, 10, 13, 14, 15, 17]
        predictions = [output[indexes] for output in predictions]


        subset_accuracy = accuracy_score(df_final['label'].tolist(), predictions)
        print(f"Subset Accuracy: {subset_accuracy:.4f}")

        hamming = hamming_loss(df_final['label'].tolist(), predictions)
        print(f"Hamming Loss: {hamming:.4f}")

        precision, recall, f1, _ = precision_recall_fscore_support(
            df_final['label'].tolist(), predictions, average='micro'  # Use 'micro' for multi-label tasks
        )
        print(f"Micro Precision: {precision:.4f}")
        print(f"Micro Recall: {recall:.4f}")
        print(f"Micro F1-Score: {f1:.4f}")

        precision_per_label, recall_per_label, f1_per_label, _ = precision_recall_fscore_support(
            df_final['label'].tolist(), predictions, average=None  # 'None' gives metrics for each label
        )
        print(f"Precision per label: {precision_per_label}")
        print(f"Recall per label: {recall_per_label}")
        print(f"F1-Score per label: {f1_per_label}")
        multilabel_check = [np.sum(np.array(prediction)) for prediction in predictions]
        print(set(multilabel_check))


    def compute_metrics(self, p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        result = self.multi_label_metrics(preds, p.label_ids)
        return result

    def multi_label_metrics(self, predictions, labels):
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(torch.Tensor(predictions))
        y_pred = np.zeros(probs.shape)
        y_pred[np.where(probs >= self.threshold)] = 1
        f1 = f1_score(labels, y_pred, average='micro')
        roc_auc = roc_auc_score(labels, y_pred, average='micro')
        accuracy = accuracy_score(labels, y_pred)
        return {'f1': f1, 'roc_auc': roc_auc, 'accuracy': accuracy}
    
    # def train(
    #     self,
    #     num_train_epochs=3,  
    #     metric_for_best_model="eval_f1",  
    #     greater_is_better=True,  
    #     per_device_train_batch_size=8,
    #     per_device_eval_batch_size=16,
    #     patience=2
    # ):
    #     training_args = TrainingArguments(
    #         output_dir='./exp_' + str(self.exp_num) + '/results',
    #         num_train_epochs=num_train_epochs,
    #         per_device_train_batch_size=per_device_train_batch_size,
    #         per_device_eval_batch_size=per_device_eval_batch_size,
    #         warmup_steps=500,
    #         weight_decay=0.01,
    #         logging_dir='./exp_' + str(self.exp_num) + '/logs',
    #         logging_steps=500,
    #         evaluation_strategy="epoch",
    #         save_strategy="epoch",
    #         load_best_model_at_end=True,
    #         metric_for_best_model=metric_for_best_model,
    #         greater_is_better=greater_is_better,
    #         fp16=True,
    #         report_to=["tensorboard"],
    #         lr_scheduler_type="reduce_lr_on_plateau",  # Adjust learning rate dynamically
    #     )

    #     early_stopping_callback = EarlyStoppingCallback(
    #         early_stopping_patience=patience
    #     )

    #     trainer = CustomTrainer(
    #         model=self.model,
    #         args=training_args,
    #         train_dataset=self.train_dataset,
    #         eval_dataset=self.val_dataset,
    #         tokenizer=self.tokenizer,
    #         compute_metrics=self.compute_metrics,
    #         callbacks=[early_stopping_callback]  # Register the early stopping callback
    #     )
        
    #     trainer.train()
    #     best_metric_value = trainer.state.best_metric 
    #     num_epochs = training_args.num_train_epochs
    #     greater_is_better = training_args.greater_is_better
    #     metric_name = training_args.metric_for_best_model
    #     save_dir = f'./exp_{self.exp_num}/marbert_finetuned_epochs_{num_epochs}_{metric_name}_{best_metric_value:.4f}_{"greater" if greater_is_better else "less"}_threshold_{self.threshold}'
    #     self.save_dir = save_dir
    #     os.makedirs(save_dir, exist_ok=True)
    #     self.model.save_pretrained(save_dir, safe_serialization=False)
    #     self.tokenizer.save_pretrained(save_dir)
    def train(
        self,
        num_train_epochs=3,  
        metric_for_best_model="eval_f1",  
        greater_is_better=True,  
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        patience=2,
        warmup_steps=500,  # Number of steps for learning rate warmup
        base_learning_rate=5e-5,  # Initial learning rate
    ):
        training_args = TrainingArguments(
            output_dir='./exp_' + str(self.exp_num) + '/results',
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=base_learning_rate,
            weight_decay=0.01,
            logging_dir='./exp_' + str(self.exp_num) + '/logs',
            logging_steps=500,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model=metric_for_best_model,
            greater_is_better=greater_is_better,
            fp16=True,
            report_to=["tensorboard"],
            lr_scheduler_type="linear",  # Use linear warmup and decay
        )

        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=patience
        )

        trainer = CustomTrainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[early_stopping_callback]  # Register the early stopping callback
        )

        trainer.train()

        # Save the best model
        best_metric_value = trainer.state.best_metric 
        num_epochs = training_args.num_train_epochs
        greater_is_better = training_args.greater_is_better
        metric_name = training_args.metric_for_best_model
        save_dir = f'./exp_{self.exp_num}/marbert_finetuned_epochs_{num_epochs}_{metric_name}_{best_metric_value:.4f}_{"greater" if greater_is_better else "less"}_threshold_{self.threshold}'
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)
        self.model.save_pretrained(save_dir, safe_serialization=False)
        self.tokenizer.save_pretrained(save_dir)

        


class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


class CustomTrainer(Trainer):
    def save_model(self, output_dir=None, **kwargs):
        if output_dir is None:
            output_dir = self.args.output_dir
        for param in self.model.parameters():
            param.data = param.data.contiguous()
        super().save_model(output_dir, **kwargs)


In [3]:
dataset_path = '/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/multilabel/NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv'
df = pd.read_csv(dataset_path)
# df_200 = df.head(1000)
# output_path = '/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/First_1000.csv'
# df_200.to_csv(output_path, index=False)

In [5]:
import pandas as pd
from sklearn.utils import shuffle

# Load the dataset
label_columns = df.columns[2:-1]  # Excludes 'id', 'tweet', and 'Computed' columns
df[label_columns] = df[label_columns].astype(int)  # Ensure labels are integers

threshold = 500  
balanced_df = pd.DataFrame()

# Iterate over possible combinations of active labels    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",

for num_classes in range(1, len(label_columns) + 1):
    subset = df[df[label_columns].sum(axis=1) == num_classes]  # Filter rows with num_classes active labels
    
    # Shuffle and sample the subset if it exceeds the threshold
    if len(subset) > threshold:
        subset = shuffle(subset).head(threshold)
    
    # Append sampled subset to the balanced dataset
    balanced_df = pd.concat([balanced_df, subset], ignore_index=True)

# Shuffle the final balanced DataFrame and save it to a new CSV
balanced_df = shuffle(balanced_df).reset_index(drop=True)
balanced_df.to_csv('balanced_multilabel_dataset_' + str(threshold) + '.csv', index=False)

print("Balanced dataset created and saved as 'balanced_multilabel_dataset_500.csv'")

label_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait', 'Lebanon', 
                 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar', 'Saudi_Arabia', 
                 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

# Initialize a dictionary to store counts
counts = {}

# Loop to count rows where the sum of 1s in label columns equals i (from 0 to 18)
for i in range(19):
    counts[i] = (balanced_df[label_columns].sum(axis=1) == i).sum()


Balanced dataset created and saved as 'balanced_multilabel_dataset_500.csv'


In [6]:
counts

{0: 0,
 1: 500,
 2: 500,
 3: 500,
 4: 500,
 5: 500,
 6: 500,
 7: 500,
 8: 500,
 9: 500,
 10: 168,
 11: 137,
 12: 334,
 13: 273,
 14: 256,
 15: 500,
 16: 11,
 17: 26,
 18: 500}

### EXPERIMENT 6

In [161]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[3]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    labels=labels,
    threshold=0.3,
    exp_num=6
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=24
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 502/10060 [00:27<08:58, 17.75it/s]

{'loss': 0.5952, 'grad_norm': 1.13016939163208, 'learning_rate': 4.97e-05, 'epoch': 0.5}


 10%|▉         | 1004/10060 [00:56<07:53, 19.14it/s]

{'loss': 0.5852, 'grad_norm': 4.487825870513916, 'learning_rate': 4.7400627615062765e-05, 'epoch': 0.99}


 10%|█         | 1006/10060 [00:56<08:19, 18.13it/s]
 10%|█         | 1006/10060 [00:56<08:19, 18.13it/s]

{'eval_loss': 0.5702190399169922, 'eval_f1': 0.721625086283404, 'eval_roc_auc': 0.6566456002973553, 'eval_accuracy': 0.08035714285714286, 'eval_runtime': 0.3316, 'eval_samples_per_second': 1350.871, 'eval_steps_per_second': 57.291, 'epoch': 1.0}


 15%|█▍        | 1502/10060 [01:29<08:15, 17.28it/s]  

{'loss': 0.5474, 'grad_norm': 1.6961125135421753, 'learning_rate': 4.478556485355649e-05, 'epoch': 1.49}


 20%|█▉        | 2002/10060 [01:59<08:12, 16.35it/s]

{'loss': 0.5359, 'grad_norm': 3.8677239418029785, 'learning_rate': 4.217050209205021e-05, 'epoch': 1.99}


 20%|██        | 2012/10060 [01:59<08:10, 16.40it/s]
 20%|██        | 2012/10060 [02:00<08:10, 16.40it/s]

{'eval_loss': 0.551264226436615, 'eval_f1': 0.737281067556297, 'eval_roc_auc': 0.6925700472012972, 'eval_accuracy': 0.08035714285714286, 'eval_runtime': 0.3326, 'eval_samples_per_second': 1346.946, 'eval_steps_per_second': 57.125, 'epoch': 2.0}


 25%|██▍       | 2502/10060 [02:32<07:33, 16.67it/s]  

{'loss': 0.4861, 'grad_norm': 3.64426326751709, 'learning_rate': 3.955543933054394e-05, 'epoch': 2.49}


 30%|██▉       | 3003/10060 [03:01<06:41, 17.59it/s]

{'loss': 0.4749, 'grad_norm': 2.2494912147521973, 'learning_rate': 3.694560669456067e-05, 'epoch': 2.98}


 30%|██▉       | 3017/10060 [03:02<06:55, 16.94it/s]
 30%|███       | 3018/10060 [03:03<06:55, 16.94it/s]

{'eval_loss': 0.5411345362663269, 'eval_f1': 0.7527460210715087, 'eval_roc_auc': 0.7294907566665392, 'eval_accuracy': 0.08928571428571429, 'eval_runtime': 0.3123, 'eval_samples_per_second': 1434.531, 'eval_steps_per_second': 60.84, 'epoch': 3.0}


 35%|███▍      | 3503/10060 [03:33<06:20, 17.25it/s]

{'loss': 0.4221, 'grad_norm': 3.1075947284698486, 'learning_rate': 3.433054393305439e-05, 'epoch': 3.48}


 40%|███▉      | 4003/10060 [04:03<06:08, 16.43it/s]

{'loss': 0.4147, 'grad_norm': 5.762308120727539, 'learning_rate': 3.171548117154812e-05, 'epoch': 3.98}


 40%|███▉      | 4023/10060 [04:04<06:03, 16.60it/s]
 40%|████      | 4024/10060 [04:04<06:03, 16.60it/s]

{'eval_loss': 0.5831584930419922, 'eval_f1': 0.7425149700598802, 'eval_roc_auc': 0.7443502191112906, 'eval_accuracy': 0.11607142857142858, 'eval_runtime': 0.3143, 'eval_samples_per_second': 1425.53, 'eval_steps_per_second': 60.458, 'epoch': 4.0}


 45%|████▍     | 4503/10060 [04:36<05:33, 16.65it/s]

{'loss': 0.3712, 'grad_norm': 1.8115085363388062, 'learning_rate': 2.9100418410041842e-05, 'epoch': 4.47}


 50%|████▉     | 5003/10060 [05:07<05:28, 15.39it/s]

{'loss': 0.369, 'grad_norm': 3.180023193359375, 'learning_rate': 2.6485355648535566e-05, 'epoch': 4.97}


 50%|████▉     | 5029/10060 [05:09<05:41, 14.71it/s]
 50%|█████     | 5030/10060 [05:09<05:41, 14.71it/s]

{'eval_loss': 0.5870450735092163, 'eval_f1': 0.7595059336401065, 'eval_roc_auc': 0.7547659683134773, 'eval_accuracy': 0.12946428571428573, 'eval_runtime': 0.3544, 'eval_samples_per_second': 1264.012, 'eval_steps_per_second': 53.608, 'epoch': 5.0}


 55%|█████▍    | 5503/10060 [05:41<04:50, 15.70it/s]

{'loss': 0.3241, 'grad_norm': 2.609508991241455, 'learning_rate': 2.387029288702929e-05, 'epoch': 5.47}


 60%|█████▉    | 6003/10060 [06:11<04:06, 16.45it/s]

{'loss': 0.3189, 'grad_norm': 4.649298191070557, 'learning_rate': 2.1255230125523013e-05, 'epoch': 5.96}


 60%|█████▉    | 6035/10060 [06:13<04:18, 15.56it/s]
 60%|██████    | 6036/10060 [06:14<04:18, 15.56it/s]

{'eval_loss': 0.607021152973175, 'eval_f1': 0.7645331767469172, 'eval_roc_auc': 0.7531913915381108, 'eval_accuracy': 0.109375, 'eval_runtime': 0.3174, 'eval_samples_per_second': 1411.641, 'eval_steps_per_second': 59.869, 'epoch': 6.0}


 65%|██████▍   | 6503/10060 [06:45<03:27, 17.11it/s]

{'loss': 0.2827, 'grad_norm': 5.2005462646484375, 'learning_rate': 1.8640167364016737e-05, 'epoch': 6.46}


 70%|██████▉   | 7003/10060 [07:16<03:03, 16.68it/s]

{'loss': 0.269, 'grad_norm': 3.7985293865203857, 'learning_rate': 1.602510460251046e-05, 'epoch': 6.96}


 70%|██████▉   | 7041/10060 [07:18<03:09, 15.94it/s]
 70%|███████   | 7042/10060 [07:18<03:09, 15.94it/s]

{'eval_loss': 0.6416592597961426, 'eval_f1': 0.7568916349809885, 'eval_roc_auc': 0.7478023782895085, 'eval_accuracy': 0.09151785714285714, 'eval_runtime': 0.32, 'eval_samples_per_second': 1399.965, 'eval_steps_per_second': 59.374, 'epoch': 7.0}


 75%|███████▍  | 7503/10060 [07:50<02:30, 16.98it/s]

{'loss': 0.2388, 'grad_norm': 2.66792893409729, 'learning_rate': 1.3410041841004184e-05, 'epoch': 7.46}


 80%|███████▉  | 8003/10060 [08:22<02:12, 15.54it/s]

{'loss': 0.2438, 'grad_norm': 1.4793015718460083, 'learning_rate': 1.079497907949791e-05, 'epoch': 7.95}


 80%|███████▉  | 8047/10060 [08:24<02:14, 14.91it/s]
 80%|████████  | 8048/10060 [08:25<02:14, 14.91it/s]

{'eval_loss': 0.6881314516067505, 'eval_f1': 0.756483082242529, 'eval_roc_auc': 0.7560156587990452, 'eval_accuracy': 0.109375, 'eval_runtime': 0.3317, 'eval_samples_per_second': 1350.675, 'eval_steps_per_second': 57.283, 'epoch': 8.0}


 85%|████████▍ | 8502/10060 [08:55<01:48, 14.41it/s]

{'loss': 0.2159, 'grad_norm': 2.017803192138672, 'learning_rate': 8.185146443514645e-06, 'epoch': 8.45}


 89%|████████▉ | 9002/10060 [09:26<01:02, 16.91it/s]

{'loss': 0.2117, 'grad_norm': 2.7105093002319336, 'learning_rate': 5.570083682008369e-06, 'epoch': 8.95}


 90%|█████████ | 9054/10060 [09:29<00:59, 16.99it/s]
 90%|█████████ | 9054/10060 [09:29<00:59, 16.99it/s]

{'eval_loss': 0.7002390623092651, 'eval_f1': 0.7545499262174127, 'eval_roc_auc': 0.7531412598940754, 'eval_accuracy': 0.10714285714285714, 'eval_runtime': 0.303, 'eval_samples_per_second': 1478.576, 'eval_steps_per_second': 62.707, 'epoch': 9.0}


 94%|█████████▍| 9502/10060 [09:59<00:34, 16.07it/s]

{'loss': 0.1931, 'grad_norm': 1.5471935272216797, 'learning_rate': 2.955020920502092e-06, 'epoch': 9.44}


 99%|█████████▉| 10002/10060 [10:29<00:03, 17.19it/s]

{'loss': 0.1922, 'grad_norm': 2.906043529510498, 'learning_rate': 3.3995815899581595e-07, 'epoch': 9.94}


100%|██████████| 10060/10060 [10:33<00:00, 16.57it/s]
100%|██████████| 10060/10060 [10:35<00:00, 16.57it/s]

{'eval_loss': 0.7048290967941284, 'eval_f1': 0.7522365805168986, 'eval_roc_auc': 0.7531335047226225, 'eval_accuracy': 0.09821428571428571, 'eval_runtime': 0.3158, 'eval_samples_per_second': 1418.587, 'eval_steps_per_second': 60.163, 'epoch': 10.0}


100%|██████████| 10060/10060 [10:38<00:00, 15.75it/s]


{'train_runtime': 638.6339, 'train_samples_per_second': 62.994, 'train_steps_per_second': 15.752, 'train_loss': 0.36358296686327957, 'epoch': 10.0}


In [165]:
trainer.evaluate(dev_path=dev_path)

Subset Accuracy: 0.0833
Hamming Loss: 0.3354
Micro Precision: 0.5590
Micro Recall: 0.4522
Micro F1-Score: 0.5000
Precision per label: [0.42857143 0.45       0.53731343 0.62711864 0.71428571 0.50847458
 0.25       0.63934426]
Recall per label: [0.08571429 0.23076923 0.72       0.578125   0.11904762 0.65217391
 0.0952381  0.66101695]
F1-Score per label: [0.14285714 0.30508475 0.61538462 0.60162602 0.20408163 0.57142857
 0.13793103 0.65      ]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [166]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_6/camelbert_finetuned_epochs_10_eval_f1_0.7645_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-ca-experiment-6_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 51.94 %
MACRO AVERAGE RECALL SCORE: 39.28 %
MACRO AVERAGE F1-SCORE: 40.35 %
MACRO AVERAGE ACCURACY: 66.46 %



### EXPERIMENT 7

In [171]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    labels=labels,
    threshold=0.3,
    exp_num=7
)
trainer.train(
    num_train_epochs=1,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 252/252 [00:19<00:00, 13.91it/s]
100%|██████████| 252/252 [00:21<00:00, 13.91it/s]

{'eval_loss': 0.48796308040618896, 'eval_f1': 0.7621827981136312, 'eval_roc_auc': 0.7501663991770002, 'eval_accuracy': 0.12667660208643816, 'eval_runtime': 0.4998, 'eval_samples_per_second': 1342.658, 'eval_steps_per_second': 56.027, 'epoch': 1.0}


100%|██████████| 252/252 [00:24<00:00, 10.39it/s]


{'train_runtime': 24.2469, 'train_samples_per_second': 248.857, 'train_steps_per_second': 10.393, 'train_loss': 0.5769250052315849, 'epoch': 1.0}
Subset Accuracy: 0.1250
Hamming Loss: 0.2844
Micro Precision: 0.6005
Micro Recall: 0.6966
Micro F1-Score: 0.6450
Precision per label: [0.78571429 0.675      0.49484536 0.65384615 0.88235294 0.4875
 0.66666667 0.65277778]
Recall per label: [0.31428571 0.69230769 0.96       0.796875   0.35714286 0.84782609
 0.47619048 0.79661017]
F1-Score per label: [0.44897959 0.6835443  0.65306122 0.71830986 0.50847458 0.61904762
 0.55555556 0.71755725]
{np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [172]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_7/camelbert_finetuned_epochs_1_eval_f1_0.7622_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-mix-experiment-7_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 66.23 %
MACRO AVERAGE RECALL SCORE: 65.52 %
MACRO AVERAGE F1-SCORE: 61.31 %
MACRO AVERAGE ACCURACY: 71.56 %



### EXPERIMENT 8

In [173]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    labels=labels,
    threshold=0.3,
    exp_num=8
)
trainer.train(
    num_train_epochs=2,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 50%|█████     | 252/504 [00:19<00:19, 13.26it/s]

{'eval_loss': 0.48796144127845764, 'eval_f1': 0.7621827981136312, 'eval_roc_auc': 0.7501663991770002, 'eval_accuracy': 0.12667660208643816, 'eval_runtime': 0.4798, 'eval_samples_per_second': 1398.644, 'eval_steps_per_second': 58.364, 'epoch': 1.0}


100%|█████████▉| 502/504 [00:42<00:00, 12.29it/s]

{'loss': 0.5124, 'grad_norm': 3.563359260559082, 'learning_rate': 4.99e-05, 'epoch': 1.98}


100%|██████████| 504/504 [00:42<00:00, 12.74it/s]
100%|██████████| 504/504 [00:45<00:00, 12.74it/s]

{'eval_loss': 0.4665817618370056, 'eval_f1': 0.769424942809813, 'eval_roc_auc': 0.7668044404020513, 'eval_accuracy': 0.14754098360655737, 'eval_runtime': 0.4957, 'eval_samples_per_second': 1353.615, 'eval_steps_per_second': 56.485, 'epoch': 2.0}


100%|██████████| 504/504 [00:47<00:00, 10.53it/s]


{'train_runtime': 47.8604, 'train_samples_per_second': 252.15, 'train_steps_per_second': 10.531, 'train_loss': 0.5122848169671165, 'epoch': 2.0}
Subset Accuracy: 0.1333
Hamming Loss: 0.2646
Micro Precision: 0.6321
Micro Recall: 0.6854
Micro F1-Score: 0.6577
Precision per label: [0.6875     0.70833333 0.52808989 0.65384615 1.         0.53846154
 0.64705882 0.75510204]
Recall per label: [0.31428571 0.87179487 0.94       0.796875   0.26190476 0.91304348
 0.52380952 0.62711864]
F1-Score per label: [0.43137255 0.7816092  0.67625899 0.71830986 0.41509434 0.67741935
 0.57894737 0.68518519]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [174]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-mix-experiment-8_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 68.98 %
MACRO AVERAGE RECALL SCORE: 65.61 %
MACRO AVERAGE F1-SCORE: 62.05 %
MACRO AVERAGE ACCURACY: 73.54 %



### EXPERIMENT 9

In [175]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=9
)
trainer.train(
    num_train_epochs=2,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

 50%|█████     | 252/504 [00:19<00:18, 13.85it/s]
 50%|█████     | 252/504 [00:19<00:18, 13.85it/s]

{'eval_loss': 0.4918179214000702, 'eval_f1': 0.7811874048553802, 'eval_roc_auc': 0.7815018201807157, 'eval_accuracy': 0.16989567809239942, 'eval_runtime': 0.495, 'eval_samples_per_second': 1355.615, 'eval_steps_per_second': 56.568, 'epoch': 1.0}


100%|█████████▉| 502/504 [00:41<00:00, 13.14it/s]

{'loss': 0.3169, 'grad_norm': 3.382401466369629, 'learning_rate': 4.99e-05, 'epoch': 1.98}


100%|██████████| 504/504 [00:41<00:00, 13.63it/s]
100%|██████████| 504/504 [00:44<00:00, 13.63it/s]

{'eval_loss': 0.5153090953826904, 'eval_f1': 0.7765307806568741, 'eval_roc_auc': 0.7810091627168709, 'eval_accuracy': 0.16095380029806258, 'eval_runtime': 0.4839, 'eval_samples_per_second': 1386.632, 'eval_steps_per_second': 57.862, 'epoch': 2.0}


100%|██████████| 504/504 [00:46<00:00, 10.79it/s]


{'train_runtime': 46.7301, 'train_samples_per_second': 258.249, 'train_steps_per_second': 10.785, 'train_loss': 0.3173308519143907, 'epoch': 2.0}
Subset Accuracy: 0.1833
Hamming Loss: 0.2406
Micro Precision: 0.6791
Micro Recall: 0.6657
Micro F1-Score: 0.6723
Precision per label: [0.78571429 0.76744186 0.57746479 0.73134328 0.84210526 0.56060606
 0.66666667 0.74074074]
Recall per label: [0.31428571 0.84615385 0.82       0.765625   0.38095238 0.80434783
 0.47619048 0.6779661 ]
F1-Score per label: [0.44897959 0.80487805 0.67768595 0.7480916  0.52459016 0.66071429
 0.55555556 0.7079646 ]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [176]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_9/camelbert_finetuned_epochs_2_eval_f1_0.7812_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_8-camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3-experiment-9_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 70.90 %
MACRO AVERAGE RECALL SCORE: 63.57 %
MACRO AVERAGE F1-SCORE: 64.11 %
MACRO AVERAGE ACCURACY: 75.94 %



### EXPERIMENT 10

In [181]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv", "balanced_multilabel_dataset_750.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[5]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    labels=labels,
    threshold=0.3,
    exp_num=10
)
trainer.train(
    num_train_epochs=6,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 330/1980 [00:25<03:22,  8.15it/s]
 17%|█▋        | 330/1980 [00:26<03:22,  8.15it/s]

{'eval_loss': 0.4739709496498108, 'eval_f1': 0.7463900752491357, 'eval_roc_auc': 0.7755923502913882, 'eval_accuracy': 0.11161731207289294, 'eval_runtime': 0.6115, 'eval_samples_per_second': 1435.763, 'eval_steps_per_second': 60.505, 'epoch': 1.0}


 25%|██▌       | 502/1980 [00:42<02:00, 12.23it/s]

{'loss': 0.5121, 'grad_norm': 3.3443150520324707, 'learning_rate': 4.99e-05, 'epoch': 1.52}


 33%|███▎      | 660/1980 [00:55<02:11, 10.03it/s]
 33%|███▎      | 660/1980 [00:56<02:11, 10.03it/s]

{'eval_loss': 0.454266220331192, 'eval_f1': 0.7598915235380015, 'eval_roc_auc': 0.7901453281480804, 'eval_accuracy': 0.17539863325740318, 'eval_runtime': 0.6578, 'eval_samples_per_second': 1334.809, 'eval_steps_per_second': 56.25, 'epoch': 2.0}


 50%|█████     | 990/1980 [01:26<01:18, 12.68it/s]
 50%|█████     | 990/1980 [01:26<01:18, 12.68it/s]

{'eval_loss': 0.475048303604126, 'eval_f1': 0.7562757722636186, 'eval_roc_auc': 0.7892164231072305, 'eval_accuracy': 0.16856492027334852, 'eval_runtime': 0.6607, 'eval_samples_per_second': 1328.888, 'eval_steps_per_second': 56.001, 'epoch': 3.0}


 51%|█████     | 1002/1980 [01:30<02:36,  6.27it/s]

{'loss': 0.3603, 'grad_norm': 1.35929274559021, 'learning_rate': 3.314189189189189e-05, 'epoch': 3.03}


 67%|██████▋   | 1320/1980 [01:57<00:52, 12.62it/s]
 67%|██████▋   | 1320/1980 [01:57<00:52, 12.62it/s]

{'eval_loss': 0.5146341323852539, 'eval_f1': 0.758403801803061, 'eval_roc_auc': 0.7890565235205352, 'eval_accuracy': 0.16287015945330297, 'eval_runtime': 0.6829, 'eval_samples_per_second': 1285.638, 'eval_steps_per_second': 54.178, 'epoch': 4.0}


 76%|███████▌  | 1502/1980 [02:15<00:39, 11.95it/s]

{'loss': 0.2479, 'grad_norm': 2.6936798095703125, 'learning_rate': 1.6250000000000002e-05, 'epoch': 4.55}


 83%|████████▎ | 1650/1980 [02:28<00:27, 11.80it/s]
 83%|████████▎ | 1650/1980 [02:28<00:27, 11.80it/s]

{'eval_loss': 0.5459094643592834, 'eval_f1': 0.7602208400307499, 'eval_roc_auc': 0.7907542852717375, 'eval_accuracy': 0.16173120728929385, 'eval_runtime': 0.7121, 'eval_samples_per_second': 1233.033, 'eval_steps_per_second': 51.962, 'epoch': 5.0}


100%|██████████| 1980/1980 [02:58<00:00, 13.02it/s]
100%|██████████| 1980/1980 [03:01<00:00, 13.02it/s]

{'eval_loss': 0.5531436800956726, 'eval_f1': 0.7577061437539677, 'eval_roc_auc': 0.7889758052175457, 'eval_accuracy': 0.15831435079726652, 'eval_runtime': 0.7475, 'eval_samples_per_second': 1174.636, 'eval_steps_per_second': 49.501, 'epoch': 6.0}


100%|██████████| 1980/1980 [03:04<00:00, 10.73it/s]


{'train_runtime': 184.5783, 'train_samples_per_second': 256.704, 'train_steps_per_second': 10.727, 'train_loss': 0.32922791375054256, 'epoch': 6.0}
Subset Accuracy: 0.1583
Hamming Loss: 0.2594
Micro Precision: 0.6801
Micro Recall: 0.5674
Micro F1-Score: 0.6187
Precision per label: [0.69230769 0.775      0.6        0.71929825 0.69230769 0.62264151
 0.46153846 0.77083333]
Recall per label: [0.25714286 0.79487179 0.72       0.640625   0.21428571 0.7173913
 0.28571429 0.62711864]
F1-Score per label: [0.375      0.78481013 0.65454545 0.67768595 0.32727273 0.66666667
 0.35294118 0.69158879]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [182]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "exp_10/camelbert_finetuned_epochs_6_eval_f1_0.7602_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-mix-experiment-10_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 66.67 %
MACRO AVERAGE RECALL SCORE: 53.21 %
MACRO AVERAGE F1-SCORE: 56.63 %
MACRO AVERAGE ACCURACY: 74.06 %



In [183]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv", "balanced_multilabel_dataset_750.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_10/camelbert_finetuned_epochs_6_eval_f1_0.7602_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=10
)
trainer.train(
    num_train_epochs=6,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)


 17%|█▋        | 252/1512 [00:19<01:30, 13.87it/s]
 17%|█▋        | 252/1512 [00:19<01:30, 13.87it/s]

{'eval_loss': 0.28491029143333435, 'eval_f1': 0.8595482203884305, 'eval_roc_auc': 0.8654463416464854, 'eval_accuracy': 0.28315946348733234, 'eval_runtime': 0.4594, 'eval_samples_per_second': 1460.564, 'eval_steps_per_second': 60.948, 'epoch': 1.0}


 33%|███▎      | 502/1512 [00:41<01:19, 12.73it/s]

{'loss': 0.2935, 'grad_norm': 3.5294675827026367, 'learning_rate': 5e-05, 'epoch': 1.98}


 33%|███▎      | 504/1512 [00:41<01:15, 13.29it/s]
 33%|███▎      | 504/1512 [00:41<01:15, 13.29it/s]

{'eval_loss': 0.30023330450057983, 'eval_f1': 0.8561597851628063, 'eval_roc_auc': 0.8624923837192617, 'eval_accuracy': 0.2786885245901639, 'eval_runtime': 0.4719, 'eval_samples_per_second': 1422.021, 'eval_steps_per_second': 59.339, 'epoch': 2.0}


 50%|█████     | 756/1512 [01:03<00:55, 13.60it/s]
 50%|█████     | 756/1512 [01:04<00:55, 13.60it/s]

{'eval_loss': 0.33522099256515503, 'eval_f1': 0.8580319596299412, 'eval_roc_auc': 0.8644862487499366, 'eval_accuracy': 0.28912071535022354, 'eval_runtime': 0.4806, 'eval_samples_per_second': 1396.254, 'eval_steps_per_second': 58.264, 'epoch': 3.0}


 66%|██████▋   | 1002/1512 [01:25<00:39, 12.86it/s]

{'loss': 0.2058, 'grad_norm': 2.2834112644195557, 'learning_rate': 2.5296442687747035e-05, 'epoch': 3.97}


 67%|██████▋   | 1008/1512 [01:25<00:38, 13.25it/s]
 67%|██████▋   | 1008/1512 [01:26<00:38, 13.25it/s]

{'eval_loss': 0.3422885537147522, 'eval_f1': 0.8558310376492194, 'eval_roc_auc': 0.8618272837252222, 'eval_accuracy': 0.2906110283159464, 'eval_runtime': 0.4943, 'eval_samples_per_second': 1357.393, 'eval_steps_per_second': 56.642, 'epoch': 4.0}


 83%|████████▎ | 1260/1512 [01:48<00:18, 13.55it/s]
 83%|████████▎ | 1260/1512 [01:48<00:18, 13.55it/s]

{'eval_loss': 0.3467765152454376, 'eval_f1': 0.8621884241656105, 'eval_roc_auc': 0.8688706876949601, 'eval_accuracy': 0.30849478390462, 'eval_runtime': 0.4589, 'eval_samples_per_second': 1462.286, 'eval_steps_per_second': 61.019, 'epoch': 5.0}


 99%|█████████▉| 1502/1512 [02:10<00:00, 13.01it/s]

{'loss': 0.1434, 'grad_norm': 1.2461748123168945, 'learning_rate': 5.928853754940711e-07, 'epoch': 5.95}


100%|██████████| 1512/1512 [02:10<00:00, 13.59it/s]
100%|██████████| 1512/1512 [02:13<00:00, 13.59it/s]

{'eval_loss': 0.35399869084358215, 'eval_f1': 0.8584212747994935, 'eval_roc_auc': 0.8651041609176625, 'eval_accuracy': 0.30998509687034276, 'eval_runtime': 0.5414, 'eval_samples_per_second': 1239.478, 'eval_steps_per_second': 51.722, 'epoch': 6.0}


100%|██████████| 1512/1512 [02:16<00:00, 11.11it/s]


{'train_runtime': 136.0435, 'train_samples_per_second': 266.121, 'train_steps_per_second': 11.114, 'train_loss': 0.21364403985164784, 'epoch': 6.0}
Subset Accuracy: 0.1667
Hamming Loss: 0.2687
Micro Precision: 0.6960
Micro Recall: 0.4888
Micro F1-Score: 0.5743
Precision per label: [0.69230769 0.74358974 0.63265306 0.73170732 0.76923077 0.65789474
 0.5        0.76744186]
Recall per label: [0.25714286 0.74358974 0.62       0.46875    0.23809524 0.54347826
 0.33333333 0.55932203]
F1-Score per label: [0.375      0.74358974 0.62626263 0.57142857 0.36363636 0.5952381
 0.4        0.64705882]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [184]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_10/camelbert_finetuned_epochs_6_eval_f1_0.8622_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_10-camelbert_finetuned_epochs_6_eval_f1_0.7602_greater_threshold_0.3-experiment-10_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 68.69 %
MACRO AVERAGE RECALL SCORE: 47.05 %
MACRO AVERAGE F1-SCORE: 54.03 %
MACRO AVERAGE ACCURACY: 73.12 %



### EXPERIMENT 11

In [186]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_750.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    labels=labels,
    threshold=0.3,
    exp_num=11
)
trainer.train(
    num_train_epochs=4,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█▎        | 503/3952 [00:27<02:59, 19.22it/s]

{'loss': 0.5519, 'grad_norm': 1.2335233688354492, 'learning_rate': 4.97e-05, 'epoch': 0.51}


                                                  
 25%|██▌       | 988/3952 [00:54<02:33, 19.25it/s]

{'eval_loss': 0.48687317967414856, 'eval_f1': 0.7363865383373167, 'eval_roc_auc': 0.7617819014921697, 'eval_accuracy': 0.12414578587699317, 'eval_runtime': 0.608, 'eval_samples_per_second': 1444.047, 'eval_steps_per_second': 60.854, 'epoch': 1.0}


 25%|██▌       | 1002/3952 [00:57<05:29,  8.95it/s]

{'loss': 0.4956, 'grad_norm': 3.4177091121673584, 'learning_rate': 4.2801274623406724e-05, 'epoch': 1.01}


 38%|███▊      | 1502/3952 [01:26<02:30, 16.31it/s]

{'loss': 0.409, 'grad_norm': 3.1355440616607666, 'learning_rate': 3.555909617612978e-05, 'epoch': 1.52}


                                                   
 50%|█████     | 1976/3952 [01:56<02:11, 15.06it/s]

{'eval_loss': 0.46634554862976074, 'eval_f1': 0.7590600808385285, 'eval_roc_auc': 0.7884490538348937, 'eval_accuracy': 0.17995444191343962, 'eval_runtime': 0.6889, 'eval_samples_per_second': 1274.564, 'eval_steps_per_second': 53.712, 'epoch': 2.0}


 51%|█████     | 2002/3952 [02:00<02:16, 14.32it/s]

{'loss': 0.3971, 'grad_norm': 3.05737566947937, 'learning_rate': 2.8316917728852837e-05, 'epoch': 2.02}


 63%|██████▎   | 2502/3952 [02:31<01:22, 17.65it/s]

{'loss': 0.3024, 'grad_norm': 1.4415926933288574, 'learning_rate': 2.10747392815759e-05, 'epoch': 2.53}


                                                   
 75%|███████▌  | 2964/3952 [03:00<01:03, 15.50it/s]

{'eval_loss': 0.48994776606559753, 'eval_f1': 0.7606552726756968, 'eval_roc_auc': 0.7920201280773378, 'eval_accuracy': 0.1765375854214123, 'eval_runtime': 0.695, 'eval_samples_per_second': 1263.324, 'eval_steps_per_second': 53.238, 'epoch': 3.0}


 76%|███████▌  | 3002/3952 [03:05<00:58, 16.31it/s]

{'loss': 0.3012, 'grad_norm': 3.860936403274536, 'learning_rate': 1.3832560834298958e-05, 'epoch': 3.04}


 89%|████████▊ | 3502/3952 [03:35<00:26, 16.75it/s]

{'loss': 0.2342, 'grad_norm': 2.205425500869751, 'learning_rate': 6.590382387022016e-06, 'epoch': 3.54}


                                                   
100%|██████████| 3952/3952 [04:05<00:00, 18.63it/s]

{'eval_loss': 0.5275660753250122, 'eval_f1': 0.7625708091474929, 'eval_roc_auc': 0.7929900864074229, 'eval_accuracy': 0.17198177676537585, 'eval_runtime': 0.6341, 'eval_samples_per_second': 1384.718, 'eval_steps_per_second': 58.354, 'epoch': 4.0}


100%|██████████| 3952/3952 [04:08<00:00, 15.93it/s]


{'train_runtime': 248.0563, 'train_samples_per_second': 127.342, 'train_steps_per_second': 15.932, 'train_loss': 0.3670956538273738, 'epoch': 4.0}
Subset Accuracy: 0.1833
Hamming Loss: 0.2573
Micro Precision: 0.6860
Micro Recall: 0.5646
Micro F1-Score: 0.6194
Precision per label: [0.75       0.74418605 0.5862069  0.72222222 0.84615385 0.61538462
 0.46666667 0.80434783]
Recall per label: [0.25714286 0.82051282 0.68       0.609375   0.26190476 0.69565217
 0.33333333 0.62711864]
F1-Score per label: [0.38297872 0.7804878  0.62962963 0.66101695 0.4        0.65306122
 0.38888889 0.7047619 ]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [187]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_11/camelbert_finetuned_epochs_4_eval_f1_0.7626_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-mix-experiment-11_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 69.19 %
MACRO AVERAGE RECALL SCORE: 53.56 %
MACRO AVERAGE F1-SCORE: 57.51 %
MACRO AVERAGE ACCURACY: 74.27 %



### EXPERIMENT 12

In [199]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_9/camelbert_finetuned_epochs_2_eval_f1_0.7812_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=12
)
trainer.train(
    num_train_epochs=5,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)


  5%|▌         | 52/988 [02:54<00:52, 17.78it/s]  

{'loss': 0.3145, 'grad_norm': 3.7815072536468506, 'learning_rate': 4.99e-05, 'epoch': 0.99}



[A
[A
[A
[A
[A

[A[A                                         
                                                  
  5%|▌         | 52/988 [02:55<00:52, 17.78it/s]
[A

{'eval_loss': 0.5411572456359863, 'eval_f1': 0.7787280360752604, 'eval_roc_auc': 0.7744481507891129, 'eval_accuracy': 0.16393442622950818, 'eval_runtime': 0.4921, 'eval_samples_per_second': 1363.594, 'eval_steps_per_second': 56.901, 'epoch': 1.0}



  5%|▌         | 52/988 [03:26<00:52, 17.78it/s]  

{'loss': 0.3036, 'grad_norm': 2.7231392860412598, 'learning_rate': 3.7617866004962784e-05, 'epoch': 1.99}



[A
[A
[A
[A
[A                                                

                                                
  5%|▌         | 52/988 [03:27<00:52, 17.78it/s]
[A

{'eval_loss': 0.5048529505729675, 'eval_f1': 0.7767005978348683, 'eval_roc_auc': 0.7780685608761594, 'eval_accuracy': 0.15797317436661698, 'eval_runtime': 0.4733, 'eval_samples_per_second': 1417.646, 'eval_steps_per_second': 59.157, 'epoch': 2.0}



  5%|▌         | 52/988 [03:59<00:52, 17.78it/s]   

{'loss': 0.2531, 'grad_norm': 1.3901019096374512, 'learning_rate': 2.5210918114143922e-05, 'epoch': 2.98}



[A
[A
[A
[A

[A[A                                         
                                                   
  5%|▌         | 52/988 [04:00<00:52, 17.78it/s]
[A

{'eval_loss': 0.5629974603652954, 'eval_f1': 0.7656576200417536, 'eval_roc_auc': 0.7780476988897891, 'eval_accuracy': 0.15201192250372578, 'eval_runtime': 0.4748, 'eval_samples_per_second': 1413.085, 'eval_steps_per_second': 58.966, 'epoch': 3.0}



  5%|▌         | 52/988 [04:31<00:52, 17.78it/s]   

{'loss': 0.2008, 'grad_norm': 1.9427447319030762, 'learning_rate': 1.2803970223325062e-05, 'epoch': 3.98}



[A
[A
[A
[A

[A[A                                         
                                                   
  5%|▌         | 52/988 [04:32<00:52, 17.78it/s]
[A

{'eval_loss': 0.5819849967956543, 'eval_f1': 0.7798867798867799, 'eval_roc_auc': 0.7856383988591031, 'eval_accuracy': 0.16095380029806258, 'eval_runtime': 0.4735, 'eval_samples_per_second': 1416.998, 'eval_steps_per_second': 59.13, 'epoch': 4.0}



  5%|▌         | 52/988 [05:03<00:52, 17.78it/s]   

{'loss': 0.1635, 'grad_norm': 1.539159893989563, 'learning_rate': 3.970223325062035e-07, 'epoch': 4.97}



[A
[A
[A
[A
[A

[A[A                                         
                                                   
  5%|▌         | 52/988 [05:07<00:52, 17.78it/s]
[A

{'eval_loss': 0.599375307559967, 'eval_f1': 0.7798488664987405, 'eval_roc_auc': 0.7867945833894436, 'eval_accuracy': 0.15350223546944858, 'eval_runtime': 0.4795, 'eval_samples_per_second': 1399.392, 'eval_steps_per_second': 58.395, 'epoch': 5.0}



100%|██████████| 2515/2515 [02:44<00:00, 15.27it/s]


{'train_runtime': 164.7329, 'train_samples_per_second': 183.145, 'train_steps_per_second': 15.267, 'train_loss': 0.24653972858937076, 'epoch': 5.0}
Subset Accuracy: 0.1583
Hamming Loss: 0.2635
Micro Precision: 0.6873
Micro Recall: 0.5309
Micro F1-Score: 0.5990
Precision per label: [0.8        0.7804878  0.60377358 0.71111111 0.66666667 0.61904762
 0.625      0.72916667]
Recall per label: [0.34285714 0.82051282 0.64       0.5        0.23809524 0.56521739
 0.47619048 0.59322034]
F1-Score per label: [0.48       0.8        0.62135922 0.58715596 0.35087719 0.59090909
 0.54054054 0.65420561]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [195]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lara.hassan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [200]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_12/camelbert_finetuned_epochs_5_eval_f1_0.7799_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_9-camelbert_finetuned_epochs_2_eval_f1_0.7812_greater_threshold_0.3-experiment-12_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 69.19 %
MACRO AVERAGE RECALL SCORE: 52.20 %
MACRO AVERAGE F1-SCORE: 57.81 %
MACRO AVERAGE ACCURACY: 73.65 %



In [201]:
trainer.predict(["الله اكبر"])

(array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 array([[0.9784671 , 0.98201376, 0.97339284, 0.97706646, 0.9714559 ,
         0.98421544, 0.98141783, 0.9826909 , 0.9749646 , 0.98521465,
         0.9819447 , 0.9868787 , 0.97414124, 0.9840936 , 0.98228765,
         0.97850823, 0.9845754 , 0.987375  ]], dtype=float32))

In [202]:
trainer.predict(["انا مصرى ياسطااااا"])

(array([[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1]]),
 array([[0.02100437, 0.56673807, 0.96826136, 0.6016521 , 0.6660625 ,
         0.7235049 , 0.4344013 , 0.5190795 , 0.01851105, 0.41750893,
         0.5469499 , 0.5705108 , 0.7168137 , 0.7305783 , 0.40751025,
         0.01428194, 0.30002728, 0.4107563 ]], dtype=float32))

In [203]:
trainer.predict(['بدى فتوش و بدك ثومية'])

(array([[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]]),
 array([[0.0186178 , 0.00944908, 0.05379965, 0.11636177, 0.87768656,
         0.0102486 , 0.91181   , 0.02546808, 0.01876109, 0.00818791,
         0.8321002 , 0.00912564, 0.02909076, 0.01317195, 0.8830344 ,
         0.01879707, 0.00575254, 0.01016966]], dtype=float32))

In [208]:
trainer.predict([' خى ما قصرت'])

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([[0.13660839, 0.01159736, 0.08344117, 0.15241033, 0.03704717,
         0.02442309, 0.02052779, 0.19329959, 0.10212548, 0.01141961,
         0.0226292 , 0.01036816, 0.07504072, 0.01890545, 0.02465686,
         0.20323272, 0.00656425, 0.0113756 ]], dtype=float32))

In [209]:
trainer.predict(['الحمد لله يا زلمة'])

(array([[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0]]),
 array([[0.0082516 , 0.78727776, 0.9410069 , 0.87997437, 0.9512329 ,
         0.84376645, 0.8782099 , 0.6583077 , 0.00854433, 0.36274344,
         0.9314625 , 0.71452844, 0.9380106 , 0.5837373 , 0.88602656,
         0.00919655, 0.32562777, 0.27338037]], dtype=float32))

In [210]:
trainer.predict(['الحمد لله  '])

(array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 array([[0.97029626, 0.98527133, 0.9717254 , 0.9797453 , 0.9780936 ,
         0.9874236 , 0.9840936 , 0.9809491 , 0.9636434 , 0.9873262 ,
         0.98481095, 0.98926485, 0.97879386, 0.9832145 , 0.98486924,
         0.9704086 , 0.98646784, 0.98844737]], dtype=float32))

In [211]:
trainer.predict(["كل زول ليه الزول بتاعه"])

(array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]),
 array([[0.0349467 , 0.03507867, 0.8701566 , 0.22695492, 0.07303239,
         0.05964694, 0.03890198, 0.17497347, 0.03704717, 0.0193451 ,
         0.06176271, 0.02640552, 0.09704755, 0.4174496 , 0.03725677,
         0.02992975, 0.01802074, 0.03015741]], dtype=float32))

### EXPERIMENT 13

In [71]:
# CHANGE DATA TO BE MORE BALANCED
import pandas as pd
from sklearn.utils import shuffle, resample

directory = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/"
dataset_path = directory + "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv"
output_path  = directory + "SORTED_multilabel_dataset.csv"
df = pd.read_csv(dataset_path)
label_columns = df.columns.difference(['tweet'])
df['binary_label'] = df[label_columns].astype(str).agg(''.join, axis=1)
sorted_df = df.sort_values(by='binary_label').reset_index(drop=True)
sorted_df.to_csv(output_path, index=False)

In [72]:
label_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait', 'Lebanon', 
                     'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar', 'Saudi_Arabia', 
                     'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

In [73]:
# FUNCTION USED TO KNOW THE BREAKDOWN OF EVERY DIALECT IN A CLASS
def get_class_insight(balanced_df):
    # Initialize a dictionary to store counts for each class (count of active labels)
    counts = {i: (balanced_df[label_columns].sum(axis=1) == i).sum() for i in range(19)}

    # Print the counts for each class to verify the balance
    print("Counts of rows by number of active labels:")
    for count, value in counts.items():
        print(f"Class with {count} active labels: {value}")

    label_counts_by_class = {}

    # Loop over each possible count of active labels (1 to number of label columns)
    for num_classes in range(1, len(label_columns) + 1):
        # Filter rows where the number of active labels equals num_classes
        subset = balanced_df[balanced_df[label_columns].sum(axis=1) == num_classes]
        
        # Count occurrences of each label being active within this subset
        label_counts = subset[label_columns].sum()
        
        # Store the results in the dictionary
        label_counts_by_class[num_classes] = label_counts.to_dict()

    # Display the results
    for num_classes, counts in label_counts_by_class.items():
        print(f"\nFor class group with {num_classes} active labels:")
        for label, count in counts.items():
            print(f"  {label}: {count} occurrences")


In [77]:
# FUNCTION USED TO SPLIT THE DATA INTO BALANCED CLASSES

def split_balanced_threshold(df, label_columns, threshold=750, uni_label=True):
    balanced_df = pd.DataFrame()
    for num_classes in range(1, len(label_columns) + 1):
        # Filter rows with exactly 'num_classes' active labels
        subset = df[df[label_columns].sum(axis=1) == num_classes]

        if num_classes == 1 and uni_label:
            # For the subset with only 1 active label, balance each label to have equal occurrences
            balanced_subset = pd.DataFrame()
            
            # Iterate over each label (dialect) and resample to match the threshold
            for label in label_columns:
                # Filter rows where the current dialect is active (set to 1)
                label_rows = subset[subset[label] == 1]
                
                # Downsample or upsample to match the threshold for this dialect
                if len(label_rows) > threshold//15:
                    label_rows = resample(label_rows, n_samples=threshold//15, random_state=42, replace=False)
                elif len(label_rows) < threshold//15:
                    label_rows = resample(label_rows, n_samples=threshold//15, random_state=42, replace=True)
                
                # Append the balanced rows for the current dialect to the balanced_subset DataFrame
                balanced_subset = pd.concat([balanced_subset, label_rows], ignore_index=True)
            
            # Add the balanced subset for one active label to the main balanced DataFrame
            balanced_df = pd.concat([balanced_df, balanced_subset], ignore_index=True)
        
        else:
            # For other cases, apply the general threshold limit to the subset
            if len(subset) > threshold:
                subset = shuffle(subset).head(threshold)
            balanced_df = pd.concat([balanced_df, subset], ignore_index=True)

    return balanced_df


In [78]:
# FUNCTION TO REMOVE UNDESIRED CLASSES
def modify_dataset_for_active_labels(df):                     
    mask = df[label_columns].sum(axis=1).isin([16, 17])
    df.loc[mask, label_columns] = 1  
    return df


In [79]:
df = pd.read_csv(output_path)
df.head()

Unnamed: 0,id,tweet,Algeria,Bahrain,Egypt,Iraq,Jordan,Kuwait,Lebanon,Libya,...,Palestine,Qatar,Saudi_Arabia,Sudan,Syria,Tunisia,UAE,Yemen,Computed,binary_label
0,10012,تقريبا كلام السيسي بدل يتكلم عربي بيتكلم انجليزي,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,no,00no000000000000000110012
1,10032,وای ایشالا که حالش خوبه,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,no,00no000000000000000110032
2,10041,اعجبني فيديو علي قوات مدعومه اماراتيا تابعه لنجل,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,no,00no000000000000000110041
3,10064,حداقل دیگه حسرت اینکه چرا نگفتم رو نداری,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,no,00no000000000000000110064
4,10087,مرعبه كلمه سامحني لان اغلب الاوقات يكون بعدها ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,no,00no000000000000000110087


In [80]:
get_class_insight(df)

Counts of rows by number of active labels:
Class with 0 active labels: 0
Class with 1 active labels: 41837
Class with 2 active labels: 3220
Class with 3 active labels: 2329
Class with 4 active labels: 2580
Class with 5 active labels: 1416
Class with 6 active labels: 1709
Class with 7 active labels: 1794
Class with 8 active labels: 1108
Class with 9 active labels: 536
Class with 10 active labels: 168
Class with 11 active labels: 137
Class with 12 active labels: 334
Class with 13 active labels: 273
Class with 14 active labels: 256
Class with 15 active labels: 532
Class with 16 active labels: 11
Class with 17 active labels: 26
Class with 18 active labels: 502

For class group with 1 active labels:
  Algeria: 2712 occurrences
  Bahrain: 417 occurrences
  Egypt: 10609 occurrences
  Iraq: 6026 occurrences
  Jordan: 762 occurrences
  Kuwait: 672 occurrences
  Lebanon: 1245 occurrences
  Libya: 2567 occurrences
  Morocco: 2213 occurrences
  Oman: 2077 occurrences
  Palestine: 794 occurrences
 

In [81]:
df = modify_dataset_for_active_labels(df)

In [82]:
get_class_insight(df)

Counts of rows by number of active labels:
Class with 0 active labels: 0
Class with 1 active labels: 41837
Class with 2 active labels: 3220
Class with 3 active labels: 2329
Class with 4 active labels: 2580
Class with 5 active labels: 1416
Class with 6 active labels: 1709
Class with 7 active labels: 1794
Class with 8 active labels: 1108
Class with 9 active labels: 536
Class with 10 active labels: 168
Class with 11 active labels: 137
Class with 12 active labels: 334
Class with 13 active labels: 273
Class with 14 active labels: 256
Class with 15 active labels: 532
Class with 16 active labels: 0
Class with 17 active labels: 0
Class with 18 active labels: 539

For class group with 1 active labels:
  Algeria: 2712 occurrences
  Bahrain: 417 occurrences
  Egypt: 10609 occurrences
  Iraq: 6026 occurrences
  Jordan: 762 occurrences
  Kuwait: 672 occurrences
  Lebanon: 1245 occurrences
  Libya: 2567 occurrences
  Morocco: 2213 occurrences
  Oman: 2077 occurrences
  Palestine: 794 occurrences
  Q

In [84]:
df = split_balanced_threshold(df, label_columns, threshold=500, uni_label=True)

In [85]:
get_class_insight(df)

Counts of rows by number of active labels:
Class with 0 active labels: 0
Class with 1 active labels: 594
Class with 2 active labels: 500
Class with 3 active labels: 500
Class with 4 active labels: 500
Class with 5 active labels: 500
Class with 6 active labels: 500
Class with 7 active labels: 500
Class with 8 active labels: 500
Class with 9 active labels: 500
Class with 10 active labels: 168
Class with 11 active labels: 137
Class with 12 active labels: 334
Class with 13 active labels: 273
Class with 14 active labels: 256
Class with 15 active labels: 500
Class with 16 active labels: 0
Class with 17 active labels: 0
Class with 18 active labels: 500

For class group with 1 active labels:
  Algeria: 33 occurrences
  Bahrain: 33 occurrences
  Egypt: 33 occurrences
  Iraq: 33 occurrences
  Jordan: 33 occurrences
  Kuwait: 33 occurrences
  Lebanon: 33 occurrences
  Libya: 33 occurrences
  Morocco: 33 occurrences
  Oman: 33 occurrences
  Palestine: 33 occurrences
  Qatar: 33 occurrences
  Saudi

In [86]:
df.to_csv(directory + " BALANCED_NEW_DATASET_500.csv", index=False)

In [69]:
dataset_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/ BALANCED_NEW_DATASET_750.csv"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=13
)
trainer.train(
    num_train_epochs=2,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

                                                 
 50%|█████     | 374/748 [00:32<00:27, 13.65it/s]

{'eval_loss': 0.4429030120372772, 'eval_f1': 0.7262548011197187, 'eval_roc_auc': 0.7910349145094046, 'eval_accuracy': 0.15060240963855423, 'eval_runtime': 0.7421, 'eval_samples_per_second': 1342.187, 'eval_steps_per_second': 56.598, 'epoch': 1.0}


 67%|██████▋   | 501/748 [00:43<00:19, 12.81it/s]

{'loss': 0.4215, 'grad_norm': 4.750375270843506, 'learning_rate': 5e-05, 'epoch': 1.34}


                                                 
100%|██████████| 748/748 [01:05<00:00, 13.22it/s]

{'eval_loss': 0.4512268900871277, 'eval_f1': 0.7295721119192713, 'eval_roc_auc': 0.7940113323604024, 'eval_accuracy': 0.16967871485943775, 'eval_runtime': 0.7301, 'eval_samples_per_second': 1364.209, 'eval_steps_per_second': 57.527, 'epoch': 2.0}


100%|██████████| 748/748 [01:08<00:00, 10.96it/s]


{'train_runtime': 68.2553, 'train_samples_per_second': 262.397, 'train_steps_per_second': 10.959, 'train_loss': 0.4062513361640155, 'epoch': 2.0}
Subset Accuracy: 0.1917
Hamming Loss: 0.2740
Micro Precision: 0.6768
Micro Recall: 0.5000
Micro F1-Score: 0.5751
Precision per label: [0.8        0.89655172 0.58823529 0.66666667 0.66666667 0.55813953
 0.64705882 0.73809524]
Recall per label: [0.34285714 0.66666667 0.6        0.5        0.28571429 0.52173913
 0.52380952 0.52542373]
F1-Score per label: [0.48       0.76470588 0.59405941 0.57142857 0.4        0.53932584
 0.57894737 0.61386139]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [70]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_13/camelbert_finetuned_epochs_2_eval_f1_0.7296_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_8-camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3-experiment-13_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 69.52 %
MACRO AVERAGE RECALL SCORE: 49.58 %
MACRO AVERAGE F1-SCORE: 56.78 %
MACRO AVERAGE ACCURACY: 72.60 %



### EXPERIMENT 14

In [88]:
dataset_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/ BALANCED_NEW_DATASET_500.csv"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=14
)
trainer.train(
    num_train_epochs=5,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

 20%|█▉        | 253/1270 [00:19<01:17, 13.17it/s]
 20%|██        | 254/1270 [00:20<01:17, 13.17it/s]

{'eval_loss': 0.41958171129226685, 'eval_f1': 0.7857666313818092, 'eval_roc_auc': 0.7992212978015347, 'eval_accuracy': 0.17429837518463812, 'eval_runtime': 0.514, 'eval_samples_per_second': 1317.176, 'eval_steps_per_second': 56.423, 'epoch': 1.0}


 39%|███▉      | 501/1270 [00:41<00:58, 13.14it/s]

{'loss': 0.3978, 'grad_norm': 2.486278533935547, 'learning_rate': 5e-05, 'epoch': 1.97}


 40%|███▉      | 507/1270 [00:41<00:58, 13.04it/s]
 40%|████      | 508/1270 [00:42<00:58, 13.04it/s]

{'eval_loss': 0.43696141242980957, 'eval_f1': 0.7823587385019711, 'eval_roc_auc': 0.7964695693203278, 'eval_accuracy': 0.16248153618906944, 'eval_runtime': 0.4917, 'eval_samples_per_second': 1376.887, 'eval_steps_per_second': 58.98, 'epoch': 2.0}


 60%|█████▉    | 761/1270 [01:04<00:40, 12.70it/s]
 60%|██████    | 762/1270 [01:04<00:39, 12.70it/s]

{'eval_loss': 0.4539303481578827, 'eval_f1': 0.7873862732974118, 'eval_roc_auc': 0.8072853133004487, 'eval_accuracy': 0.16691285081240767, 'eval_runtime': 0.502, 'eval_samples_per_second': 1348.666, 'eval_steps_per_second': 57.772, 'epoch': 3.0}


 79%|███████▉  | 1001/1270 [01:25<00:21, 12.52it/s]

{'loss': 0.2821, 'grad_norm': 1.5508099794387817, 'learning_rate': 1.7532467532467535e-05, 'epoch': 3.94}


 80%|███████▉  | 1015/1270 [01:26<00:19, 12.87it/s]
 80%|████████  | 1016/1270 [01:27<00:19, 12.87it/s]

{'eval_loss': 0.486063688993454, 'eval_f1': 0.7885090785099309, 'eval_roc_auc': 0.8056512106004009, 'eval_accuracy': 0.17872968980797638, 'eval_runtime': 0.5283, 'eval_samples_per_second': 1281.507, 'eval_steps_per_second': 54.895, 'epoch': 4.0}


100%|█████████▉| 1269/1270 [01:49<00:00, 13.10it/s]
100%|██████████| 1270/1270 [01:52<00:00, 13.10it/s]

{'eval_loss': 0.4963809847831726, 'eval_f1': 0.7906378600823045, 'eval_roc_auc': 0.8081659234473502, 'eval_accuracy': 0.17725258493353027, 'eval_runtime': 0.5111, 'eval_samples_per_second': 1324.562, 'eval_steps_per_second': 56.739, 'epoch': 5.0}


100%|██████████| 1270/1270 [01:55<00:00, 10.99it/s]


{'train_runtime': 115.591, 'train_samples_per_second': 263.213, 'train_steps_per_second': 10.987, 'train_loss': 0.31099345590185934, 'epoch': 5.0}
Subset Accuracy: 0.1583
Hamming Loss: 0.2844
Micro Precision: 0.6554
Micro Recall: 0.4916
Micro F1-Score: 0.5618
Precision per label: [0.81818182 0.90322581 0.50909091 0.63265306 0.55       0.55813953
 0.61538462 0.8       ]
Recall per label: [0.25714286 0.71794872 0.56       0.484375   0.26190476 0.52173913
 0.38095238 0.61016949]
F1-Score per label: [0.39130435 0.8        0.53333333 0.54867257 0.35483871 0.53932584
 0.47058824 0.69230769]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [89]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_14/camelbert_finetuned_epochs_5_eval_f1_0.7906_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_8-camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3-experiment-14_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 67.33 %
MACRO AVERAGE RECALL SCORE: 47.43 %
MACRO AVERAGE F1-SCORE: 54.13 %
MACRO AVERAGE ACCURACY: 71.56 %



### EXPERIMENT 15

In [90]:
dataset_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/ BALANCED_NEW_DATASET_500.csv"
dev_path = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=14
)
trainer.train(
    num_train_epochs=20,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

  5%|▌         | 254/5080 [00:19<06:05, 13.19it/s]
  5%|▌         | 254/5080 [00:20<06:05, 13.19it/s]

{'eval_loss': 0.41958171129226685, 'eval_f1': 0.7857666313818092, 'eval_roc_auc': 0.7992212978015347, 'eval_accuracy': 0.17429837518463812, 'eval_runtime': 0.5048, 'eval_samples_per_second': 1340.993, 'eval_steps_per_second': 57.443, 'epoch': 1.0}


 10%|▉         | 502/5080 [00:41<05:50, 13.05it/s]

{'loss': 0.3978, 'grad_norm': 2.486278533935547, 'learning_rate': 5e-05, 'epoch': 1.97}


 10%|█         | 508/5080 [00:42<05:39, 13.46it/s]
 10%|█         | 508/5080 [00:42<05:39, 13.46it/s]

{'eval_loss': 0.4369806945323944, 'eval_f1': 0.7821660234830445, 'eval_roc_auc': 0.7962522410096933, 'eval_accuracy': 0.16100443131462333, 'eval_runtime': 0.507, 'eval_samples_per_second': 1335.188, 'eval_steps_per_second': 57.194, 'epoch': 2.0}


 15%|█▌        | 762/5080 [01:04<05:22, 13.40it/s]
 15%|█▌        | 762/5080 [01:04<05:22, 13.40it/s]

{'eval_loss': 0.46652302145957947, 'eval_f1': 0.7806145301442264, 'eval_roc_auc': 0.8020244655851325, 'eval_accuracy': 0.16395864106351551, 'eval_runtime': 0.4911, 'eval_samples_per_second': 1378.409, 'eval_steps_per_second': 59.046, 'epoch': 3.0}


 20%|█▉        | 1002/5080 [01:25<05:15, 12.94it/s]

{'loss': 0.2916, 'grad_norm': 2.4166393280029297, 'learning_rate': 4.4541484716157205e-05, 'epoch': 3.94}


 20%|██        | 1016/5080 [01:26<05:00, 13.54it/s]
 20%|██        | 1016/5080 [01:27<05:00, 13.54it/s]

{'eval_loss': 0.498250812292099, 'eval_f1': 0.7837244639540774, 'eval_roc_auc': 0.8001604489455602, 'eval_accuracy': 0.17134416543574593, 'eval_runtime': 0.5226, 'eval_samples_per_second': 1295.344, 'eval_steps_per_second': 55.487, 'epoch': 4.0}


 25%|██▌       | 1270/5080 [01:48<04:40, 13.61it/s]
 25%|██▌       | 1270/5080 [01:49<04:40, 13.61it/s]

{'eval_loss': 0.520680844783783, 'eval_f1': 0.7880685276980607, 'eval_roc_auc': 0.8067535362052624, 'eval_accuracy': 0.1757754800590842, 'eval_runtime': 0.5062, 'eval_samples_per_second': 1337.525, 'eval_steps_per_second': 57.294, 'epoch': 5.0}


 30%|██▉       | 1502/5080 [02:09<04:34, 13.04it/s]

{'loss': 0.2098, 'grad_norm': 1.620110034942627, 'learning_rate': 3.9082969432314415e-05, 'epoch': 5.91}


 30%|███       | 1524/5080 [02:10<04:24, 13.46it/s]
 30%|███       | 1524/5080 [02:11<04:24, 13.46it/s]

{'eval_loss': 0.554932713508606, 'eval_f1': 0.7831502465184672, 'eval_roc_auc': 0.8015938122357977, 'eval_accuracy': 0.15952732644017725, 'eval_runtime': 0.4949, 'eval_samples_per_second': 1367.842, 'eval_steps_per_second': 58.593, 'epoch': 6.0}


 35%|███▌      | 1778/5080 [02:33<04:04, 13.50it/s]
 35%|███▌      | 1778/5080 [02:33<04:04, 13.50it/s]

{'eval_loss': 0.5746973752975464, 'eval_f1': 0.7826010852441799, 'eval_roc_auc': 0.8020177751854611, 'eval_accuracy': 0.15214180206794684, 'eval_runtime': 0.5001, 'eval_samples_per_second': 1353.702, 'eval_steps_per_second': 57.987, 'epoch': 7.0}


 39%|███▉      | 2002/5080 [02:53<03:56, 13.03it/s]

{'loss': 0.1568, 'grad_norm': 1.9167410135269165, 'learning_rate': 3.362445414847162e-05, 'epoch': 7.87}


 40%|████      | 2032/5080 [02:55<03:45, 13.50it/s]
 40%|████      | 2032/5080 [02:55<03:45, 13.50it/s]

{'eval_loss': 0.5851396918296814, 'eval_f1': 0.7846004757290107, 'eval_roc_auc': 0.8044437579777531, 'eval_accuracy': 0.14180206794682423, 'eval_runtime': 0.5022, 'eval_samples_per_second': 1347.95, 'eval_steps_per_second': 57.741, 'epoch': 8.0}


 45%|████▌     | 2286/5080 [03:18<03:30, 13.26it/s]
 45%|████▌     | 2286/5080 [03:19<03:30, 13.26it/s]

{'eval_loss': 0.6327815651893616, 'eval_f1': 0.7814134964972954, 'eval_roc_auc': 0.8019523244312996, 'eval_accuracy': 0.1447562776957164, 'eval_runtime': 0.5027, 'eval_samples_per_second': 1346.74, 'eval_steps_per_second': 57.689, 'epoch': 9.0}


 49%|████▉     | 2502/5080 [03:39<03:27, 12.40it/s]

{'loss': 0.116, 'grad_norm': 2.9025323390960693, 'learning_rate': 2.816593886462882e-05, 'epoch': 9.84}


 50%|█████     | 2540/5080 [03:42<03:09, 13.40it/s]
 50%|█████     | 2540/5080 [03:42<03:09, 13.40it/s]

{'eval_loss': 0.626867949962616, 'eval_f1': 0.7873134328358209, 'eval_roc_auc': 0.8076503239579264, 'eval_accuracy': 0.16248153618906944, 'eval_runtime': 0.4999, 'eval_samples_per_second': 1354.185, 'eval_steps_per_second': 58.008, 'epoch': 10.0}


 55%|█████▌    | 2794/5080 [04:05<02:50, 13.41it/s]
 55%|█████▌    | 2794/5080 [04:05<02:50, 13.41it/s]

{'eval_loss': 0.6606573462486267, 'eval_f1': 0.7840213049267644, 'eval_roc_auc': 0.8044923182147118, 'eval_accuracy': 0.14771048744460857, 'eval_runtime': 0.5358, 'eval_samples_per_second': 1263.519, 'eval_steps_per_second': 54.124, 'epoch': 11.0}


 59%|█████▉    | 3002/5080 [04:24<02:48, 12.32it/s]

{'loss': 0.0863, 'grad_norm': 0.8163240551948547, 'learning_rate': 2.2707423580786028e-05, 'epoch': 11.81}


 60%|██████    | 3048/5080 [04:28<02:41, 12.57it/s]
 60%|██████    | 3048/5080 [04:28<02:41, 12.57it/s]

{'eval_loss': 0.6866637468338013, 'eval_f1': 0.7799626301272355, 'eval_roc_auc': 0.8008625844553279, 'eval_accuracy': 0.1432791728212703, 'eval_runtime': 0.5534, 'eval_samples_per_second': 1223.363, 'eval_steps_per_second': 52.404, 'epoch': 12.0}


 65%|██████▌   | 3302/5080 [04:51<02:12, 13.38it/s]
 65%|██████▌   | 3302/5080 [04:52<02:12, 13.38it/s]

{'eval_loss': 0.7243456244468689, 'eval_f1': 0.7818601348952786, 'eval_roc_auc': 0.8024372413091161, 'eval_accuracy': 0.14918759231905465, 'eval_runtime': 0.5135, 'eval_samples_per_second': 1318.463, 'eval_steps_per_second': 56.478, 'epoch': 13.0}


 69%|██████▉   | 3502/5080 [05:10<02:08, 12.29it/s]

{'loss': 0.0639, 'grad_norm': 0.9643259048461914, 'learning_rate': 1.7248908296943234e-05, 'epoch': 13.78}


 70%|███████   | 3556/5080 [05:15<01:54, 13.26it/s]
 70%|███████   | 3556/5080 [05:15<01:54, 13.26it/s]

{'eval_loss': 0.7442327737808228, 'eval_f1': 0.7813854728833053, 'eval_roc_auc': 0.8017394929466736, 'eval_accuracy': 0.14771048744460857, 'eval_runtime': 0.515, 'eval_samples_per_second': 1314.548, 'eval_steps_per_second': 56.31, 'epoch': 14.0}


 75%|███████▌  | 3810/5080 [05:37<01:34, 13.47it/s]
 75%|███████▌  | 3810/5080 [05:38<01:34, 13.47it/s]

{'eval_loss': 0.7518910765647888, 'eval_f1': 0.7786892634995042, 'eval_roc_auc': 0.8007468734446191, 'eval_accuracy': 0.1536189069423929, 'eval_runtime': 0.5041, 'eval_samples_per_second': 1343.01, 'eval_steps_per_second': 57.529, 'epoch': 15.0}


 79%|███████▉  | 4002/5080 [05:55<01:25, 12.57it/s]

{'loss': 0.0479, 'grad_norm': 0.5296791195869446, 'learning_rate': 1.1790393013100438e-05, 'epoch': 15.75}


 80%|████████  | 4064/5080 [06:00<01:15, 13.55it/s]
 80%|████████  | 4064/5080 [06:01<01:15, 13.55it/s]

{'eval_loss': 0.7658830881118774, 'eval_f1': 0.7787119856887299, 'eval_roc_auc': 0.8001246662751869, 'eval_accuracy': 0.1447562776957164, 'eval_runtime': 0.4997, 'eval_samples_per_second': 1354.687, 'eval_steps_per_second': 58.029, 'epoch': 16.0}


 85%|████████▌ | 4318/5080 [06:24<01:02, 12.26it/s]
 85%|████████▌ | 4318/5080 [06:24<01:02, 12.26it/s]

{'eval_loss': 0.7807112336158752, 'eval_f1': 0.779446354260901, 'eval_roc_auc': 0.8017343928879078, 'eval_accuracy': 0.14771048744460857, 'eval_runtime': 0.6318, 'eval_samples_per_second': 1071.517, 'eval_steps_per_second': 45.9, 'epoch': 17.0}


 89%|████████▊ | 4502/5080 [06:41<00:44, 12.90it/s]

{'loss': 0.0383, 'grad_norm': 2.551100254058838, 'learning_rate': 6.342794759825328e-06, 'epoch': 17.72}


 90%|█████████ | 4572/5080 [06:47<00:37, 13.57it/s]
 90%|█████████ | 4572/5080 [06:47<00:37, 13.57it/s]

{'eval_loss': 0.7894988656044006, 'eval_f1': 0.7802996321880327, 'eval_roc_auc': 0.801847005475818, 'eval_accuracy': 0.1536189069423929, 'eval_runtime': 0.4988, 'eval_samples_per_second': 1357.157, 'eval_steps_per_second': 58.135, 'epoch': 18.0}


 95%|█████████▌| 4826/5080 [07:09<00:18, 13.68it/s]
 95%|█████████▌| 4826/5080 [07:10<00:18, 13.68it/s]

{'eval_loss': 0.7982764840126038, 'eval_f1': 0.7787069508137757, 'eval_roc_auc': 0.8005562244736575, 'eval_accuracy': 0.15066469719350073, 'eval_runtime': 0.5016, 'eval_samples_per_second': 1349.693, 'eval_steps_per_second': 57.816, 'epoch': 19.0}


 98%|█████████▊| 5002/5080 [07:26<00:06, 12.83it/s]

{'loss': 0.0324, 'grad_norm': 0.5551343560218811, 'learning_rate': 8.842794759825327e-07, 'epoch': 19.69}


100%|██████████| 5080/5080 [07:32<00:00, 13.52it/s]
100%|██████████| 5080/5080 [07:35<00:00, 13.52it/s]

{'eval_loss': 0.799960196018219, 'eval_f1': 0.7806718160589187, 'eval_roc_auc': 0.8022875573263056, 'eval_accuracy': 0.155096011816839, 'eval_runtime': 0.5184, 'eval_samples_per_second': 1305.893, 'eval_steps_per_second': 55.939, 'epoch': 20.0}


100%|██████████| 5080/5080 [07:41<00:00, 11.01it/s]


{'train_runtime': 461.3046, 'train_samples_per_second': 263.817, 'train_steps_per_second': 11.012, 'train_loss': 0.14231886192569582, 'epoch': 20.0}
Subset Accuracy: 0.1333
Hamming Loss: 0.2969
Micro Precision: 0.6473
Micro Recall: 0.4382
Micro F1-Score: 0.5226
Precision per label: [0.76923077 0.88       0.5        0.65853659 0.46153846 0.61764706
 0.53333333 0.74      ]
Recall per label: [0.28571429 0.56410256 0.5        0.421875   0.14285714 0.45652174
 0.38095238 0.62711864]
F1-Score per label: [0.41666667 0.6875     0.5        0.51428571 0.21818182 0.525
 0.44444444 0.67889908]
{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(8)}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [92]:
import os

scorer_script = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_14/camelbert_finetuned_epochs_20_eval_f1_0.7881_greater_threshold_0.3/-home-lara.hassan-Documents-Cross-Country-Dialectal-Arabic-Identification-exp_8-camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3-experiment-14_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 64.50 %
MACRO AVERAGE RECALL SCORE: 42.24 %
MACRO AVERAGE F1-SCORE: 49.81 %
MACRO AVERAGE ACCURACY: 70.31 %



In [93]:
trainer.predict(["الله اكبر"])

(array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 array([[0.9507779 , 0.98521465, 0.97288233, 0.97754294, 0.98201376,
         0.9885804 , 0.98475236, 0.96628344, 0.9546474 , 0.98197925,
         0.9845754 , 0.9857729 , 0.9827572 , 0.97788346, 0.98521465,
         0.94539934, 0.98340684, 0.9881765 ]], dtype=float32))

In [94]:
trainer.predict(["انا مصرى ياسطااااا"])

(array([[0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1]]),
 array([[0.00941259, 0.624179  , 0.7916601 , 0.4977589 , 0.6311397 ,
         0.73278314, 0.33133605, 0.26999873, 0.00754792, 0.4181622 ,
         0.47498128, 0.66093874, 0.86025184, 0.3869372 , 0.33654907,
         0.00871138, 0.32637876, 0.47081262]], dtype=float32))

## Experiment 16 Mekky

In [6]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/balanced_multilabel_dataset_500.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_16/marbert_finetuned_epochs_2_eval_f1_0.7607_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=17
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24
)
trainer.evaluate(dev_path=dev_path)

                                                  
 10%|█         | 252/2520 [00:22<03:11, 11.85it/s]

{'eval_loss': 0.467267781496048, 'eval_f1': 0.7741051549922932, 'eval_roc_auc': 0.7878222018813346, 'eval_accuracy': 0.16542473919523099, 'eval_runtime': 0.5593, 'eval_samples_per_second': 1199.706, 'eval_steps_per_second': 50.062, 'epoch': 1.0}


 20%|█▉        | 502/2520 [00:49<03:01, 11.11it/s]

{'loss': 0.4168, 'grad_norm': 6.076703071594238, 'learning_rate': 5e-05, 'epoch': 1.98}


                                                  
 20%|██        | 504/2520 [00:49<02:55, 11.50it/s]

{'eval_loss': 0.47096696496009827, 'eval_f1': 0.7664578147798472, 'eval_roc_auc': 0.7806076513088027, 'eval_accuracy': 0.12667660208643816, 'eval_runtime': 0.515, 'eval_samples_per_second': 1302.944, 'eval_steps_per_second': 54.37, 'epoch': 2.0}


                                                  
 30%|███       | 756/2520 [01:15<02:28, 11.86it/s]

{'eval_loss': 0.5002021789550781, 'eval_f1': 0.772990812594773, 'eval_roc_auc': 0.7912177076066527, 'eval_accuracy': 0.14008941877794337, 'eval_runtime': 0.5085, 'eval_samples_per_second': 1319.577, 'eval_steps_per_second': 55.064, 'epoch': 3.0}


 40%|███▉      | 1002/2520 [01:41<02:14, 11.28it/s]

{'loss': 0.3131, 'grad_norm': 3.2008445262908936, 'learning_rate': 3.7673267326732673e-05, 'epoch': 3.97}


                                                   
 40%|████      | 1008/2520 [01:42<02:08, 11.81it/s]

{'eval_loss': 0.5147558450698853, 'eval_f1': 0.7785293097467916, 'eval_roc_auc': 0.7934895058052991, 'eval_accuracy': 0.15052160953800298, 'eval_runtime': 0.5046, 'eval_samples_per_second': 1329.877, 'eval_steps_per_second': 55.494, 'epoch': 4.0}


                                                   
 50%|█████     | 1260/2520 [02:09<01:46, 11.80it/s]

{'eval_loss': 0.544553816318512, 'eval_f1': 0.7688691564377121, 'eval_roc_auc': 0.7845958507551555, 'eval_accuracy': 0.14307004470938897, 'eval_runtime': 0.5047, 'eval_samples_per_second': 1329.552, 'eval_steps_per_second': 55.481, 'epoch': 5.0}


 60%|█████▉    | 1501/2520 [02:34<01:30, 11.22it/s]

{'loss': 0.2126, 'grad_norm': 12.227588653564453, 'learning_rate': 2.5321782178217822e-05, 'epoch': 5.95}


                                                   
 60%|██████    | 1512/2520 [02:35<01:28, 11.41it/s]

{'eval_loss': 0.5832158327102661, 'eval_f1': 0.7710987601539119, 'eval_roc_auc': 0.784712128920743, 'eval_accuracy': 0.14605067064083457, 'eval_runtime': 0.5035, 'eval_samples_per_second': 1332.608, 'eval_steps_per_second': 55.608, 'epoch': 6.0}


                                                   
 70%|███████   | 1764/2520 [03:02<01:03, 11.87it/s]

{'eval_loss': 0.6284805536270142, 'eval_f1': 0.7659278574532812, 'eval_roc_auc': 0.7815790613766535, 'eval_accuracy': 0.12965722801788376, 'eval_runtime': 0.5013, 'eval_samples_per_second': 1338.444, 'eval_steps_per_second': 55.852, 'epoch': 7.0}


 79%|███████▉  | 2002/2520 [03:26<00:45, 11.33it/s]

{'loss': 0.1538, 'grad_norm': 1.6658316850662231, 'learning_rate': 1.2945544554455447e-05, 'epoch': 7.94}


                                                   
 80%|████████  | 2016/2520 [03:28<00:43, 11.71it/s]

{'eval_loss': 0.6430134177207947, 'eval_f1': 0.7737251512532412, 'eval_roc_auc': 0.7884810466867804, 'eval_accuracy': 0.15648286140089418, 'eval_runtime': 0.5069, 'eval_samples_per_second': 1323.688, 'eval_steps_per_second': 55.236, 'epoch': 8.0}


                                                   
 90%|█████████ | 2268/2520 [03:55<00:21, 11.79it/s]

{'eval_loss': 0.6525330543518066, 'eval_f1': 0.7699037620297463, 'eval_roc_auc': 0.7861548168653606, 'eval_accuracy': 0.12965722801788376, 'eval_runtime': 0.5062, 'eval_samples_per_second': 1325.542, 'eval_steps_per_second': 55.313, 'epoch': 9.0}


 99%|█████████▉| 2502/2520 [04:19<00:01, 11.29it/s]

{'loss': 0.1132, 'grad_norm': 1.9419139623641968, 'learning_rate': 5.693069306930693e-07, 'epoch': 9.92}


                                                   
100%|██████████| 2520/2520 [04:25<00:00, 11.81it/s]

{'eval_loss': 0.6510425209999084, 'eval_f1': 0.7713090465400035, 'eval_roc_auc': 0.7870727950749379, 'eval_accuracy': 0.13710879284649777, 'eval_runtime': 0.5029, 'eval_samples_per_second': 1334.228, 'eval_steps_per_second': 55.676, 'epoch': 10.0}


100%|██████████| 2520/2520 [04:30<00:00,  9.31it/s]


{'train_runtime': 270.5929, 'train_samples_per_second': 222.992, 'train_steps_per_second': 9.313, 'train_loss': 0.24082989210174197, 'epoch': 10.0}
Subset Accuracy: 0.1667
Hamming Loss: 0.2510
Micro Precision: 0.7061
Micro Recall: 0.5534
Micro F1-Score: 0.6205
Precision per label: [0.66666667 0.90909091 0.6        0.72222222 0.77777778 0.59259259
 0.6        0.84444444]
Recall per label: [0.17142857 0.76923077 0.78       0.609375   0.16666667 0.69565217
 0.28571429 0.6440678 ]
F1-Score per label: [0.27272727 0.83333333 0.67826087 0.66101695 0.2745098  0.64
 0.38709677 0.73076923]
{0, 1, 2, 3, 4, 5, 6}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [8]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_17/marbert_finetuned_epochs_10_eval_f1_0.7785_greater_threshold_0.3/-home-ali.mekky-Documents-NLP-Project-Cross-Country-Dialectal-Arabic-Identification-exp_16-marbert_finetuned_epochs_2_eval_f1_0.7607_greater_threshold_0.3-experiment-17_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 71.41 %
MACRO AVERAGE RECALL SCORE: 51.53 %
MACRO AVERAGE F1-SCORE: 55.97 %
MACRO AVERAGE ACCURACY: 74.90 %



## EXP 18

In [9]:
import pandas as pd
from sklearn.utils import shuffle

dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
df = pd.read_csv(dataset_path)

# Load the dataset
label_columns = df.columns[2:-1]  # Excludes 'id', 'tweet', and 'Computed' columns
df[label_columns] = df[label_columns].astype(int)  # Ensure labels are integers

threshold = 500  
balanced_df = pd.DataFrame()

# Iterate over possible combinations of active labels    model_name="/home/lara.hassan/Documents/Cross-Country-Dialectal-Arabic-Identification/exp_8/camelbert_finetuned_epochs_2_eval_f1_0.7694_greater_threshold_0.3",

for num_classes in range(1, len(label_columns) + 1):
    subset = df[df[label_columns].sum(axis=1) == num_classes]  # Filter rows with num_classes active labels
    
    # Shuffle and sample the subset if it exceeds the threshold
    if len(subset) > threshold:
        subset = shuffle(subset).head(threshold)
    
    # Append sampled subset to the balanced dataset
    balanced_df = pd.concat([balanced_df, subset], ignore_index=True)

# Shuffle the final balanced DataFrame and save it to a new CSV
balanced_df = shuffle(balanced_df).reset_index(drop=True)
balanced_df.to_csv('balanced_multilabel_dataset_lr_' + str(threshold) + '.csv', index=False)

print("Balanced dataset created and saved as 'balanced_multilabel_dataset_500.csv'")

label_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait', 'Lebanon', 
                 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar', 'Saudi_Arabia', 
                 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

# Initialize a dictionary to store counts
counts = {}

# Loop to count rows where the sum of 1s in label columns equals i (from 0 to 18)
for i in range(19):
    counts[i] = (balanced_df[label_columns].sum(axis=1) == i).sum()

Balanced dataset created and saved as 'balanced_multilabel_dataset_500.csv'


In [21]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-ca",
    labels=labels,
    threshold=0.3,
    exp_num=18
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

  0%|          | 17/22040 [07:48<28:04, 13.07it/s] 

{'loss': 0.5757, 'grad_norm': 1.2024204730987549, 'learning_rate': 4.99e-05, 'epoch': 0.23}



  0%|          | 17/22040 [08:27<28:04, 13.07it/s] 

{'loss': 0.4805, 'grad_norm': 1.5924893617630005, 'learning_rate': 4.884168987929434e-05, 'epoch': 0.45}



  0%|          | 17/22040 [09:05<28:04, 13.07it/s]  

{'loss': 0.4364, 'grad_norm': 1.4749375581741333, 'learning_rate': 4.768105849582173e-05, 'epoch': 0.68}



  0%|          | 17/22040 [09:44<28:04, 13.07it/s]  

{'loss': 0.3985, 'grad_norm': 2.038799524307251, 'learning_rate': 4.652042711234912e-05, 'epoch': 0.91}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

  0%|          | 17/22040 [10:04<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.3685016334056854, 'eval_f1': 0.7405778894472361, 'eval_roc_auc': 0.8159436906682538, 'eval_accuracy': 0.08133401395269696, 'eval_runtime': 5.0018, 'eval_samples_per_second': 1174.988, 'eval_steps_per_second': 48.983, 'epoch': 1.0}



  0%|          | 17/22040 [10:31<28:04, 13.07it/s]  

{'loss': 0.3649, 'grad_norm': 1.7188373804092407, 'learning_rate': 4.535979572887651e-05, 'epoch': 1.13}



  0%|          | 17/22040 [11:13<28:04, 13.07it/s]  

{'loss': 0.3397, 'grad_norm': 1.6450800895690918, 'learning_rate': 4.420380687093779e-05, 'epoch': 1.36}



  0%|          | 17/22040 [11:55<28:04, 13.07it/s]  

{'loss': 0.3251, 'grad_norm': 1.400956392288208, 'learning_rate': 4.3043175487465184e-05, 'epoch': 1.59}



  0%|          | 17/22040 [12:36<28:04, 13.07it/s]  

{'loss': 0.3164, 'grad_norm': 1.8205997943878174, 'learning_rate': 4.188254410399257e-05, 'epoch': 1.81}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

  0%|          | 17/22040 [13:16<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.30282822251319885, 'eval_f1': 0.7869438855354348, 'eval_roc_auc': 0.8536088198945435, 'eval_accuracy': 0.12183086608814021, 'eval_runtime': 5.059, 'eval_samples_per_second': 1161.685, 'eval_steps_per_second': 48.428, 'epoch': 2.0}



  0%|          | 17/22040 [13:26<28:04, 13.07it/s]  

{'loss': 0.2967, 'grad_norm': 1.816288948059082, 'learning_rate': 4.0721912720519964e-05, 'epoch': 2.04}



  0%|          | 17/22040 [14:07<28:04, 13.07it/s]  

{'loss': 0.26, 'grad_norm': 1.423289179801941, 'learning_rate': 3.956128133704736e-05, 'epoch': 2.27}



  0%|          | 17/22040 [14:49<28:04, 13.07it/s]  

{'loss': 0.2553, 'grad_norm': 2.1077473163604736, 'learning_rate': 3.8400649953574744e-05, 'epoch': 2.5}



  0%|          | 17/22040 [15:31<28:04, 13.07it/s]  

{'loss': 0.2498, 'grad_norm': 1.6912955045700073, 'learning_rate': 3.724001857010214e-05, 'epoch': 2.72}



  0%|          | 17/22040 [16:12<28:04, 13.07it/s]  

{'loss': 0.2472, 'grad_norm': 1.5335619449615479, 'learning_rate': 3.6079387186629524e-05, 'epoch': 2.95}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

  0%|          | 17/22040 [16:27<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.2689467966556549, 'eval_f1': 0.8189405034957306, 'eval_roc_auc': 0.873100242242893, 'eval_accuracy': 0.16590096988259315, 'eval_runtime': 4.7482, 'eval_samples_per_second': 1237.735, 'eval_steps_per_second': 51.599, 'epoch': 3.0}



  0%|          | 17/22040 [16:58<28:04, 13.07it/s]  

{'loss': 0.2104, 'grad_norm': 1.7322285175323486, 'learning_rate': 3.4918755803156924e-05, 'epoch': 3.18}



  0%|          | 17/22040 [17:35<28:04, 13.07it/s]  

{'loss': 0.1963, 'grad_norm': 1.6607365608215332, 'learning_rate': 3.376044568245125e-05, 'epoch': 3.4}



  0%|          | 17/22040 [18:12<28:04, 13.07it/s]  

{'loss': 0.198, 'grad_norm': 1.6215678453445435, 'learning_rate': 3.2599814298978647e-05, 'epoch': 3.63}



  0%|          | 17/22040 [18:49<28:04, 13.07it/s]  

{'loss': 0.197, 'grad_norm': 1.16743004322052, 'learning_rate': 3.143918291550604e-05, 'epoch': 3.86}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

  0%|          | 17/22040 [19:16<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.248249351978302, 'eval_f1': 0.8360323886639676, 'eval_roc_auc': 0.8859395341354289, 'eval_accuracy': 0.19567806704100732, 'eval_runtime': 4.4819, 'eval_samples_per_second': 1311.265, 'eval_steps_per_second': 54.664, 'epoch': 4.0}



  0%|          | 17/22040 [19:32<28:04, 13.07it/s]  

{'loss': 0.1782, 'grad_norm': 1.5679394006729126, 'learning_rate': 3.0278551532033426e-05, 'epoch': 4.08}



  0%|          | 17/22040 [20:09<28:04, 13.07it/s]  

{'loss': 0.1531, 'grad_norm': 2.017313241958618, 'learning_rate': 2.9117920148560816e-05, 'epoch': 4.31}



  0%|          | 17/22040 [20:46<28:04, 13.07it/s]  

{'loss': 0.1524, 'grad_norm': 1.603104591369629, 'learning_rate': 2.7957288765088206e-05, 'epoch': 4.54}



  0%|          | 17/22040 [21:23<28:04, 13.07it/s]   

{'loss': 0.1486, 'grad_norm': 2.064905881881714, 'learning_rate': 2.6796657381615596e-05, 'epoch': 4.76}



  0%|          | 17/22040 [22:00<28:04, 13.07it/s]   

{'loss': 0.1525, 'grad_norm': 1.762682318687439, 'learning_rate': 2.5636025998142993e-05, 'epoch': 4.99}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [22:06<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.2502104341983795, 'eval_f1': 0.8405396056727776, 'eval_roc_auc': 0.8897603483821204, 'eval_accuracy': 0.19925131870001703, 'eval_runtime': 4.5219, 'eval_samples_per_second': 1299.669, 'eval_steps_per_second': 54.181, 'epoch': 5.0}



  0%|          | 17/22040 [22:44<28:04, 13.07it/s]   

{'loss': 0.1163, 'grad_norm': 1.4346753358840942, 'learning_rate': 2.4475394614670383e-05, 'epoch': 5.22}



  0%|          | 17/22040 [23:21<28:04, 13.07it/s]   

{'loss': 0.114, 'grad_norm': 1.737872838973999, 'learning_rate': 2.3314763231197773e-05, 'epoch': 5.44}



  0%|          | 17/22040 [23:58<28:04, 13.07it/s]   

{'loss': 0.1156, 'grad_norm': 1.7948397397994995, 'learning_rate': 2.2154131847725163e-05, 'epoch': 5.67}



  0%|          | 17/22040 [24:35<28:04, 13.07it/s]   

{'loss': 0.1145, 'grad_norm': 1.6340928077697754, 'learning_rate': 2.0993500464252556e-05, 'epoch': 5.9}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [24:56<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.2612988352775574, 'eval_f1': 0.8399665483061652, 'eval_roc_auc': 0.8906921463061697, 'eval_accuracy': 0.2028245703590267, 'eval_runtime': 4.5313, 'eval_samples_per_second': 1296.973, 'eval_steps_per_second': 54.068, 'epoch': 6.0}



  0%|          | 17/22040 [25:19<28:04, 13.07it/s]   

{'loss': 0.1008, 'grad_norm': 1.9445706605911255, 'learning_rate': 1.9832869080779946e-05, 'epoch': 6.13}



  0%|          | 17/22040 [25:56<28:04, 13.07it/s]   

{'loss': 0.0867, 'grad_norm': 1.8219438791275024, 'learning_rate': 1.8672237697307336e-05, 'epoch': 6.35}



  0%|          | 17/22040 [26:33<28:04, 13.07it/s]   

{'loss': 0.0872, 'grad_norm': 1.6519575119018555, 'learning_rate': 1.7511606313834726e-05, 'epoch': 6.58}



  0%|          | 17/22040 [27:10<28:04, 13.07it/s]   

{'loss': 0.0848, 'grad_norm': 2.5256311893463135, 'learning_rate': 1.6350974930362116e-05, 'epoch': 6.81}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [27:46<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.26733845472335815, 'eval_f1': 0.848101802757158, 'eval_roc_auc': 0.8923125768656276, 'eval_accuracy': 0.22154160285860133, 'eval_runtime': 4.4769, 'eval_samples_per_second': 1312.737, 'eval_steps_per_second': 54.725, 'epoch': 7.0}



  0%|          | 17/22040 [27:54<28:04, 13.07it/s]   

{'loss': 0.0812, 'grad_norm': 1.622444748878479, 'learning_rate': 1.5192664809656454e-05, 'epoch': 7.03}



  0%|          | 17/22040 [28:31<28:04, 13.07it/s]   

{'loss': 0.0644, 'grad_norm': 1.2063298225402832, 'learning_rate': 1.4032033426183844e-05, 'epoch': 7.26}



  0%|          | 17/22040 [29:08<28:04, 13.07it/s]   

{'loss': 0.0649, 'grad_norm': 2.188608407974243, 'learning_rate': 1.2876044568245126e-05, 'epoch': 7.49}



  0%|          | 17/22040 [29:45<28:04, 13.07it/s]   

{'loss': 0.0632, 'grad_norm': 1.4193909168243408, 'learning_rate': 1.1715413184772516e-05, 'epoch': 7.71}



  0%|          | 17/22040 [30:22<28:04, 13.07it/s]   

{'loss': 0.0632, 'grad_norm': 2.0214288234710693, 'learning_rate': 1.0554781801299908e-05, 'epoch': 7.94}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [30:36<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.2804032564163208, 'eval_f1': 0.8489762650890235, 'eval_roc_auc': 0.8939247695055154, 'eval_accuracy': 0.2298791900629573, 'eval_runtime': 4.4993, 'eval_samples_per_second': 1306.212, 'eval_steps_per_second': 54.453, 'epoch': 8.0}



  0%|          | 17/22040 [31:06<28:04, 13.07it/s]   

{'loss': 0.0536, 'grad_norm': 1.3284273147583008, 'learning_rate': 9.394150417827298e-06, 'epoch': 8.17}



  0%|          | 17/22040 [31:43<28:04, 13.07it/s]   

{'loss': 0.049, 'grad_norm': 1.6149990558624268, 'learning_rate': 8.23351903435469e-06, 'epoch': 8.39}



  0%|          | 17/22040 [32:20<28:04, 13.07it/s]   

{'loss': 0.0482, 'grad_norm': 1.2953239679336548, 'learning_rate': 7.0752089136490255e-06, 'epoch': 8.62}



  0%|          | 17/22040 [32:57<28:04, 13.07it/s]   

{'loss': 0.0472, 'grad_norm': 1.3703434467315674, 'learning_rate': 5.914577530176416e-06, 'epoch': 8.85}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [33:27<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.28599122166633606, 'eval_f1': 0.8500497080527045, 'eval_roc_auc': 0.8953067030531436, 'eval_accuracy': 0.23038965458567295, 'eval_runtime': 4.5147, 'eval_samples_per_second': 1301.752, 'eval_steps_per_second': 54.267, 'epoch': 9.0}



  0%|          | 17/22040 [33:41<28:04, 13.07it/s]   

{'loss': 0.0435, 'grad_norm': 1.9104195833206177, 'learning_rate': 4.753946146703807e-06, 'epoch': 9.07}



  0%|          | 17/22040 [34:18<28:04, 13.07it/s]   

{'loss': 0.039, 'grad_norm': 1.6241123676300049, 'learning_rate': 3.593314763231198e-06, 'epoch': 9.3}



  0%|          | 17/22040 [34:55<28:04, 13.07it/s]   

{'loss': 0.0374, 'grad_norm': 1.7759971618652344, 'learning_rate': 2.4326833797585887e-06, 'epoch': 9.53}



  0%|          | 17/22040 [35:32<28:04, 13.07it/s]   

{'loss': 0.0369, 'grad_norm': 2.0826504230499268, 'learning_rate': 1.2720519962859795e-06, 'epoch': 9.75}



  0%|          | 17/22040 [36:09<28:04, 13.07it/s]   

{'loss': 0.0362, 'grad_norm': 0.8331364393234253, 'learning_rate': 1.1142061281337048e-07, 'epoch': 9.98}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

  0%|          | 17/22040 [36:19<28:04, 13.07it/s]
[A
[A

{'eval_loss': 0.2876950204372406, 'eval_f1': 0.8528719330591406, 'eval_roc_auc': 0.895477467419483, 'eval_accuracy': 0.24025863535817593, 'eval_runtime': 4.5196, 'eval_samples_per_second': 1300.323, 'eval_steps_per_second': 54.208, 'epoch': 10.0}



100%|██████████| 22040/22040 [29:11<00:00, 12.58it/s]


{'train_runtime': 1751.485, 'train_samples_per_second': 301.978, 'train_steps_per_second': 12.584, 'train_loss': 0.17430943555710754, 'epoch': 10.0}
Subset Accuracy: 0.1083
Hamming Loss: 0.3323
Micro Precision: 0.5481
Micro Recall: 0.5927
Micro F1-Score: 0.5695
Precision per label: [0.5        0.6        0.51851852 0.58490566 0.73076923 0.53846154
 0.38636364 0.5862069 ]
Recall per label: [0.57142857 0.69230769 0.56       0.484375   0.45238095 0.76086957
 0.80952381 0.57627119]
F1-Score per label: [0.53333333 0.64285714 0.53846154 0.52991453 0.55882353 0.63063063
 0.52307692 0.58119658]
{0, 1, 2, 3, 4, 5, 6, 7, 8}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [12]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_18/marbert_finetuned_epochs_10_eval_f1_0.8353_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-ca-experiment-18_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 46.80 %
MACRO AVERAGE RECALL SCORE: 64.37 %
MACRO AVERAGE F1-SCORE: 52.38 %
MACRO AVERAGE ACCURACY: 58.44 %



In [13]:
trainer.predict(["الله اكبر"])

(array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 array([[0.99621993, 0.99293363, 0.9789554 , 0.9881765 , 0.99378294,
         0.99346113, 0.98922324, 0.99135584, 0.9954261 , 0.9932811 ,
         0.994292  , 0.9939731 , 0.994089  , 0.9908035 , 0.99158704,
         0.99317604, 0.9935369 , 0.9952632 ]], dtype=float32),
 0.007803493075900558)

In [17]:
trainer.predict(["إيه يا عم الجو حر موت النهارده، لازم نشرب حاجة ساقعة"])

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]),
 array([[0.00602695, 0.01210039, 0.02551661, 0.13398075, 0.02532306,
         0.01596765, 0.12995382, 0.0085113 , 0.00219124, 0.01200735,
         0.04689926, 0.00372171, 0.01778039, 0.00564191, 0.43236297,
         0.0102883 , 0.01764446, 0.0124797 ]], dtype=float32),
 0.9489779008759393)

## Experiment 19

In [22]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="UBC-NLP/MARBERT",
    labels=labels,
    threshold=0.3,
    exp_num=18
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                  
  0%|          | 17/22040 [3:31:48<28:04, 13.07it/s]

{'loss': 0.5377, 'grad_norm': 2.3052988052368164, 'learning_rate': 5e-05, 'epoch': 0.23}


                                                    
  0%|          | 17/22040 [3:32:33<28:04, 13.07it/s]

{'loss': 0.4135, 'grad_norm': 1.8364241123199463, 'learning_rate': 4.883936861652739e-05, 'epoch': 0.45}


                                                    
  0%|          | 17/22040 [3:33:18<28:04, 13.07it/s]

{'loss': 0.3668, 'grad_norm': 2.281428098678589, 'learning_rate': 4.768105849582173e-05, 'epoch': 0.68}


                                                    
  0%|          | 17/22040 [3:34:04<28:04, 13.07it/s]

{'loss': 0.33, 'grad_norm': 2.3681914806365967, 'learning_rate': 4.652042711234912e-05, 'epoch': 0.91}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                 
  0%|          | 17/22040 [3:34:27<28:04, 13.07it/s]
[A

{'eval_loss': 0.30051612854003906, 'eval_f1': 0.7969495534439868, 'eval_roc_auc': 0.8578380791930998, 'eval_accuracy': 0.12710566615620214, 'eval_runtime': 4.8859, 'eval_samples_per_second': 1202.838, 'eval_steps_per_second': 50.144, 'epoch': 1.0}


                                                    
  0%|          | 17/22040 [3:34:59<28:04, 13.07it/s]

{'loss': 0.2879, 'grad_norm': 3.490952968597412, 'learning_rate': 4.535979572887651e-05, 'epoch': 1.13}


                                                    
  0%|          | 17/22040 [3:35:46<28:04, 13.07it/s]

{'loss': 0.257, 'grad_norm': 3.019273042678833, 'learning_rate': 4.4199164345403905e-05, 'epoch': 1.36}


                                                    
  0%|          | 17/22040 [3:36:33<28:04, 13.07it/s]

{'loss': 0.2451, 'grad_norm': 3.494957208633423, 'learning_rate': 4.303853296193129e-05, 'epoch': 1.59}


                                                    
  0%|          | 17/22040 [3:37:20<28:04, 13.07it/s]

{'loss': 0.2384, 'grad_norm': 4.560043811798096, 'learning_rate': 4.188022284122563e-05, 'epoch': 1.81}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                 
  0%|          | 17/22040 [3:38:04<28:04, 13.07it/s]
[A

{'eval_loss': 0.24287019670009613, 'eval_f1': 0.8345204436711614, 'eval_roc_auc': 0.887667324454597, 'eval_accuracy': 0.18461800238216777, 'eval_runtime': 5.0269, 'eval_samples_per_second': 1169.116, 'eval_steps_per_second': 48.738, 'epoch': 2.0}


                                                    
  0%|          | 17/22040 [3:38:16<28:04, 13.07it/s]

{'loss': 0.2193, 'grad_norm': 2.826425075531006, 'learning_rate': 4.071959145775302e-05, 'epoch': 2.04}


                                                    
  0%|          | 17/22040 [3:39:03<28:04, 13.07it/s]

{'loss': 0.1644, 'grad_norm': 4.39860725402832, 'learning_rate': 3.955896007428041e-05, 'epoch': 2.27}


                                                    
  0%|          | 17/22040 [3:39:51<28:04, 13.07it/s]

{'loss': 0.162, 'grad_norm': 3.645512104034424, 'learning_rate': 3.83983286908078e-05, 'epoch': 2.5}


                                                    
  0%|          | 17/22040 [3:40:33<28:04, 13.07it/s]

{'loss': 0.1606, 'grad_norm': 4.56809663772583, 'learning_rate': 3.724001857010214e-05, 'epoch': 2.72}


                                                    
  0%|          | 17/22040 [3:41:15<28:04, 13.07it/s]

{'loss': 0.1592, 'grad_norm': 4.311180591583252, 'learning_rate': 3.6079387186629524e-05, 'epoch': 2.95}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                 
  0%|          | 17/22040 [3:41:29<28:04, 13.07it/s]
[A

{'eval_loss': 0.23717337846755981, 'eval_f1': 0.8507988622051145, 'eval_roc_auc': 0.8942350016502764, 'eval_accuracy': 0.2222222222222222, 'eval_runtime': 4.5348, 'eval_samples_per_second': 1295.974, 'eval_steps_per_second': 54.026, 'epoch': 3.0}


                                                    
  0%|          | 17/22040 [3:42:06<28:04, 13.07it/s]

{'loss': 0.1169, 'grad_norm': 2.881850242614746, 'learning_rate': 3.4918755803156924e-05, 'epoch': 3.18}


                                                    
  0%|          | 17/22040 [3:42:48<28:04, 13.07it/s]

{'loss': 0.1001, 'grad_norm': 3.451972246170044, 'learning_rate': 3.375812441968431e-05, 'epoch': 3.4}


                                                    
  0%|          | 17/22040 [3:43:30<28:04, 13.07it/s]

{'loss': 0.1012, 'grad_norm': 3.4587137699127197, 'learning_rate': 3.2597493036211704e-05, 'epoch': 3.63}


                                                    
  0%|          | 17/22040 [3:44:12<28:04, 13.07it/s]

{'loss': 0.1014, 'grad_norm': 3.3298377990722656, 'learning_rate': 3.143686165273909e-05, 'epoch': 3.86}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                 
  0%|          | 17/22040 [3:44:43<28:04, 13.07it/s]
[A

{'eval_loss': 0.24726076424121857, 'eval_f1': 0.8516292944719601, 'eval_roc_auc': 0.8935461907367093, 'eval_accuracy': 0.22596562872213716, 'eval_runtime': 4.5281, 'eval_samples_per_second': 1297.89, 'eval_steps_per_second': 54.106, 'epoch': 4.0}


                                                    
  0%|          | 17/22040 [3:45:02<28:04, 13.07it/s]

{'loss': 0.0855, 'grad_norm': 2.2144157886505127, 'learning_rate': 3.027623026926648e-05, 'epoch': 4.08}


                                                    
  0%|          | 17/22040 [3:45:44<28:04, 13.07it/s]

{'loss': 0.0616, 'grad_norm': 3.439931631088257, 'learning_rate': 2.911559888579387e-05, 'epoch': 4.31}


                                                    
  0%|          | 17/22040 [3:46:26<28:04, 13.07it/s]

{'loss': 0.0616, 'grad_norm': 2.223876714706421, 'learning_rate': 2.7954967502321267e-05, 'epoch': 4.54}


                                                    
  0%|          | 17/22040 [3:47:08<28:04, 13.07it/s] 

{'loss': 0.0608, 'grad_norm': 4.037909507751465, 'learning_rate': 2.6796657381615596e-05, 'epoch': 4.76}


                                                    
  0%|          | 17/22040 [3:47:50<28:04, 13.07it/s] 

{'loss': 0.0631, 'grad_norm': 3.489192008972168, 'learning_rate': 2.5636025998142993e-05, 'epoch': 4.99}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [3:47:57<28:04, 13.07it/s]
[A

{'eval_loss': 0.273395299911499, 'eval_f1': 0.850458689212895, 'eval_roc_auc': 0.8915369591754604, 'eval_accuracy': 0.23175089331291476, 'eval_runtime': 4.5245, 'eval_samples_per_second': 1298.938, 'eval_steps_per_second': 54.15, 'epoch': 5.0}


                                                    
  0%|          | 17/22040 [3:48:41<28:04, 13.07it/s] 

{'loss': 0.0374, 'grad_norm': 2.9217259883880615, 'learning_rate': 2.4475394614670383e-05, 'epoch': 5.22}


                                                    
  0%|          | 17/22040 [3:49:23<28:04, 13.07it/s] 

{'loss': 0.0372, 'grad_norm': 1.4874317646026611, 'learning_rate': 2.3314763231197773e-05, 'epoch': 5.44}


                                                    
  0%|          | 17/22040 [3:50:05<28:04, 13.07it/s] 

{'loss': 0.0377, 'grad_norm': 2.8260183334350586, 'learning_rate': 2.2154131847725163e-05, 'epoch': 5.67}


                                                    
  0%|          | 17/22040 [3:50:47<28:04, 13.07it/s] 

{'loss': 0.0368, 'grad_norm': 2.603254556655884, 'learning_rate': 2.0993500464252556e-05, 'epoch': 5.9}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [3:51:10<28:04, 13.07it/s]
[A

{'eval_loss': 0.30497220158576965, 'eval_f1': 0.8452645367048237, 'eval_roc_auc': 0.8891073677067548, 'eval_accuracy': 0.21728773183597072, 'eval_runtime': 4.5055, 'eval_samples_per_second': 1304.398, 'eval_steps_per_second': 54.378, 'epoch': 6.0}


                                                    
  0%|          | 17/22040 [3:51:37<28:04, 13.07it/s] 

{'loss': 0.0296, 'grad_norm': 1.689935326576233, 'learning_rate': 1.9835190343546892e-05, 'epoch': 6.13}


                                                    
  0%|          | 17/22040 [3:52:19<28:04, 13.07it/s] 

{'loss': 0.0228, 'grad_norm': 1.8808188438415527, 'learning_rate': 1.8674558960074282e-05, 'epoch': 6.35}


                                                    
  0%|          | 17/22040 [3:53:01<28:04, 13.07it/s] 

{'loss': 0.0233, 'grad_norm': 2.7396364212036133, 'learning_rate': 1.7513927576601672e-05, 'epoch': 6.58}


                                                    
  0%|          | 17/22040 [3:53:44<28:04, 13.07it/s] 

{'loss': 0.0219, 'grad_norm': 5.668574333190918, 'learning_rate': 1.6353296193129066e-05, 'epoch': 6.81}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [3:54:24<28:04, 13.07it/s]
[A

{'eval_loss': 0.3278944790363312, 'eval_f1': 0.8469969184254899, 'eval_roc_auc': 0.8873300295222103, 'eval_accuracy': 0.22443423515399014, 'eval_runtime': 4.4972, 'eval_samples_per_second': 1306.805, 'eval_steps_per_second': 54.478, 'epoch': 7.0}


                                                    
  0%|          | 17/22040 [3:54:34<28:04, 13.07it/s] 

{'loss': 0.02, 'grad_norm': 1.0567728281021118, 'learning_rate': 1.5192664809656454e-05, 'epoch': 7.03}


                                                    
  0%|          | 17/22040 [3:55:18<28:04, 13.07it/s] 

{'loss': 0.0134, 'grad_norm': 2.4436826705932617, 'learning_rate': 1.4032033426183844e-05, 'epoch': 7.26}


                                                    
  0%|          | 17/22040 [3:56:04<28:04, 13.07it/s] 

{'loss': 0.0135, 'grad_norm': 1.4169901609420776, 'learning_rate': 1.2871402042711237e-05, 'epoch': 7.49}


                                                    
  0%|          | 17/22040 [3:56:48<28:04, 13.07it/s] 

{'loss': 0.0133, 'grad_norm': 1.036405086517334, 'learning_rate': 1.1710770659238625e-05, 'epoch': 7.71}


                                                    
  0%|          | 17/22040 [3:57:32<28:04, 13.07it/s] 

{'loss': 0.013, 'grad_norm': 2.0383734703063965, 'learning_rate': 1.0550139275766017e-05, 'epoch': 7.94}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [3:57:47<28:04, 13.07it/s]
[A

{'eval_loss': 0.3496192693710327, 'eval_f1': 0.8465674230380112, 'eval_roc_auc': 0.8878549000025162, 'eval_accuracy': 0.22613578356304237, 'eval_runtime': 4.5112, 'eval_samples_per_second': 1302.762, 'eval_steps_per_second': 54.309, 'epoch': 8.0}


                                                    
  0%|          | 17/22040 [3:58:22<28:04, 13.07it/s] 

{'loss': 0.0097, 'grad_norm': 1.4177751541137695, 'learning_rate': 9.394150417827298e-06, 'epoch': 8.17}


                                                    
  0%|          | 17/22040 [3:59:04<28:04, 13.07it/s] 

{'loss': 0.0079, 'grad_norm': 0.9546614289283752, 'learning_rate': 8.23351903435469e-06, 'epoch': 8.39}


                                                    
  0%|          | 17/22040 [3:59:46<28:04, 13.07it/s] 

{'loss': 0.0078, 'grad_norm': 0.8336026668548584, 'learning_rate': 7.07288765088208e-06, 'epoch': 8.62}


                                                    
  0%|          | 17/22040 [4:00:28<28:04, 13.07it/s] 

{'loss': 0.0074, 'grad_norm': 0.28554850816726685, 'learning_rate': 5.912256267409471e-06, 'epoch': 8.85}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [4:01:01<28:04, 13.07it/s]
[A

{'eval_loss': 0.36540687084198, 'eval_f1': 0.8474516695957821, 'eval_roc_auc': 0.8865577468982784, 'eval_accuracy': 0.23158073847200952, 'eval_runtime': 4.5015, 'eval_samples_per_second': 1305.557, 'eval_steps_per_second': 54.426, 'epoch': 9.0}


                                                    
  0%|          | 17/22040 [4:01:18<28:04, 13.07it/s] 

{'loss': 0.0063, 'grad_norm': 0.3903147578239441, 'learning_rate': 4.751624883936862e-06, 'epoch': 9.07}


                                                    
  0%|          | 17/22040 [4:02:00<28:04, 13.07it/s] 

{'loss': 0.0055, 'grad_norm': 1.0045398473739624, 'learning_rate': 3.590993500464253e-06, 'epoch': 9.3}


                                                    
  0%|          | 17/22040 [4:02:42<28:04, 13.07it/s] 

{'loss': 0.0049, 'grad_norm': 0.40790706872940063, 'learning_rate': 2.4303621169916438e-06, 'epoch': 9.53}


                                                    
  0%|          | 17/22040 [4:03:24<28:04, 13.07it/s] 

{'loss': 0.0048, 'grad_norm': 0.6210858821868896, 'learning_rate': 1.2697307335190344e-06, 'epoch': 9.75}


                                                    
  0%|          | 17/22040 [4:04:06<28:04, 13.07it/s] 

{'loss': 0.0047, 'grad_norm': 0.9460890889167786, 'learning_rate': 1.0909935004642526e-07, 'epoch': 9.98}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                  
  0%|          | 17/22040 [4:04:18<28:04, 13.07it/s]
[A

{'eval_loss': 0.3677639365196228, 'eval_f1': 0.8469693880523111, 'eval_roc_auc': 0.8870306658177636, 'eval_accuracy': 0.2285179513357155, 'eval_runtime': 4.508, 'eval_samples_per_second': 1303.674, 'eval_steps_per_second': 54.347, 'epoch': 10.0}


                                                    
100%|██████████| 22040/22040 [33:18<00:00, 11.03it/s]


{'train_runtime': 1998.5777, 'train_samples_per_second': 264.643, 'train_steps_per_second': 11.028, 'train_loss': 0.10601788969835547, 'epoch': 10.0}
Subset Accuracy: 0.0917
Hamming Loss: 0.3302
Micro Precision: 0.5499
Micro Recall: 0.6039
Micro F1-Score: 0.5756
Precision per label: [0.54901961 0.68181818 0.54       0.58695652 0.63333333 0.50724638
 0.34693878 0.61538462]
Recall per label: [0.8        0.76923077 0.54       0.421875   0.45238095 0.76086957
 0.80952381 0.54237288]
F1-Score per label: [0.65116279 0.72289157 0.54       0.49090909 0.52777778 0.60869565
 0.48571429 0.57657658]
{0, 1, 2, 3, 4, 5, 6, 7, 8}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [1]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_18/marbert_finetuned_epochs_10_eval_f1_0.8516_greater_threshold_0.3/UBC-NLP-MARBERT-experiment-18_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"


OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 55.76 %
MACRO AVERAGE RECALL SCORE: 63.70 %
MACRO AVERAGE F1-SCORE: 57.55 %
MACRO AVERAGE ACCURACY: 66.98 %



## Experiment 20

balancing the dataset produced by the logistic regression and finetuning marbert

In [7]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/balanced_multilabel_dataset_lr_500.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_20/marbert_finetuned_epochs_10_eval_f1_0.8621_greater_threshold_0.3",
    labels=labels,
    threshold=0.3,
    exp_num=21
)
trainer.train(
    num_train_epochs=20,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)



AttributeError: module 'triton.language' has no attribute 'core'

In [1]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_20/marbert_finetuned_epochs_20_eval_f1_0.8577_greater_threshold_0.3/-home-ali.mekky-Documents-NLP-Project-Cross-Country-Dialectal-Arabic-Identification-exp_20-marbert_finetuned_epochs_10_eval_f1_0.8621_greater_threshold_0.3-experiment-20_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"


OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 53.12 %
MACRO AVERAGE RECALL SCORE: 64.70 %
MACRO AVERAGE F1-SCORE: 55.99 %
MACRO AVERAGE ACCURACY: 64.48 %



## EXP 21

balancing the dataset produced by the logistic regression and finetuning camelbert

In [3]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/balanced_multilabel_dataset_lr_500.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-ca",
    labels=labels,
    threshold=0.3,
    exp_num=21
)
trainer.train(
    num_train_epochs=20,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                   
  5%|▌         | 319/6380 [00:30<07:29, 13.48it/s]

{'eval_loss': 0.5511866211891174, 'eval_f1': 0.7437626750931472, 'eval_roc_auc': 0.6177656031477913, 'eval_accuracy': 0.07529411764705882, 'eval_runtime': 0.7482, 'eval_samples_per_second': 1136.014, 'eval_steps_per_second': 48.114, 'epoch': 1.0}


  8%|▊         | 501/6380 [00:47<07:57, 12.31it/s]

{'loss': 0.5907, 'grad_norm': 6.361403942108154, 'learning_rate': 4.96e-05, 'epoch': 1.57}


                                                  
 10%|█         | 638/6380 [00:59<07:41, 12.43it/s]

{'eval_loss': 0.49951958656311035, 'eval_f1': 0.7718301581860032, 'eval_roc_auc': 0.6728528798889093, 'eval_accuracy': 0.08235294117647059, 'eval_runtime': 0.5506, 'eval_samples_per_second': 1543.774, 'eval_steps_per_second': 65.383, 'epoch': 2.0}


                                                  
 15%|█▌        | 957/6380 [01:28<07:26, 12.14it/s]

{'eval_loss': 0.4561315178871155, 'eval_f1': 0.8000838662333578, 'eval_roc_auc': 0.7355714535748665, 'eval_accuracy': 0.0988235294117647, 'eval_runtime': 0.5717, 'eval_samples_per_second': 1486.689, 'eval_steps_per_second': 62.966, 'epoch': 3.0}


 16%|█▌        | 1001/6380 [01:34<07:38, 11.74it/s]

{'loss': 0.4842, 'grad_norm': 1.7892683744430542, 'learning_rate': 4.5782312925170066e-05, 'epoch': 3.13}


                                                   
 20%|██        | 1276/6380 [01:57<07:04, 12.02it/s]

{'eval_loss': 0.4299485385417938, 'eval_f1': 0.8184115126881248, 'eval_roc_auc': 0.7736265977137893, 'eval_accuracy': 0.10235294117647059, 'eval_runtime': 0.5718, 'eval_samples_per_second': 1486.479, 'eval_steps_per_second': 62.957, 'epoch': 4.0}


 24%|██▎       | 1501/6380 [02:18<06:41, 12.15it/s]

{'loss': 0.3728, 'grad_norm': 1.6473968029022217, 'learning_rate': 4.153061224489796e-05, 'epoch': 4.7}


                                                   
 25%|██▌       | 1595/6380 [02:26<06:27, 12.34it/s]

{'eval_loss': 0.41549763083457947, 'eval_f1': 0.8282073813708261, 'eval_roc_auc': 0.7853263786034731, 'eval_accuracy': 0.11058823529411765, 'eval_runtime': 0.5629, 'eval_samples_per_second': 1510.111, 'eval_steps_per_second': 63.958, 'epoch': 5.0}


                                                   
 30%|███       | 1914/6380 [02:55<06:11, 12.01it/s]

{'eval_loss': 0.41730809211730957, 'eval_f1': 0.8345218709641192, 'eval_roc_auc': 0.7993547225951115, 'eval_accuracy': 0.12117647058823529, 'eval_runtime': 0.5703, 'eval_samples_per_second': 1490.544, 'eval_steps_per_second': 63.129, 'epoch': 6.0}


 31%|███▏      | 2001/6380 [03:05<06:09, 11.84it/s]

{'loss': 0.2875, 'grad_norm': 1.2916576862335205, 'learning_rate': 3.727891156462585e-05, 'epoch': 6.27}


                                                   
 35%|███▌      | 2233/6380 [03:25<05:33, 12.44it/s]

{'eval_loss': 0.41028809547424316, 'eval_f1': 0.836555163332773, 'eval_roc_auc': 0.8011264284688202, 'eval_accuracy': 0.11647058823529412, 'eval_runtime': 0.5653, 'eval_samples_per_second': 1503.722, 'eval_steps_per_second': 63.687, 'epoch': 7.0}


 39%|███▉      | 2501/6380 [03:49<05:24, 11.96it/s]

{'loss': 0.2105, 'grad_norm': 1.6996122598648071, 'learning_rate': 3.302721088435374e-05, 'epoch': 7.84}


                                                   
 40%|████      | 2552/6380 [03:54<05:15, 12.13it/s]

{'eval_loss': 0.4211941063404083, 'eval_f1': 0.8384971808184001, 'eval_roc_auc': 0.8023545896758918, 'eval_accuracy': 0.12705882352941175, 'eval_runtime': 0.5642, 'eval_samples_per_second': 1506.544, 'eval_steps_per_second': 63.807, 'epoch': 8.0}


                                                   
 45%|████▌     | 2871/6380 [04:23<04:45, 12.28it/s]

{'eval_loss': 0.43933889269828796, 'eval_f1': 0.8359919119299034, 'eval_roc_auc': 0.8011575943107271, 'eval_accuracy': 0.1188235294117647, 'eval_runtime': 0.5867, 'eval_samples_per_second': 1448.751, 'eval_steps_per_second': 61.359, 'epoch': 9.0}


 47%|████▋     | 3001/6380 [04:37<04:42, 11.97it/s]

{'loss': 0.1482, 'grad_norm': 1.660212516784668, 'learning_rate': 2.8775510204081635e-05, 'epoch': 9.4}


                                                   
 50%|█████     | 3190/6380 [04:53<04:22, 12.15it/s]

{'eval_loss': 0.4412885010242462, 'eval_f1': 0.8401271715680708, 'eval_roc_auc': 0.8090094257064121, 'eval_accuracy': 0.13058823529411764, 'eval_runtime': 0.5685, 'eval_samples_per_second': 1495.264, 'eval_steps_per_second': 63.329, 'epoch': 10.0}


 55%|█████▍    | 3501/6380 [05:21<03:58, 12.05it/s]

{'loss': 0.1047, 'grad_norm': 1.1694947481155396, 'learning_rate': 2.4523809523809523e-05, 'epoch': 10.97}


                                                   
 55%|█████▌    | 3509/6380 [05:22<03:52, 12.36it/s]

{'eval_loss': 0.46076735854148865, 'eval_f1': 0.8412990901762064, 'eval_roc_auc': 0.8142748018182688, 'eval_accuracy': 0.1388235294117647, 'eval_runtime': 0.5929, 'eval_samples_per_second': 1433.657, 'eval_steps_per_second': 60.72, 'epoch': 11.0}


                                                   
 60%|██████    | 3828/6380 [05:52<03:33, 11.97it/s]

{'eval_loss': 0.4691307246685028, 'eval_f1': 0.8411923496640055, 'eval_roc_auc': 0.8134435407038278, 'eval_accuracy': 0.1411764705882353, 'eval_runtime': 0.578, 'eval_samples_per_second': 1470.688, 'eval_steps_per_second': 62.288, 'epoch': 12.0}


 63%|██████▎   | 4001/6380 [06:08<03:18, 11.96it/s]

{'loss': 0.0726, 'grad_norm': 1.3389629125595093, 'learning_rate': 2.0272108843537416e-05, 'epoch': 12.54}


                                                   
 65%|██████▌   | 4147/6380 [06:21<03:01, 12.28it/s]

{'eval_loss': 0.48363080620765686, 'eval_f1': 0.8430549968392621, 'eval_roc_auc': 0.815733507696931, 'eval_accuracy': 0.14941176470588236, 'eval_runtime': 0.5698, 'eval_samples_per_second': 1491.861, 'eval_steps_per_second': 63.185, 'epoch': 13.0}


                                                   
 70%|███████   | 4466/6380 [06:50<02:39, 12.01it/s]

{'eval_loss': 0.4931025505065918, 'eval_f1': 0.843399930707934, 'eval_roc_auc': 0.8174246158139197, 'eval_accuracy': 0.15764705882352942, 'eval_runtime': 0.5832, 'eval_samples_per_second': 1457.583, 'eval_steps_per_second': 61.733, 'epoch': 14.0}


 71%|███████   | 4501/6380 [06:55<02:38, 11.83it/s]

{'loss': 0.0538, 'grad_norm': 1.497381567955017, 'learning_rate': 1.6020408163265308e-05, 'epoch': 14.11}


                                                   
 75%|███████▌  | 4785/6380 [07:20<02:09, 12.31it/s]

{'eval_loss': 0.5058793425559998, 'eval_f1': 0.844122756224667, 'eval_roc_auc': 0.8189954980838167, 'eval_accuracy': 0.14705882352941177, 'eval_runtime': 0.5678, 'eval_samples_per_second': 1497.078, 'eval_steps_per_second': 63.406, 'epoch': 15.0}


 78%|███████▊  | 5001/6380 [07:40<01:55, 11.97it/s]

{'loss': 0.0398, 'grad_norm': 0.752790093421936, 'learning_rate': 1.1768707482993198e-05, 'epoch': 15.67}


                                                   
 80%|████████  | 5104/6380 [07:49<01:45, 12.10it/s]

{'eval_loss': 0.5167987942695618, 'eval_f1': 0.8434346346808143, 'eval_roc_auc': 0.8171050111371445, 'eval_accuracy': 0.14705882352941177, 'eval_runtime': 0.5673, 'eval_samples_per_second': 1498.223, 'eval_steps_per_second': 63.454, 'epoch': 16.0}


                                                   
 85%|████████▌ | 5423/6380 [08:18<01:17, 12.28it/s]

{'eval_loss': 0.5189736485481262, 'eval_f1': 0.8436211602113698, 'eval_roc_auc': 0.8191901297985077, 'eval_accuracy': 0.14823529411764705, 'eval_runtime': 0.593, 'eval_samples_per_second': 1433.499, 'eval_steps_per_second': 60.713, 'epoch': 17.0}


 86%|████████▌ | 5501/6380 [08:27<01:13, 11.97it/s]

{'loss': 0.0323, 'grad_norm': 0.48990684747695923, 'learning_rate': 7.5170068027210886e-06, 'epoch': 17.24}


                                                   
 90%|█████████ | 5742/6380 [08:48<00:52, 12.27it/s]

{'eval_loss': 0.5246774554252625, 'eval_f1': 0.8456507899762717, 'eval_roc_auc': 0.8205923862880203, 'eval_accuracy': 0.14352941176470588, 'eval_runtime': 0.5655, 'eval_samples_per_second': 1503.036, 'eval_steps_per_second': 63.658, 'epoch': 18.0}


 94%|█████████▍| 6001/6380 [09:09<00:28, 13.50it/s]

{'loss': 0.0271, 'grad_norm': 0.5851431488990784, 'learning_rate': 3.26530612244898e-06, 'epoch': 18.81}


                                                   
 95%|█████████▌| 6061/6380 [09:14<00:22, 13.96it/s]

{'eval_loss': 0.5259179472923279, 'eval_f1': 0.8454016298020954, 'eval_roc_auc': 0.8218422191070139, 'eval_accuracy': 0.14823529411764705, 'eval_runtime': 0.507, 'eval_samples_per_second': 1676.544, 'eval_steps_per_second': 71.007, 'epoch': 19.0}


                                                   
100%|██████████| 6380/6380 [09:42<00:00, 13.67it/s]

{'eval_loss': 0.5265424847602844, 'eval_f1': 0.8449036452286974, 'eval_roc_auc': 0.8205344921247426, 'eval_accuracy': 0.14941176470588236, 'eval_runtime': 0.5301, 'eval_samples_per_second': 1603.392, 'eval_steps_per_second': 67.908, 'epoch': 20.0}


100%|██████████| 6380/6380 [09:45<00:00, 10.91it/s]


{'train_runtime': 585.0119, 'train_samples_per_second': 261.533, 'train_steps_per_second': 10.906, 'train_loss': 0.19146193889988627, 'epoch': 20.0}


  df_replaced = dev.replace({'y': 1, 'n': 0})


Subset Accuracy: 0.1000
Hamming Loss: 0.3635
Micro Precision: 0.5081
Micro Recall: 0.6152
Micro F1-Score: 0.5565
Precision per label: [0.42105263 0.50909091 0.49152542 0.62745098 0.65517241 0.53030303
 0.29787234 0.56716418]
Recall per label: [0.68571429 0.71794872 0.58       0.5        0.45238095 0.76086957
 0.66666667 0.6440678 ]
F1-Score per label: [0.52173913 0.59574468 0.53211009 0.55652174 0.53521127 0.625
 0.41176471 0.6031746 ]
{0, 1, 2, 3, 4, 5, 6, 7, 8}


In [5]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_21/marbert_finetuned_epochs_20_eval_f1_0.8457_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-ca-experiment-21_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 51.25 %
MACRO AVERAGE RECALL SCORE: 62.60 %
MACRO AVERAGE F1-SCORE: 54.77 %
MACRO AVERAGE ACCURACY: 63.65 %



## EXP 22

freezing lower layers and adding dropout

In [9]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="UBC-NLP/MARBERT",
    labels=labels,
    threshold=0.3,
    exp_num=22
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

  1%|          | 119/22040 [17:08<52:38:49,  8.65s/it]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 502/22040 [00:34<25:11, 14.25it/s]

{'loss': 0.4872, 'grad_norm': 2.7567391395568848, 'learning_rate': 5e-05, 'epoch': 0.23}


  5%|▍         | 1002/22040 [01:08<25:13, 13.90it/s]

{'loss': 0.4106, 'grad_norm': 2.746913194656372, 'learning_rate': 5e-05, 'epoch': 0.45}


  7%|▋         | 1502/22040 [01:43<23:29, 14.57it/s]

{'loss': 0.3744, 'grad_norm': 3.761868953704834, 'learning_rate': 5e-05, 'epoch': 0.68}


  9%|▉         | 2002/22040 [02:17<22:55, 14.56it/s]

{'loss': 0.3473, 'grad_norm': 3.082247495651245, 'learning_rate': 5e-05, 'epoch': 0.91}


 10%|█         | 2204/22040 [02:30<22:20, 14.80it/s]
 10%|█         | 2204/22040 [02:35<22:20, 14.80it/s]

{'eval_loss': 0.31659334897994995, 'eval_f1': 0.7849032858198134, 'eval_roc_auc': 0.8475967977783262, 'eval_accuracy': 0.13850604049685214, 'eval_runtime': 4.8319, 'eval_samples_per_second': 1216.299, 'eval_steps_per_second': 50.705, 'epoch': 1.0}


 11%|█▏        | 2502/22040 [02:59<22:55, 14.20it/s]  

{'loss': 0.3059, 'grad_norm': 4.013148307800293, 'learning_rate': 5e-05, 'epoch': 1.13}


 14%|█▎        | 3002/22040 [03:34<22:17, 14.24it/s]

{'loss': 0.2746, 'grad_norm': 4.039422512054443, 'learning_rate': 5e-05, 'epoch': 1.36}


 16%|█▌        | 3502/22040 [04:09<21:44, 14.21it/s]

{'loss': 0.2628, 'grad_norm': 3.6592023372650146, 'learning_rate': 5e-05, 'epoch': 1.59}


 18%|█▊        | 4002/22040 [04:44<21:14, 14.15it/s]

{'loss': 0.2578, 'grad_norm': 3.9496212005615234, 'learning_rate': 5e-05, 'epoch': 1.81}


 20%|██        | 4408/22040 [05:13<20:41, 14.21it/s]
 20%|██        | 4408/22040 [05:18<20:41, 14.21it/s]

{'eval_loss': 0.2645597755908966, 'eval_f1': 0.8199577972908584, 'eval_roc_auc': 0.8758989645539288, 'eval_accuracy': 0.158924621405479, 'eval_runtime': 4.9064, 'eval_samples_per_second': 1197.82, 'eval_steps_per_second': 49.935, 'epoch': 2.0}


 20%|██        | 4502/22040 [05:27<21:32, 13.57it/s]  

{'loss': 0.24, 'grad_norm': 3.2590744495391846, 'learning_rate': 5e-05, 'epoch': 2.04}


 23%|██▎       | 5002/22040 [06:02<20:04, 14.14it/s]

{'loss': 0.186, 'grad_norm': 3.98660945892334, 'learning_rate': 5e-05, 'epoch': 2.27}


 25%|██▍       | 5502/22040 [06:37<17:24, 15.83it/s]

{'loss': 0.183, 'grad_norm': 4.314870834350586, 'learning_rate': 5e-05, 'epoch': 2.5}


 27%|██▋       | 6002/22040 [07:08<16:56, 15.78it/s]

{'loss': 0.183, 'grad_norm': 5.021315574645996, 'learning_rate': 5e-05, 'epoch': 2.72}


 30%|██▉       | 6502/22040 [07:40<16:21, 15.82it/s]

{'loss': 0.1821, 'grad_norm': 4.458516597747803, 'learning_rate': 5e-05, 'epoch': 2.95}


 30%|███       | 6612/22040 [07:47<16:12, 15.87it/s]
 30%|███       | 6612/22040 [07:51<16:12, 15.87it/s]

{'eval_loss': 0.25569793581962585, 'eval_f1': 0.8340886203423967, 'eval_roc_auc': 0.8831668167738843, 'eval_accuracy': 0.19669899608643865, 'eval_runtime': 4.4823, 'eval_samples_per_second': 1311.148, 'eval_steps_per_second': 54.659, 'epoch': 3.0}


 32%|███▏      | 7002/22040 [08:19<15:44, 15.91it/s]  

{'loss': 0.1389, 'grad_norm': 3.714444875717163, 'learning_rate': 5e-05, 'epoch': 3.18}


 34%|███▍      | 7502/22040 [08:50<15:16, 15.87it/s]

{'loss': 0.1241, 'grad_norm': 4.097795009613037, 'learning_rate': 5e-05, 'epoch': 3.4}


 36%|███▋      | 8002/22040 [09:21<14:49, 15.79it/s]

{'loss': 0.1275, 'grad_norm': 3.6288466453552246, 'learning_rate': 5e-05, 'epoch': 3.63}


 39%|███▊      | 8502/22040 [09:53<14:18, 15.77it/s]

{'loss': 0.1291, 'grad_norm': 3.088449478149414, 'learning_rate': 5e-05, 'epoch': 3.86}


 40%|████      | 8816/22040 [10:12<13:49, 15.94it/s]
 40%|████      | 8816/22040 [10:17<13:49, 15.94it/s]

{'eval_loss': 0.27678731083869934, 'eval_f1': 0.8337880728325631, 'eval_roc_auc': 0.8836161360711593, 'eval_accuracy': 0.19550791220010208, 'eval_runtime': 4.4663, 'eval_samples_per_second': 1315.845, 'eval_steps_per_second': 54.855, 'epoch': 4.0}


 41%|████      | 9002/22040 [10:31<13:43, 15.84it/s]  

{'loss': 0.1122, 'grad_norm': 2.8917648792266846, 'learning_rate': 5e-05, 'epoch': 4.08}


 43%|████▎     | 9502/22040 [11:02<13:13, 15.81it/s]

{'loss': 0.0862, 'grad_norm': 3.912954330444336, 'learning_rate': 5e-05, 'epoch': 4.31}


 45%|████▌     | 10002/22040 [11:34<12:55, 15.52it/s]

{'loss': 0.0872, 'grad_norm': 2.9539389610290527, 'learning_rate': 5e-05, 'epoch': 4.54}


 48%|████▊     | 10502/22040 [12:05<12:12, 15.76it/s]

{'loss': 0.0885, 'grad_norm': 4.532630920410156, 'learning_rate': 5e-05, 'epoch': 4.76}


 50%|████▉     | 11002/22040 [12:36<11:37, 15.83it/s]

{'loss': 0.094, 'grad_norm': 4.640472888946533, 'learning_rate': 5e-05, 'epoch': 4.99}


 50%|█████     | 11020/22040 [12:38<11:44, 15.63it/s]
 50%|█████     | 11020/22040 [12:42<11:44, 15.63it/s]

{'eval_loss': 0.31265273690223694, 'eval_f1': 0.8298505777940823, 'eval_roc_auc': 0.8786760389194355, 'eval_accuracy': 0.18291645397311554, 'eval_runtime': 4.5224, 'eval_samples_per_second': 1299.54, 'eval_steps_per_second': 54.175, 'epoch': 5.0}


 50%|█████     | 11020/22040 [12:46<12:46, 14.37it/s]


{'train_runtime': 766.6979, 'train_samples_per_second': 689.854, 'train_steps_per_second': 28.747, 'train_loss': 0.21260312045983523, 'epoch': 5.0}
Subset Accuracy: 0.1000
Hamming Loss: 0.3156
Micro Precision: 0.5692
Micro Recall: 0.6124
Micro F1-Score: 0.5900
Precision per label: [0.54761905 0.71052632 0.54716981 0.64814815 0.73913043 0.53731343
 0.34782609 0.58333333]
Recall per label: [0.65714286 0.69230769 0.58       0.546875   0.4047619  0.7826087
 0.76190476 0.59322034]
F1-Score per label: [0.5974026  0.7012987  0.5631068  0.59322034 0.52307692 0.63716814
 0.47761194 0.58823529]
{0, 1, 2, 3, 4, 5, 6, 7, 8}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [11]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_22/marbert_finetuned_epochs_10_eval_f1_0.8341_greater_threshold_0.3/UBC-NLP-MARBERT-experiment-22_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 58.26 %
MACRO AVERAGE RECALL SCORE: 62.74 %
MACRO AVERAGE F1-SCORE: 58.51 %
MACRO AVERAGE ACCURACY: 68.44 %



## EXP 23

same as above but using camelbert

In [12]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']
trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-ca",
    labels=labels,
    threshold=0.3,
    exp_num=23
)
trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 502/22040 [00:28<20:15, 17.71it/s]

{'loss': 0.5649, 'grad_norm': 1.48406183719635, 'learning_rate': 5e-05, 'epoch': 0.23}


  5%|▍         | 1002/22040 [00:57<20:23, 17.19it/s]

{'loss': 0.4982, 'grad_norm': 1.0999702215194702, 'learning_rate': 5e-05, 'epoch': 0.45}


  7%|▋         | 1502/22040 [01:26<19:48, 17.28it/s]

{'loss': 0.4645, 'grad_norm': 1.0644991397857666, 'learning_rate': 5e-05, 'epoch': 0.68}


  9%|▉         | 2002/22040 [01:55<19:24, 17.21it/s]

{'loss': 0.4377, 'grad_norm': 1.2332335710525513, 'learning_rate': 5e-05, 'epoch': 0.91}


 10%|█         | 2204/22040 [02:07<19:18, 17.13it/s]
 10%|█         | 2204/22040 [02:12<19:18, 17.13it/s]

{'eval_loss': 0.4055422842502594, 'eval_f1': 0.7099067198689322, 'eval_roc_auc': 0.7901697753917134, 'eval_accuracy': 0.0781010719754977, 'eval_runtime': 4.9592, 'eval_samples_per_second': 1185.07, 'eval_steps_per_second': 49.403, 'epoch': 1.0}


 11%|█▏        | 2502/22040 [02:31<19:16, 16.90it/s]  

{'loss': 0.4105, 'grad_norm': 0.9981072545051575, 'learning_rate': 5e-05, 'epoch': 1.13}


 14%|█▎        | 3002/22040 [03:00<19:02, 16.66it/s]

{'loss': 0.3896, 'grad_norm': 2.099914312362671, 'learning_rate': 5e-05, 'epoch': 1.36}


 16%|█▌        | 3502/22040 [03:30<18:27, 16.73it/s]

{'loss': 0.376, 'grad_norm': 1.4147392511367798, 'learning_rate': 5e-05, 'epoch': 1.59}


 18%|█▊        | 4002/22040 [03:59<17:56, 16.75it/s]

{'loss': 0.3704, 'grad_norm': 2.7498104572296143, 'learning_rate': 5e-05, 'epoch': 1.81}


 20%|██        | 4408/22040 [04:23<17:39, 16.64it/s]
 20%|██        | 4408/22040 [04:28<17:39, 16.64it/s]

{'eval_loss': 0.3480108380317688, 'eval_f1': 0.7540106951871658, 'eval_roc_auc': 0.8264540125901423, 'eval_accuracy': 0.09783903352050366, 'eval_runtime': 4.9726, 'eval_samples_per_second': 1181.87, 'eval_steps_per_second': 49.27, 'epoch': 2.0}


 20%|██        | 4502/22040 [04:35<17:27, 16.74it/s]  

{'loss': 0.3528, 'grad_norm': 1.365309238433838, 'learning_rate': 5e-05, 'epoch': 2.04}


 23%|██▎       | 5002/22040 [05:05<16:57, 16.74it/s]

{'loss': 0.3264, 'grad_norm': 1.3290443420410156, 'learning_rate': 5e-05, 'epoch': 2.27}


 25%|██▍       | 5502/22040 [05:35<16:46, 16.44it/s]

{'loss': 0.3218, 'grad_norm': 2.5377919673919678, 'learning_rate': 5e-05, 'epoch': 2.5}


 27%|██▋       | 6002/22040 [06:05<16:00, 16.70it/s]

{'loss': 0.3147, 'grad_norm': 1.8217381238937378, 'learning_rate': 5e-05, 'epoch': 2.72}


 30%|██▉       | 6502/22040 [06:34<15:43, 16.46it/s]

{'loss': 0.3112, 'grad_norm': 1.522733211517334, 'learning_rate': 5e-05, 'epoch': 2.95}


 30%|███       | 6612/22040 [06:41<16:10, 15.89it/s]
 30%|███       | 6612/22040 [06:46<16:10, 15.89it/s]

{'eval_loss': 0.32097285985946655, 'eval_f1': 0.782281024953944, 'eval_roc_auc': 0.8434294707617457, 'eval_accuracy': 0.12948783392887528, 'eval_runtime': 5.0717, 'eval_samples_per_second': 1158.791, 'eval_steps_per_second': 48.308, 'epoch': 3.0}


 32%|███▏      | 7002/22040 [07:12<14:59, 16.73it/s]  

{'loss': 0.2837, 'grad_norm': 1.4617409706115723, 'learning_rate': 5e-05, 'epoch': 3.18}


 34%|███▍      | 7502/22040 [07:41<14:41, 16.49it/s]

{'loss': 0.273, 'grad_norm': 1.5124166011810303, 'learning_rate': 5e-05, 'epoch': 3.4}


 36%|███▋      | 8002/22040 [08:11<14:29, 16.14it/s]

{'loss': 0.2737, 'grad_norm': 1.4198979139328003, 'learning_rate': 5e-05, 'epoch': 3.63}


 39%|███▊      | 8502/22040 [08:41<13:24, 16.82it/s]

{'loss': 0.269, 'grad_norm': 1.253678560256958, 'learning_rate': 5e-05, 'epoch': 3.86}


 40%|████      | 8816/22040 [08:59<13:15, 16.62it/s]
 40%|████      | 8816/22040 [09:04<13:15, 16.62it/s]

{'eval_loss': 0.2930040955543518, 'eval_f1': 0.7997309244666538, 'eval_roc_auc': 0.8588660718502263, 'eval_accuracy': 0.1524587374510805, 'eval_runtime': 5.032, 'eval_samples_per_second': 1167.917, 'eval_steps_per_second': 48.688, 'epoch': 4.0}


 41%|████      | 9002/22040 [09:17<13:05, 16.60it/s]  

{'loss': 0.2539, 'grad_norm': 1.4674493074417114, 'learning_rate': 5e-05, 'epoch': 4.08}


 43%|████▎     | 9502/22040 [09:47<12:26, 16.79it/s]

{'loss': 0.2358, 'grad_norm': 1.907118797302246, 'learning_rate': 5e-05, 'epoch': 4.31}


 45%|████▌     | 10002/22040 [10:16<12:01, 16.69it/s]

{'loss': 0.2363, 'grad_norm': 1.558811068534851, 'learning_rate': 5e-05, 'epoch': 4.54}


 48%|████▊     | 10502/22040 [10:46<11:35, 16.59it/s]

{'loss': 0.2318, 'grad_norm': 1.9798033237457275, 'learning_rate': 5e-05, 'epoch': 4.76}


 50%|████▉     | 11002/22040 [11:16<10:59, 16.73it/s]

{'loss': 0.2345, 'grad_norm': 1.7607308626174927, 'learning_rate': 5e-05, 'epoch': 4.99}


 50%|█████     | 11020/22040 [11:17<11:06, 16.55it/s]
 50%|█████     | 11020/22040 [11:22<11:06, 16.55it/s]

{'eval_loss': 0.28577277064323425, 'eval_f1': 0.8072094130449569, 'eval_roc_auc': 0.8658987054182624, 'eval_accuracy': 0.15705291815552153, 'eval_runtime': 5.0708, 'eval_samples_per_second': 1158.99, 'eval_steps_per_second': 48.316, 'epoch': 5.0}


 52%|█████▏    | 11502/22040 [11:52<10:33, 16.62it/s]  

{'loss': 0.2022, 'grad_norm': 1.5855951309204102, 'learning_rate': 5e-05, 'epoch': 5.22}


 54%|█████▍    | 12002/22040 [12:22<09:59, 16.75it/s]

{'loss': 0.2022, 'grad_norm': 1.5022097826004028, 'learning_rate': 5e-05, 'epoch': 5.44}


 57%|█████▋    | 12502/22040 [12:51<09:31, 16.70it/s]

{'loss': 0.2038, 'grad_norm': 2.084074020385742, 'learning_rate': 5e-05, 'epoch': 5.67}


 59%|█████▉    | 13002/22040 [13:21<09:05, 16.56it/s]

{'loss': 0.2048, 'grad_norm': 1.5258960723876953, 'learning_rate': 5e-05, 'epoch': 5.9}


 60%|█████▉    | 13223/22040 [13:33<07:56, 18.50it/s]
 60%|██████    | 13224/22040 [13:37<07:56, 18.50it/s]

{'eval_loss': 0.2837856709957123, 'eval_f1': 0.8147543478561645, 'eval_roc_auc': 0.8696570737623799, 'eval_accuracy': 0.16743236345074017, 'eval_runtime': 4.5042, 'eval_samples_per_second': 1304.769, 'eval_steps_per_second': 54.393, 'epoch': 6.0}


 61%|██████▏   | 13503/22040 [13:54<07:34, 18.77it/s]  

{'loss': 0.1872, 'grad_norm': 1.551269769668579, 'learning_rate': 5e-05, 'epoch': 6.13}


 64%|██████▎   | 14002/22040 [14:20<07:12, 18.58it/s]

{'loss': 0.1734, 'grad_norm': 1.4010202884674072, 'learning_rate': 5e-05, 'epoch': 6.35}


 66%|██████▌   | 14502/22040 [14:49<07:28, 16.79it/s]

{'loss': 0.1775, 'grad_norm': 1.9627329111099243, 'learning_rate': 5e-05, 'epoch': 6.58}


 68%|██████▊   | 15002/22040 [15:19<07:03, 16.63it/s]

{'loss': 0.1772, 'grad_norm': 2.12345027923584, 'learning_rate': 5e-05, 'epoch': 6.81}


 70%|███████   | 15428/22040 [15:45<06:38, 16.58it/s]
 70%|███████   | 15428/22040 [15:50<06:38, 16.58it/s]

{'eval_loss': 0.2869652211666107, 'eval_f1': 0.8176449485332671, 'eval_roc_auc': 0.8722715311141812, 'eval_accuracy': 0.17083546026884464, 'eval_runtime': 5.0575, 'eval_samples_per_second': 1162.034, 'eval_steps_per_second': 48.443, 'epoch': 7.0}


 70%|███████   | 15502/22040 [15:56<06:39, 16.38it/s]  

{'loss': 0.1732, 'grad_norm': 1.748171091079712, 'learning_rate': 5e-05, 'epoch': 7.03}


 73%|███████▎  | 16002/22040 [16:25<05:57, 16.88it/s]

{'loss': 0.1461, 'grad_norm': 1.665237545967102, 'learning_rate': 5e-05, 'epoch': 7.26}


 75%|███████▍  | 16502/22040 [16:55<05:28, 16.87it/s]

{'loss': 0.1513, 'grad_norm': 2.2277581691741943, 'learning_rate': 5e-05, 'epoch': 7.49}


 77%|███████▋  | 17002/22040 [17:24<05:02, 16.65it/s]

{'loss': 0.1535, 'grad_norm': 1.8187083005905151, 'learning_rate': 5e-05, 'epoch': 7.71}


 79%|███████▉  | 17502/22040 [17:54<04:30, 16.79it/s]

{'loss': 0.1539, 'grad_norm': 2.229557752609253, 'learning_rate': 5e-05, 'epoch': 7.94}


 80%|████████  | 17632/22040 [18:02<04:22, 16.82it/s]
 80%|████████  | 17632/22040 [18:07<04:22, 16.82it/s]

{'eval_loss': 0.29364240169525146, 'eval_f1': 0.8242024327041532, 'eval_roc_auc': 0.8738783607227875, 'eval_accuracy': 0.1970393057682491, 'eval_runtime': 5.0136, 'eval_samples_per_second': 1172.214, 'eval_steps_per_second': 48.867, 'epoch': 8.0}


 82%|████████▏ | 18002/22040 [18:30<04:01, 16.74it/s]  

{'loss': 0.1374, 'grad_norm': 2.4780025482177734, 'learning_rate': 5e-05, 'epoch': 8.17}


 84%|████████▍ | 18502/22040 [19:00<03:30, 16.83it/s]

{'loss': 0.1276, 'grad_norm': 1.5429753065109253, 'learning_rate': 5e-05, 'epoch': 8.39}


 86%|████████▌ | 19002/22040 [19:29<03:01, 16.73it/s]

{'loss': 0.1318, 'grad_norm': 1.8818618059158325, 'learning_rate': 5e-05, 'epoch': 8.62}


 88%|████████▊ | 19502/22040 [19:59<02:30, 16.86it/s]

{'loss': 0.1332, 'grad_norm': 1.835778832435608, 'learning_rate': 5e-05, 'epoch': 8.85}


 90%|█████████ | 19836/22040 [20:18<02:09, 16.98it/s]
 90%|█████████ | 19836/22040 [20:23<02:09, 16.98it/s]

{'eval_loss': 0.30559656023979187, 'eval_f1': 0.8216548012780985, 'eval_roc_auc': 0.8733135069799041, 'eval_accuracy': 0.18478815722307299, 'eval_runtime': 4.9725, 'eval_samples_per_second': 1181.903, 'eval_steps_per_second': 49.271, 'epoch': 9.0}


 91%|█████████ | 20002/22040 [20:35<02:00, 16.87it/s]

{'loss': 0.1266, 'grad_norm': 2.3318207263946533, 'learning_rate': 5e-05, 'epoch': 9.07}


 93%|█████████▎| 20502/22040 [21:04<01:30, 16.94it/s]

{'loss': 0.1119, 'grad_norm': 2.0546348094940186, 'learning_rate': 5e-05, 'epoch': 9.3}


 95%|█████████▌| 21002/22040 [21:35<01:05, 15.92it/s]

{'loss': 0.114, 'grad_norm': 2.1959190368652344, 'learning_rate': 5e-05, 'epoch': 9.53}


 98%|█████████▊| 21502/22040 [22:07<00:33, 16.25it/s]

{'loss': 0.1166, 'grad_norm': 1.897908091545105, 'learning_rate': 5e-05, 'epoch': 9.75}


100%|█████████▉| 22002/22040 [22:36<00:02, 16.63it/s]

{'loss': 0.1172, 'grad_norm': 2.7256805896759033, 'learning_rate': 5e-05, 'epoch': 9.98}


100%|██████████| 22040/22040 [22:39<00:00, 16.71it/s]
100%|██████████| 22040/22040 [22:45<00:00, 16.71it/s]

{'eval_loss': 0.3176794946193695, 'eval_f1': 0.8254703494024133, 'eval_roc_auc': 0.8735565425375277, 'eval_accuracy': 0.19499744767738642, 'eval_runtime': 4.9762, 'eval_samples_per_second': 1181.022, 'eval_steps_per_second': 49.234, 'epoch': 10.0}


100%|██████████| 22040/22040 [22:47<00:00, 16.12it/s]


{'train_runtime': 1367.1415, 'train_samples_per_second': 386.873, 'train_steps_per_second': 16.121, 'train_loss': 0.24643782885667417, 'epoch': 10.0}
Subset Accuracy: 0.1083
Hamming Loss: 0.3344
Micro Precision: 0.5482
Micro Recall: 0.5590
Micro F1-Score: 0.5535
Precision per label: [0.54545455 0.59574468 0.57777778 0.61702128 0.64       0.51612903
 0.35714286 0.56862745]
Recall per label: [0.68571429 0.71794872 0.52       0.453125   0.38095238 0.69565217
 0.71428571 0.49152542]
F1-Score per label: [0.60759494 0.65116279 0.54736842 0.52252252 0.47761194 0.59259259
 0.47619048 0.52727273]
{0, 1, 2, 3, 4, 5, 6, 7, 8}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [13]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_23/marbert_finetuned_epochs_10_eval_f1_0.8255_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-ca-experiment-23_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 55.22 %
MACRO AVERAGE RECALL SCORE: 58.24 %
MACRO AVERAGE F1-SCORE: 55.03 %
MACRO AVERAGE ACCURACY: 66.56 %



Random baseline

In [17]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/lr_binary_classifiers/annotated_multi_label_logisitc_regression.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_5/marbert_finetuned",
    labels=labels,
    threshold=0.3,
    exp_num=5
)

trainer.save_dir = f'./exp_{trainer.exp_num}'

# trainer.train(
#     num_train_epochs=10,
#     metric_for_best_model="eval_f1",
#     greater_is_better=True,
#     per_device_train_batch_size=24,
#     per_device_eval_batch_size=24,
# )
trainer.evaluate(dev_path=dev_path)

Subset Accuracy: 0.1000
Hamming Loss: 0.6292
Micro Precision: 0.3708
Micro Recall: 1.0000
Micro F1-Score: 0.5410
Precision per label: [0.29166667 0.325      0.41666667 0.53333333 0.35       0.38333333
 0.175      0.49166667]
Recall per label: [1. 1. 1. 1. 1. 1. 1. 1.]
F1-Score per label: [0.4516129  0.49056604 0.58823529 0.69565217 0.51851852 0.55421687
 0.29787234 0.65921788]
{8}


  df_replaced = dev.replace({'y': 1, 'n': 0})


In [18]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_5/-home-ali.mekky-Documents-NLP-Project-Cross-Country-Dialectal-Arabic-Identification-exp_5-marbert_finetuned-experiment-5_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 37.08 %
MACRO AVERAGE RECALL SCORE: 100.00 %
MACRO AVERAGE F1-SCORE: 53.20 %
MACRO AVERAGE ACCURACY: 37.08 %



## exp 24

In [4]:
dataset = pd.read_csv("/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/multilabel/NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv")

In [5]:
dataset = dataset[dataset['Computed'] == 'yes']

In [6]:
dataset

Unnamed: 0,id,tweet,Algeria,Bahrain,Egypt,Iraq,Jordan,Kuwait,Lebanon,Libya,...,Oman,Palestine,Qatar,Saudi_Arabia,Sudan,Syria,Tunisia,UAE,Yemen,Computed
0,0,الفار العور يشوف فقط كيسي ومايشوف ماتويد,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,yes
1,1,ديني ربنا يستر,0,0,1,1,1,0,1,0,...,0,1,0,0,1,1,0,0,0,yes
2,2,اساسا نسبكم قذر ونجس بلاش تتفاخروا بنجاستكم وه...,0,0,1,1,1,0,1,0,...,0,1,0,1,0,1,0,0,1,yes
3,3,المشاعر تحتاج الي المشاعر تحتاج الي رفيق يخذل ...,0,0,1,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,yes
4,4,ني حاضرها لايف,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58742,58734,جماعه الخير المنخفض الي جاي ايام منخفض قوي ماط...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,yes
58743,58735,انا بايع الكل وشاري عيونك,0,0,1,0,1,0,0,0,...,0,1,0,1,1,0,0,0,0,yes
58744,58736,USER USER USER شكلها نست يوم ترامب قال للامريك...,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,yes
58745,58737,السطلات البنقو الحشيش معاكم URL ΉМĄDĄ,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,yes


In [7]:
label_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait', 'Lebanon', 
                 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar', 'Saudi_Arabia', 
                 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

In [8]:
dataset['dialect_sum'] = dataset[label_columns].sum(axis=1)

In [9]:
dataset['dialect_sum'].value_counts()

dialect_sum
1     14829
2      3220
4      2580
3      2329
7      1794
6      1709
5      1416
8      1108
9       536
15      532
18      502
12      334
13      273
14      256
10      168
11      137
17       26
16       11
Name: count, dtype: int64

In [10]:
desired_samples = 1500

In [11]:
for index, row in dataset.iterrows():
    if row['dialect_sum'] in [16, 17]:
        # Update `dialect_sum` to 18
        dataset.at[index, 'dialect_sum'] = 18
        
        # Set all dialect columns with 0 to 1
        dataset.loc[index, label_columns] = row[label_columns].apply(lambda x: 1 if x == 0 else x)

# Verify the changes
print(dataset['dialect_sum'].value_counts().sort_index())

dialect_sum
1     14829
2      3220
3      2329
4      2580
5      1416
6      1709
7      1794
8      1108
9       536
10      168
11      137
12      334
13      273
14      256
15      532
18      539
Name: count, dtype: int64


In [12]:
from sklearn.utils import resample

# Define dialect columns
dialect_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

# Filter rows with `dialect_sum` equal to 1
rows_with_sum_1 = dataset[dataset['dialect_sum'] == 1]

# Create an empty DataFrame to store balanced rows
balanced_rows = pd.DataFrame()

# Balance rows across all dialects
for dialect in dialect_columns:
    # Select rows where the specific dialect column is 1
    dialect_rows = rows_with_sum_1[rows_with_sum_1[dialect] == 1]
    
    # Resample to the size of the smallest group
    resampled_rows = resample(dialect_rows, replace=True, 
                              n_samples=150, 
                              random_state=42)
    balanced_rows = pd.concat([balanced_rows, resampled_rows])

# Combine balanced rows with the rest of the dataset
other_rows = dataset[dataset['dialect_sum'] != 1]
balanced_dataset = pd.concat([balanced_rows, other_rows]).reset_index(drop=True)

# Verify the distribution
print("Balanced rows with dialect_sum = 1:")
print(balanced_rows[dialect_columns].sum())


Balanced rows with dialect_sum = 1:
Algeria         150
Bahrain         150
Egypt           150
Iraq            150
Jordan          150
Kuwait          150
Lebanon         150
Libya           150
Morocco         150
Oman            150
Palestine       150
Qatar           150
Saudi_Arabia    150
Sudan           150
Syria           150
Tunisia         150
UAE             150
Yemen           150
dtype: int64


In [13]:
from sklearn.utils import resample
import pandas as pd

# Define the desired number of samples per group
desired_samples = 1500

# Filter rows with `dialect_sum` values between 2-16 (inclusive) and 18, excluding 1, 16, and 17
filtered_dataset = dataset[dataset['dialect_sum'].isin(list(range(2, 16)) + [18])]

# Create an empty DataFrame for the balanced dataset
balanced_dataset_2 = pd.DataFrame()

# Group by `dialect_sum` and resample
for value, group in filtered_dataset.groupby('dialect_sum'):
    if len(group) > desired_samples:
        # If group is larger than desired samples, undersample
        group = resample(group, replace=False, n_samples=desired_samples, random_state=42)
    elif len(group) < desired_samples:
        # If group is smaller than desired samples, oversample
        group = resample(group, replace=True, n_samples=desired_samples, random_state=42)
    balanced_dataset_2 = pd.concat([balanced_dataset_2, group])

# Shuffle the dataset
balanced_dataset_2 = balanced_dataset_2.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the distribution
print("Balanced dataset distribution:")
print(balanced_dataset_2['dialect_sum'].value_counts())


Balanced dataset distribution:
dialect_sum
3     1500
7     1500
14    1500
9     1500
12    1500
2     1500
11    1500
18    1500
10    1500
15    1500
4     1500
8     1500
6     1500
5     1500
13    1500
Name: count, dtype: int64


In [14]:
balanced_dataset['dialect_sum'].value_counts()
balanced_dataset = balanced_dataset[balanced_dataset['dialect_sum'] == 1]

In [15]:
new_dataset = pd.concat([balanced_dataset, balanced_dataset_2]).reset_index(drop=True)

In [16]:
new_dataset['dialect_sum'].value_counts()

dialect_sum
1     2700
3     1500
7     1500
14    1500
9     1500
12    1500
2     1500
11    1500
18    1500
10    1500
15    1500
4     1500
8     1500
6     1500
5     1500
13    1500
Name: count, dtype: int64

In [17]:
new_dataset.to_csv("/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/multilabel/NADIcombined_oversampled.csv")

In [20]:
file_name = ["First_200.csv", "NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv", "First_1000.csv", "balanced_multilabel_dataset.csv", "balanced_multilabel_dataset_500.csv"]
# dataset_path = f"/home/lara.hassan/Downloads/NADI2024_subtask1/subtask1/our_data/{file_name[4]}"
dataset_path = '/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/multilabel/NADIcombined_oversampled.csv'
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"
labels = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

trainer = BertTrainer(
    training_dataset_path=dataset_path,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    labels=labels,
    threshold=0.3,
    exp_num=26
)

trainer.save_dir = f'./exp_{trainer.exp_num}'

trainer.train(
    num_train_epochs=10,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)
trainer.evaluate(dev_path=dev_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.516,0.361579,0.830442,0.835948,0.182143
2,0.3233,0.299061,0.855057,0.860643,0.249603
3,0.2309,0.272866,0.875414,0.882615,0.357937
4,0.1742,0.270055,0.883791,0.891444,0.438095
5,0.1392,0.282849,0.890102,0.897536,0.475397
6,0.1152,0.269263,0.898637,0.906114,0.51627
7,0.0946,0.287619,0.89703,0.904455,0.540079


In [30]:
import os

scorer_script = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/NADI2024-ST1-Scorer.py"
gold_file = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/sample_submission/NADI2024_subtask1_dev2_gold.txt"
predictions_file = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/exp_24/marbert_finetuned_epochs_3_eval_f1_0.9034_greater_threshold_0.3/CAMeL-Lab-bert-base-arabic-camelbert-mix-experiment-24_predictions.txt"
!python3 "{scorer_script}" "{gold_file}" "{predictions_file}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 70.54 %
MACRO AVERAGE RECALL SCORE: 44.46 %
MACRO AVERAGE F1-SCORE: 52.90 %
MACRO AVERAGE ACCURACY: 72.50 %

