# Curriculum Learning for Arabic Dialect Classification
## Reorganized and Modular Implementation

## 1. Imports

In [None]:
import torch
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
    TrainerCallback,
)
from sklearn.metrics import accuracy_score, hamming_loss, precision_recall_fscore_support
from preprocess import final_eliminations
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy

# Import custom modules
from bert_trainer import BertTrainer
from config import Config, ExperimentConfig
from data_utils import load_and_prepare_dataset, prepare_all_curriculum_stages
from main_training import train_single_stage, train_curriculum_sequence, train_standalone_experiment
from prepare_data import prepare_curriculum_data, analyze_dataset_distribution

: 

## 2. Data Exploration and Preparation

In [None]:
# Read the dataset CSV into a DataFrame.
# NOTE: the path below is an absolute, machine-specific location.
dataset_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/multilabel/NADIcombined_cleaned_MULTI_LABEL_MODIFIED_FINAL.csv"
dataset = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Keep only the rows marked as computed, then count how many label columns
# are set on each row.
label_columns = ['Algeria', 'Bahrain', 'Egypt', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Oman', 'Palestine', 'Qatar',
       'Saudi_Arabia', 'Sudan', 'Syria', 'Tunisia', 'UAE', 'Yemen']

# Take an explicit copy of the filtered rows: assigning a new column to the
# result of a boolean-mask selection otherwise triggers pandas'
# SettingWithCopyWarning (the write may target a temporary view).
dataset = dataset[dataset['Computed'] == 'yes'].copy()

# Row-wise sum over the label columns.
# presumably the labels are 0/1 indicators, so the sum is the number of
# dialects attached to the sample - TODO confirm against the CSV schema
dataset['dialect_sum'] = dataset[label_columns].sum(axis=1)

# Rows whose label columns sum to exactly 1
rows_with_sum_1 = dataset[dataset['dialect_sum'] == 1]

print(f"Total samples: {len(dataset)}")
print(f"Samples with single dialect: {len(rows_with_sum_1)}")

## 3. Option A: Prepare All Curriculum Stages (Run Once)

In [None]:
# Generate the curriculum-learning stage files from the dataset at
# `dataset_path` and write them under `output_dir`.
# NOTE(review): the original note says this produces stage_2.csv through
# stage_18.csv, while other cells refer to stage_1.csv and stages 0-15 -
# confirm the actual numbering in prepare_curriculum_data.
output_dir = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/CL_stages"

# The value returned by the helper is kept in `stage_paths` for later use.
stage_paths = prepare_curriculum_data(dataset_path=dataset_path, output_dir=output_dir, computed_filter=True)

## 4. Option B: Train Standalone Experiment (Single Stage)

In [None]:
# Standalone run (experiment 27): a single epoch on the file that
# Config.get_stage_path(1) resolves to, starting from the CAMeLBERT-mix checkpoint.
train_standalone_experiment(
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    dataset_path=Config.get_stage_path(1),
    exp_num=27,
    threshold=0.3,
    batch_size=24,
    epochs=1,
)

## 5. Option C: Train Using Custom Configuration (Single Stage)

In [None]:
# Describe the run with an ExperimentConfig (experiment 27, stage 0, one epoch)
# and pass it straight to the single-stage training entry point.
exp_config = ExperimentConfig(
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    exp_num=27, stage=0,
    epochs=1, batch_size=24,
    threshold=0.3,
)
train_single_stage(exp_config)

## 6. Option D: Full Curriculum Learning Sequence

In [None]:
# First step of the curriculum for experiment 28: stage 0, two epochs,
# initialised from the named pretrained checkpoint.
exp_config_stage0 = ExperimentConfig(
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    exp_num=28, stage=0,
    epochs=2, batch_size=24,
    threshold=0.3,
)
train_single_stage(exp_config_stage0)

In [None]:
# Continue experiment 28 over the remaining curriculum stages. Stage 0 is
# trained by the preceding cell, so the sequence starts at stage 1.
# presumably each stage resumes from the previous stage's model and
# end_stage=15 is inclusive (the original note says "stages 1 through 15")
# - confirm in train_curriculum_sequence
train_curriculum_sequence(exp_num=28, start_stage=1, end_stage=15, epochs=2, batch_size=24)

## 7. Manual Curriculum Loop (Alternative to Option D)

In [None]:
# Hand-rolled version of the curriculum sequence: build one ExperimentConfig
# per stage (1 to 15 inclusive) and train each in turn.
# No model_name is passed here; use_previous_stage_model=True asks the config
# to pick up the model from the previous stage instead.
for stage_num in range(1, 16):
    exp_config = ExperimentConfig(
        exp_num=28, stage=stage_num,
        epochs=2, batch_size=24,
        threshold=0.3,
        use_previous_stage_model=True,
    )
    train_single_stage(exp_config)

## 8. Direct Trainer Usage (Low-Level Access)

In [None]:
# Lowest-level path: drive BertTrainer directly instead of the helper functions.
# NOTE: this rebinds `dataset_path`, which the data-loading and
# stage-preparation cells also use - re-run the loading cell before going
# back to those.
dataset_path = "/home/ali.mekky/Documents/NLP/Project/Cross-Country-Dialectal-Arabic-Identification/CL_stages/stage_1.csv"
dev_path = "/home/ali.mekky/Documents/NLP/Project/NADI2024/subtask1/dev/NADI2024_subtask1_dev2.tsv"

trainer = BertTrainer(
    exp_num=27,
    model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
    training_dataset_path=dataset_path,
    labels=Config.DIALECT_LABELS,
    threshold=0.3,
)

# Point the trainer's save directory at a per-experiment folder.
trainer.save_dir = f'./exp_{trainer.exp_num}'

# One epoch, batch size 24 for both training and evaluation; the best model
# is selected by the highest eval_f1.
trainer.train(
    num_train_epochs=1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)

# Evaluate on the development set file.
trainer.evaluate(dev_path=dev_path)