In [None]:
# Install required libraries
!pip install -q transformers datasets scikit-learn accelerate pandas openpyxl huggingface_hub

In [19]:
# Handle imports
import pandas as pd
from google.colab import drive
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback
from huggingface_hub import login

In [20]:
# Load Excel files, clean them and encode author names as integer labels
DATA_DIR = "/kaggle/input/arabic-authors"
TEXT_COL = "text_in_author_style"
AUTHOR_COL = "author"


def load_split(filename):
    """Read one Excel split from DATA_DIR and drop rows missing the text or the author."""
    return pd.read_excel(f"{DATA_DIR}/{filename}").dropna(subset=[TEXT_COL, AUTHOR_COL])


def to_text_label(frame, encoder):
    """Return a new two-column frame: `text` plus the integer `label` for each author.

    `.assign` builds a fresh frame instead of writing a column into the result
    of `dropna`, which avoids pandas' chained-assignment warning.
    """
    return (
        frame.assign(label=encoder.transform(frame[AUTHOR_COL]))
        .loc[:, [TEXT_COL, "label"]]
        .rename(columns={TEXT_COL: "text"})
    )


train_clean = load_split("train_90percent_per_author.xlsx")
val_clean = load_split("AuthorshipClassficiationVal.xlsx")
test_clean = load_split("heldout_10percent_per_author.xlsx")

# Encode author names into numeric labels.
# The encoder is fitted on train + validation authors only; an author that
# appears solely in the held-out split makes `transform` raise a ValueError.
all_authors = pd.concat([train_clean[AUTHOR_COL], val_clean[AUTHOR_COL]])
label_encoder = LabelEncoder()
label_encoder.fit(all_authors)

# Prepare final datasets
train_df = to_text_label(train_clean, label_encoder)
val_df = to_text_label(val_clean, label_encoder)
test_df = to_text_label(test_clean, label_encoder)

# Convert to DatasetDict (index reset so the old row index is not carried along)
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 31602
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4157
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3520
    })
})

In [21]:
# Tokenize data
# Maximum sequence length (in tokens) used when tokenizing for each supported checkpoint.
model_name_to_token_length_map = {
    "aubmindlab/bert-base-arabertv2": 512,
    "NAMAA-Space/AraModernBert-Base-V1.0": 512,  # author's note: default is 8192
    "answerdotai/ModernBERT-base": 512
}


# Checkpoint used for both the tokenizer here and the classifier built later.
model_name = "aubmindlab/bert-base-arabertv2"

# Load the tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize the "text" field, padding and truncating to the model's configured length.

    The length is looked up in `model_name_to_token_length_map` for the
    currently selected `model_name`.
    """
    max_tokens = model_name_to_token_length_map[model_name]
    texts = example["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_tokens)

# Apply the tokenizer to every split; batched=True passes a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/31602 [00:00<?, ? examples/s]

Map:   0%|          | 0/4157 [00:00<?, ? examples/s]

Map:   0%|          | 0/3520 [00:00<?, ? examples/s]

In [22]:
# Prepare training arguments
EVAL_AND_SAVE_EVERY = 2000  # steps between evaluations / checkpoints

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="/kaggle/working/arabic-author-model",
    logging_dir="/kaggle/working/logs",
    # Evaluate and checkpoint on the same step cadence
    eval_strategy="steps",
    eval_steps=EVAL_AND_SAVE_EVERY,
    save_strategy="steps",
    save_steps=EVAL_AND_SAVE_EVERY,
    save_total_limit=5,
    # Reload the checkpoint with the best "f1" (reported by compute_metrics) at the end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,
    weight_decay=0.01,
    # Batching and precision
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Experiment tracking
    report_to=["wandb"],
    run_name="arabic-author-arabert-v2",
)

In [23]:
# Prepare evaluation metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits. Returns a dict with plain
    accuracy and the weighted-average F1.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics

In [25]:
# Authenticate with Weights & Biases using a key stored in Kaggle secrets,
# so the API key never appears in the notebook source.
import wandb
from kaggle_secrets import UserSecretsClient

# Fetch the secret named "WANDB_API_KEY" from the Kaggle secrets client.
user_secrets = UserSecretsClient()
WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")

# Last expression of the cell, so the login result is displayed as the cell output.
wandb.login(key=WANDB_API_KEY)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmabutame[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [26]:
import wandb
from datetime import datetime

# Start a W&B run named "<project>-<timestamp>" so successive runs are distinguishable.
now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
project_name = "arabic-author-classification"
wandb.init(
    project=project_name,
    # Interpolate the variable; the previous f-string contained the literal text "project_name".
    name=f"{project_name}-{now}",
    # Log a plain dict: to_dict() converts enums to their values and masks token fields.
    config=training_args.to_dict(),
)


In [27]:
# Prepare trainer

# Class id <-> author name mappings, stored in the model config so that saved or
# pushed checkpoints report author names instead of generic LABEL_<n> ids.
# LabelEncoder assigns id i to classes_[i], so enumerate() reproduces its mapping.
id2label = {idx: str(author) for idx, author in enumerate(label_encoder.classes_)}
label2id = {author: idx for idx, author in id2label.items()}

num_classes = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

# Evaluation runs on the validation split; training stops early after 3
# consecutive evaluations without improvement of the tracked metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Train the model
# Runs fine-tuning with the Trainer built earlier; evaluation, checkpointing and
# early stopping follow `training_args` and the callbacks passed to the Trainer.

trainer.train()




Step,Training Loss,Validation Loss,Accuracy,F1
2000,0.5442,0.743358,0.798172,0.783896
4000,0.2995,0.906832,0.835218,0.831561
6000,0.3531,1.908645,0.84893,0.843064
8000,0.2864,2.156011,0.847727,0.848847
10000,0.1463,1.86257,0.871542,0.870087




TrainOutput(global_step=11853, training_loss=0.4073278056532283, metrics={'train_runtime': 6952.4786, 'train_samples_per_second': 13.636, 'train_steps_per_second': 1.705, 'total_flos': 2.494876208101171e+16, 'train_loss': 0.4073278056532283, 'epoch': 3.0})

Metrics for aubmindlab/bert-base-arabertv2 with the following training arguments:
- 2e-5 LR
- Linear LR scheduler
- checkpoint total limit (`save_total_limit`): 3
- no warmup
- eval + save strategy = epoch
- eval starts after one epoch

| Epoch | Training Loss | Validation Loss | Accuracy |    F1    |
|-------|---------------|-----------------|----------|----------|
|   1   |    0.296500   |     0.659372    | 0.849170 | 0.847203 |
|   2   |    0.113200   |     0.747239    | 0.861679 | 0.861908 |
|   3   |    0.035400   |     0.700232    | 0.883570 | 0.882200 |

macro f1_score on held-out 10%: 0.9700

---------
Metrics for aubmindlab/bert-base-arabertv2 with the following training arguments:
- 10e-3 LR
- Cosine LR scheduler
- checkpoint total limit (`save_total_limit`): 10
- warmup over 10% of total steps
- eval + save strategy = steps
- eval starts after 5000 steps

In [None]:
# Save model + tokenizer
from kaggle_secrets import UserSecretsClient

# Log in to the Hugging Face Hub with the token stored in Kaggle secrets.
user_secrets = UserSecretsClient()
HF_API_KEY = user_secrets.get_secret("HF_API_KEY")
login(token=HF_API_KEY)

# Upload the model first, then the tokenizer, to the same Hub repository.
hub_repo_id = "Tami3/arabic-author-classifier"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)

# Local alternative (disabled):
# model.save_pretrained("/kaggle/working/arabic-author-model")
# tokenizer.save_pretrained("/kaggle/working/arabic-author-model")

print("DONE")

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

DONE
