<a href="https://colab.research.google.com/github/ma850419/Fast_UNet/blob/main/akkadian2english_24feb2026.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs read further down
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset, concatenate_datasets
import re
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sklearn.model_selection import train_test_split
import unicodedata

# Fetch the NLTK resources used by nltk.sent_tokenize further down.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Make Google Drive reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Read the four CSV inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train, test, published, lexicon = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        "/content/drive/MyDrive/Acadian/train.csv",
        "/content/drive/MyDrive/Acadian/test.csv",
        "/content/drive/MyDrive/Acadian/published_texts.csv",
        "/content/drive/MyDrive/Acadian/OA_Lexicon_eBL.csv",
    )
)

# Report the row count of each training-related source.
print(f"Train: {len(train)}, Published: {len(published)}, Lexicon: {len(lexicon)}")

# ==================== IMPROVED PREPROCESSING ====================

def advanced_normalize_transliteration(text):
    """Clean a transliteration string while keeping its diacritic letters.

    Steps, in order: Unicode NFKC normalization, removal of the scribal
    marks ``˹ ˺ [ ] ‹ › « »``, replacement of every ASCII hyphen with a
    space, collapsing of whitespace runs into a single space, and trimming.

    Args:
        text: The raw transliteration. Anything that is not a ``str``
            (``None``, ``NaN``, numbers, lists, ...) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # One isinstance check also covers None/NaN, and unlike
    # ``pd.isna(text) or ...`` it cannot raise "truth value of an array is
    # ambiguous" when a list or array slips in.
    if not isinstance(text, str):
        return ""

    # Normalize Unicode so equivalent code-point sequences compare equal.
    text = unicodedata.normalize("NFKC", text)

    # Only bracket-like scribal marks are deleted here; letters such as
    # ḫ, š, ṭ, ṣ, ā, ē, ī, ū are not matched by this pattern.
    text = re.sub(r"[˹˺\[\]‹›«»]", "", text)

    # Hyphen-joined signs become space-separated tokens.
    text = text.replace("-", " ")

    # Collapse the whitespace runs the previous steps may have produced.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

def augment_with_lexicon(lexicon_df, train_df):
    """Build extra training pairs from the lexicon table.

    Every row whose ``Akkadian`` and ``English`` cells are both non-null
    yields one pair: the Akkadian cell is passed through
    ``advanced_normalize_transliteration`` and the English cell is copied
    unchanged.

    Args:
        lexicon_df: DataFrame expected to carry ``Akkadian`` and ``English``
            columns.
        train_df: Unused; kept so existing call sites keep working.

    Returns:
        A DataFrame with the columns ``transliteration`` and ``translation``
        (in that order). It is empty, but still has both columns, when the
        lexicon lacks the expected columns or has no usable rows.
    """
    import warnings

    columns = ['transliteration', 'translation']

    # The previous ``row.get(col, '')`` guard never fired for a missing
    # column (pd.notna('') is True), so ``row[col]`` raised KeyError. Check
    # the schema once, up front, and say so instead of failing mid-loop.
    missing = [col for col in ('Akkadian', 'English') if col not in lexicon_df.columns]
    if missing:
        warnings.warn(
            f"Lexicon is missing column(s) {missing}; no lexicon pairs created"
        )
        return pd.DataFrame([], columns=columns)

    usable = lexicon_df.dropna(subset=['Akkadian', 'English'])
    augmented_pairs = [
        {
            'transliteration': advanced_normalize_transliteration(akkadian),
            'translation': english,
        }
        for akkadian, english in zip(usable['Akkadian'], usable['English'])
    ]

    # Passing ``columns`` keeps the schema stable even when no pair was built.
    return pd.DataFrame(augmented_pairs, columns=columns)

def augment_with_published(published_df):
    """Extract position-aligned sentence pairs from the published texts.

    For each row with non-null string ``akkadian`` and ``translation`` cells,
    both texts are split with ``nltk.sent_tokenize`` and the i-th Akkadian
    sentence is paired with the i-th English sentence. Sentences beyond the
    shorter of the two lists are dropped. The Akkadian side is passed through
    ``advanced_normalize_transliteration``; the English side is kept as is.

    Args:
        published_df: DataFrame expected to carry ``akkadian`` and
            ``translation`` columns.

    Returns:
        A DataFrame with the columns ``transliteration`` and ``translation``
        (in that order). It is empty, but still has both columns, when the
        expected columns are absent or no row is usable.
    """
    import warnings

    columns = ['transliteration', 'translation']

    # The previous ``row.get(col, '')`` guard let rows through when a column
    # was absent (pd.notna('') is True) and then failed on ``row[col]``.
    missing = [col for col in ('akkadian', 'translation') if col not in published_df.columns]
    if missing:
        warnings.warn(
            f"Published texts are missing column(s) {missing}; no published pairs created"
        )
        return pd.DataFrame([], columns=columns)

    usable = published_df.dropna(subset=['akkadian', 'translation'])

    pairs = []
    for akkadian_text, english_text in zip(usable['akkadian'], usable['translation']):
        # sent_tokenize needs strings; skip cells pandas parsed as numbers etc.
        if not (isinstance(akkadian_text, str) and isinstance(english_text, str)):
            continue

        akk_sentences = nltk.sent_tokenize(akkadian_text)
        eng_sentences = nltk.sent_tokenize(english_text)

        # Simple alignment by position: zip stops at the shorter list, which
        # is the same cut-off as iterating up to min(len(a), len(b)).
        for akk_sentence, eng_sentence in zip(akk_sentences, eng_sentences):
            pairs.append({
                'transliteration': advanced_normalize_transliteration(akk_sentence),
                'translation': eng_sentence,
            })

    # Passing ``columns`` keeps the schema stable even when no pair was built.
    return pd.DataFrame(pairs, columns=columns)

# Apply normalization to the training data
print("Normalizing training data...")
train['transliteration_norm'] = train['transliteration'].apply(advanced_normalize_transliteration)
# isinstance (rather than pd.notna) so a stray non-string cell becomes ""
# instead of raising AttributeError on .strip().
train['translation_norm'] = train['translation'].apply(
    lambda x: x.strip() if isinstance(x, str) else ""
)

# Create augmented data
print("Creating augmented data...")
lexicon_pairs = augment_with_lexicon(lexicon, train)
published_pairs = augment_with_published(published)

# Combine all data. The train rows come first, so drop_duplicates below
# (which keeps the first occurrence by default) prefers them over augmented rows.
all_data = pd.concat([
    train[['transliteration_norm', 'translation_norm']].rename(
        columns={'transliteration_norm': 'transliteration', 'translation_norm': 'translation'}
    ),
    lexicon_pairs,
    published_pairs
], ignore_index=True)

# Only the train translations were trimmed above; trim every source here so
# whitespace-only lexicon/published translations are caught by the
# empty-string filter below instead of becoming training targets.
all_data = all_data.assign(translation=all_data['translation'].str.strip())

# Remove empty rows, then duplicates of the same source text
all_data = all_data.dropna(subset=['transliteration', 'translation'])
all_data = all_data[all_data['transliteration'].str.len() > 0]
all_data = all_data[all_data['translation'].str.len() > 0]
all_data = all_data.drop_duplicates(subset=['transliteration'])

print(f"Total training pairs after augmentation: {len(all_data)}")

# Split into train/validation (10% held out, fixed seed for repeatability)
train_df, val_df = train_test_split(all_data, test_size=0.1, random_state=42)

# ==================== MODEL SETUP ====================

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Source language code: try "akk" first and fall back to "fr" when the
# tokenizer rejects it. Only lookup/validation errors are caught, so unrelated
# failures (and KeyboardInterrupt) are no longer swallowed by a bare except.
# NOTE(review): the rejection is presumably a KeyError from the tokenizer's
# language-code lookup; ValueError is accepted too in case a library version
# validates the code differently.
try:
    tokenizer.src_lang = "akk"  # Try Akkadian
except (KeyError, ValueError):
    tokenizer.src_lang = "fr"   # Fallback to French
    print("Akkadian not in language list, using French as source")

tokenizer.tgt_lang = "en"

# Akkadian-specific characters the tokenizer should know about.
# The variable keeps its original name and shape for any later cell that
# reads it, but the entries are registered as ORDINARY tokens below.
special_tokens = {
    "additional_special_tokens": [
        "ḫ", "š", "ṭ", "ṣ", "ḥ", "â", "ê", "î", "û", "ā", "ē", "ī", "ū",
        "[", "]", "(", ")", "{", "}", "<", ">"
    ]
}

# add_tokens() instead of add_special_tokens(): special tokens are removed
# from decoded text when skip_special_tokens=True, which would delete letters
# and brackets from generated translations, and add_special_tokens() by
# default replaces the tokenizer's existing additional special tokens.
tokenizer.add_tokens(special_tokens["additional_special_tokens"])

# Grow the embedding matrix only when the tokenizer now needs more rows than
# the model has; never shrink the pretrained matrix.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

# ==================== DATASET PREPARATION ====================

def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: A batch mapping with a ``"transliteration"`` entry (source
            texts) and a ``"translation"`` entry (target texts).

    Returns:
        The tokenizer output for the source texts with an added ``"labels"``
        entry holding the token ids of the target texts. Both sides are
        truncated to 128 tokens and left unpadded (padding is deferred to the
        data collator).
    """
    # ``text_target`` replaces the deprecated ``as_target_tokenizer()``
    # context manager: the tokenizer encodes the targets in target mode and
    # stores their input ids under "labels" in the same call.
    model_inputs = tokenizer(
        examples["transliteration"],
        text_target=examples["translation"],
        max_length=128,
        truncation=True,
        padding=False,
    )
    return model_inputs

# Convert to datasets.
# preserve_index=False: train_test_split shuffles the rows, so the frames no
# longer have a plain RangeIndex and from_pandas would otherwise store the old
# index as an extra "__index_level_0__" column that survives remove_columns.
train_dataset = Dataset.from_pandas(
    train_df[['transliteration', 'translation']], preserve_index=False
)
val_dataset = Dataset.from_pandas(
    val_df[['transliteration', 'translation']], preserve_index=False
)

# Tokenize in batches and drop the raw text columns so only model inputs remain
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['transliteration', 'translation']
)
tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['transliteration', 'translation']
)

# ==================== TRAINING ARGUMENTS ====================

import dataclasses

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; pick
# whichever name the installed version declares so the cell runs on both.
_training_arg_names = {f.name for f in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same 500-step schedule so that
    # load_best_model_at_end can match checkpoints to evaluations.
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    learning_rate=3e-5,  # Slightly higher learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Simulate larger batch
    num_train_epochs=50,
    warmup_steps=1000,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_total_limit=3,
    # Mixed precision only when a CUDA device is present; requesting fp16
    # on a CPU-only runtime makes argument validation fail.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_num_workers=2,
    optim="adamw_torch",
    lr_scheduler_type="polynomial",
    **{_eval_strategy_kwarg: "steps"},
)

# ==================== TRAINER ====================

import inspect

# Pads inputs and labels per batch (tokenization above left them unpadded).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`. Use
# whichever keyword the installed Seq2SeqTrainer declares.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    # Stop once 10 consecutive evaluations fail to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    **{_tokenizer_kwarg: tokenizer},
)

# Train
trainer.train()

# Save model and tokenizer side by side so they can be reloaded together
save_path = "/content/drive/MyDrive/deep-past-model-improved"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")