<a href="https://colab.research.google.com/github/ma850419/Fast_UNet/blob/main/Copy_of_arcade2english_v1_23Feb2026.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive; the notebook reads its data from, and saves the model under, this mount point.
drive.mount(mountpoint='/content/drive')

In [None]:
!pip install transformers datasets sentencepiece sacrebleu


In [None]:
import pandas as pd

# Load core datasets
# Every input file lives in the same Drive folder; change DATA_DIR to relocate them.
DATA_DIR = "/content/drive/MyDrive/Acadian"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
published = pd.read_csv(f"{DATA_DIR}/published_texts.csv")
publications = pd.read_csv(f"{DATA_DIR}/publications.csv")
lexicon = pd.read_csv(f"{DATA_DIR}/OA_Lexicon_eBL.csv")

# Quick look at the first rows of the two splits.
print(train.head())
print(test.head())


In [None]:
import re

def normalize_transliteration(text):
    """Normalize one transliteration string.

    Removes straight and curly apostrophes, turns hyphens into spaces and
    lower-cases the result.

    Args:
        text: The transliteration to normalize. Anything that is not a ``str``
            (``None``, or the float NaN pandas uses for an empty CSV cell) is
            treated as missing.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""
    # Strip apostrophe-like marks (both ' and ’).
    text = re.sub(r"['’]+", "", text)
    # Hyphens join syllables of a word; split them into separate tokens.
    text = text.replace("-", " ")
    return text.lower()

# Add a normalized copy of the transliteration column to both splits.
for split_df in (train, test):
    split_df["translit_norm"] = split_df["transliteration"].apply(normalize_transliteration)


In [None]:
import nltk
# Fetch the NLTK sentence-tokenizer resources ("punkt_tab" is the new requirement).
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)
def split_sentences(text):
    """Split a text into sentences with NLTK's sentence tokenizer.

    Args:
        text: The text to split. Anything that is not a ``str`` (``None``, or
            the float NaN pandas uses for an empty CSV cell) is treated as
            missing.

    Returns:
        A list of sentence strings; an empty list when ``text`` is not a string.
    """
    # A missing translation would otherwise make sent_tokenize raise.
    if not isinstance(text, str):
        return []
    return nltk.sent_tokenize(text)

# One list of sentences per training row.
train["translation_sentences"] = train["translation"].map(split_sentences)


In [None]:
# Pair each row's normalized transliteration with every sentence of its translation.
parallel_data = [
    (record["translit_norm"], sentence)
    for _, record in train.iterrows()
    for sentence in record["translation_sentences"]
]

parallel_df = pd.DataFrame(parallel_data, columns=["akkadian", "english"])


In [None]:
from transformers import M2M100Tokenizer

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Define source and target languages.
# The Akkadian transliteration is tagged with a supported stand-in code; "fr" is used
# so this tokenizer agrees with the proxy code set in the training cell of this notebook
# (tagging the source as "en" would make it identical to the target language).
tokenizer.src_lang = "fr"        # stand-in code for the Akkadian source
tokenizer.tgt_lang = "en"        # target is English


In [None]:
import pandas as pd
from datasets import Dataset

# Rename the pair columns to the names the preprocessing function reads.
column_renames = {"akkadian": "transliteration", "english": "translation"}
parallel_df = parallel_df.rename(columns=column_renames)

# Wrap the two renamed columns in a Hugging Face Dataset.
dataset = Dataset.from_pandas(parallel_df[list(column_renames.values())])

# Preprocessing function
def preprocess_function(examples):
    """Tokenize one batch of transliteration/translation pairs.

    Reads ``examples["transliteration"]`` as the source side and
    ``examples["translation"]`` as the target side, truncating both to 128
    tokens with the module-level ``tokenizer``.

    Returns:
        The encoded source batch with the target token ids stored under
        ``"labels"``.
    """
    encoded = tokenizer(
        examples["transliteration"], truncation=True, max_length=128
    )
    target_ids = tokenizer(
        text_target=examples["translation"], truncation=True, max_length=128
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Tokenize the whole dataset, one batch at a time.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
import torch
import pandas as pd
from datasets import Dataset
import transformers
# Show which transformers version is installed in this runtime.
print(transformers.__version__)

from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# Load datasets
# NOTE: both frames are re-read from CSV here, so columns added by earlier cells
# (e.g. "translit_norm") are not present on `train` / `test` after this point.
required_columns = ["transliteration", "translation"]
train = pd.read_csv("/content/drive/MyDrive/Acadian/train.csv").dropna(subset=required_columns)
test = pd.read_csv("/content/drive/MyDrive/Acadian/test.csv")

# Convert the complete training pairs to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train[required_columns])

# Choose multilingual base model
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Add Akkadian transliteration characters if missing.
# They are registered as ORDINARY tokens via add_tokens. Registering them as
# special tokens would make decode(..., skip_special_tokens=True) strip them from
# predictions (e.g. the "š" in a proper name) and would replace the tokenizer's
# existing list of additional special tokens.
akkadian_chars = ["ḫ", "š", "ṭ", "ū", "ā"]
tokenizer.add_tokens(akkadian_chars)
# Keep the embedding matrix in step with the (possibly enlarged) vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Set source and target languages
tokenizer.src_lang = "fr"   # treat Akkadian transliteration as "fr" (or another supported code)
tokenizer.tgt_lang = "en"   # target is English

# Preprocessing function
def preprocess_function(examples):
    """Encode a batch of examples for seq2seq training.

    The ``"transliteration"`` entries become the model inputs and the
    ``"translation"`` entries become the labels; both sides are truncated to
    128 tokens using the module-level ``tokenizer``.

    Returns:
        The tokenized inputs, with the target ``input_ids`` added as ``"labels"``.
    """
    tokenize_kwargs = {"max_length": 128, "truncation": True}
    batch = tokenizer(examples["transliteration"], **tokenize_kwargs)
    target_batch = tokenizer(text_target=examples["translation"], **tokenize_kwargs)
    batch["labels"] = target_batch["input_ids"]
    return batch

# Tokenize the training pairs batch by batch.
tokenized_dataset = train_dataset.map(function=preprocess_function, batched=True)

# Training arguments
# NOTE(review): no eval_dataset is handed to the trainer in this notebook, so
# do_eval / eval_steps may have no effect — confirm.
training_config = dict(
    output_dir="./results",
    do_eval=True,
    eval_steps=200,                # still valid
    logging_steps=50,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=20,
    save_total_limit=2,
    predict_with_generate=True,
    report_to="none",
)

training_args = Seq2SeqTrainingArguments(**training_config)


# Collator that pads each batch of inputs and labels for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer wired to the tokenized training set
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)


# Run fine-tuning
trainer.train()

# Write the fine-tuned model and its tokenizer to the same Drive folder
save_path = "/content/drive/MyDrive/deep-past-model"
model.save_pretrained(save_directory=save_path)
tokenizer.save_pretrained(save_directory=save_path)


In [None]:
!pip install kaggle
from google.colab import files
files.upload()   # upload kaggle.json here
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!mkdir my_dataset
!cp -r /content/drive/MyDrive/deep-past-model/* my_dataset/


In [None]:
# Create a Kaggle dataset from the files staged in my_dataset, using the Kaggle CLI.
# NOTE(review): the CLI presumably needs a dataset-metadata.json inside the folder;
# nothing in this notebook creates one — confirm it is present before running.
!kaggle datasets create -p my_dataset

In [None]:
pip install --upgrade transformers


In [None]:
import torch

def translate_texts(texts, model, tokenizer, batch_size=16, tgt_lang="en"):
    """Translate source strings with a seq2seq model, in mini-batches.

    Args:
        texts: Iterable of source strings.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``. When it offers ``get_lang_id``
            (as the M2M100 tokenizer does), generation is forced to start with
            the ``tgt_lang`` language token.
        batch_size: Number of texts tokenized and generated per step, so the
            whole input never has to fit on the device at once.
        tgt_lang: Target language code; ``None`` disables the forced first token.

    Returns:
        A list with one decoded translation per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    if not texts:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)   # ensure model is on GPU if available
    # A freshly trained model can still be in training mode; disable dropout.
    model.eval()

    generate_kwargs = {}
    if tgt_lang is not None and hasattr(tokenizer, "get_lang_id"):
        # Multilingual decoders need the target-language token as first output.
        generate_kwargs["forced_bos_token_id"] = tokenizer.get_lang_id(tgt_lang)

    translations = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize and move inputs to the same device as the model
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

        # Generate translations without tracking gradients
        with torch.no_grad():
            generated = model.generate(**inputs, **generate_kwargs)

        # Decode outputs
        translations.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

    return translations

# Apply to your test set.
# The training cell re-reads `test` from CSV (so it has no "translit_norm" column)
# and fine-tunes on the raw "transliteration" column, so the same raw column is
# what gets translated here.
test["predicted_translation"] = translate_texts(test["transliteration"].tolist(), model, tokenizer)



In [None]:
# Build the submission frame: the id plus the predictions under the "translation"
# column name. Chaining .rename returns a new frame, instead of renaming in place
# on a slice of `test` (which triggers SettingWithCopyWarning).
submission = test[["id", "predicted_translation"]].rename(
    columns={"predicted_translation": "translation"}
)
submission.to_csv("submission.csv", index=False)
