<a href="https://colab.research.google.com/github/maherysm/Translation/blob/main/Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# %pip install torch pandas transformers datasets evaluate sacrebleu peft trl boto3 sagemaker sentencepiece ipywidgets

In [25]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import evaluate
from huggingface_hub import login
import boto3
import sagemaker
from sagemaker.sklearn import SKLearn
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator
from sagemaker.huggingface import HuggingFace
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import logging
import sys


# Hardware Accelerator

In [10]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("on GPU." if device.type == "cuda" else "on CPU.")

on CPU.


# Model and Tokenizer

In [None]:
# set HUGGINGFACE_TOKEN={token}

: 

In [5]:
# Checkpoint id on the Hugging Face Hub; the weights and the tokenizer are
# both loaded from this same repository.
model_name = "Helsinki-NLP/opus-mt-en-fr"
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)


# Dataset

In [16]:
# Load the French-English corpus and bind each split to its own name for the
# cells that follow.
dataset = load_dataset("presencesw/wmt15_fr_en")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Write every split to its own local directory.
for split_dir, split in (
    ("./train", train_dataset),
    ("./validation", validation_dataset),
    ("./test", test_dataset),
):
    split.save_to_disk(split_dir)

In [None]:
# aws s3 cp ./train s3://{bucket}/train --recursive
# aws s3 cp ./validation s3://{bucket}/validation --recursive
# aws s3 cp ./test s3://{bucket}/test --recursive

In [17]:
# One S3 prefix per split, all under the same bucket.
s3_root = "s3://translate-rag"
train_s3, validation_s3, test_s3 = (
    f"{s3_root}/{split_name}" for split_name in ("train", "validation", "test")
)

In [47]:
# Route INFO-and-above records to stdout so they show up in the cell output.
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=[stdout_handler])
logger = logging.getLogger(__name__)

In [None]:
# en =[]
# fr = []

# for row in train_dataset:
#     en.append(row["translation"]["en"])
#     fr.append(row["translation"]["fr"])
    
# data = {"en": en,"fr": fr} 
# arrow_dataset = Dataset.from_dict(data)
# arrow_dataset.save_to_disk(f"s3://translate-rag/train")



# arrow_dataset[:4]


# df = pd.DataFrame(data)
# df.to_csv('wmt15_test.csv', index=False)
# df.to_json('wmt15_test.json', index=False, orient="records")




# Preprocess and Tokenize

In [None]:
def preprocess(sentences):
    """Tokenize a batch of English/French sentence pairs for seq2seq training.

    Args:
        sentences: A batch as handed over by ``Dataset.map(..., batched=True)``.
            Either flat ``"en"`` / ``"fr"`` columns, or a ``"translation"``
            column of ``{"en": ..., "fr": ...}`` dicts (the layout other cells
            of this notebook index with ``row["translation"]["en"]``).

    Returns:
        The tokenizer output for the English side, with the French token ids
        stored under ``"labels"``.

    Raises:
        KeyError: If the batch has neither of the two supported layouts.
    """
    if "en" in sentences:
        english = sentences["en"]
        french = sentences["fr"]
    else:
        pairs = sentences["translation"]
        english = [pair["en"] for pair in pairs]
        french = [pair["fr"] for pair in pairs]

    # Deliberately no padding here. Padding inside map() would write pad ids
    # into the labels, and those positions would then count towards the loss.
    # Batches are padded later by DataCollatorForSeq2Seq, which fills label
    # padding with -100 so it is ignored.
    inputs = tokenizer(english, max_length=512, truncation=True)
    targets = tokenizer(text_target=french, max_length=512, truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

In [23]:
# Cap the fine-tuning subset at one million pairs. The min() keeps this cell
# runnable when the loaded training split holds fewer rows than the cap.
MAX_TRAIN_EXAMPLES = 1_000_000
train_dataset2 = train_dataset.select(range(min(MAX_TRAIN_EXAMPLES, len(train_dataset))))

In [24]:
def tokenize_split(split):
    """Run `preprocess` over `split` in batches and drop its original columns."""
    return split.map(preprocess, batched=True, remove_columns=split.column_names)


# train_dataset2 is a row subset of train_dataset, so both expose the same columns.
tokenized_train_dataset = tokenize_split(train_dataset2)
tokenized_validation_dataset = tokenize_split(validation_dataset)
tokenized_test_dataset = tokenize_split(test_dataset)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

# BLEU metric


In [26]:
# SacreBLEU scorer; BLEU() reads it through the module-level name `metric`.
metric = evaluate.load(path="sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
def BLEU(bleu_preds):
    """Compute corpus-level SacreBLEU from a ``(predictions, labels)`` pair.

    Args:
        bleu_preds: Two-item iterable ``(predictions, labels)`` of token-id
            arrays. ``predictions`` may itself be a tuple whose first element
            holds the token ids.

    Returns:
        ``{"bleu": score}`` with the score rounded to four decimals.
    """
    preds, labels = bleu_preds

    # Some models hand back a tuple of outputs; the token ids come first.
    # Passing the tuple itself to np.where would silently yield garbage.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a real token id, so swap in the
    # pad id before decoding (it is dropped again by skip_special_tokens).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace; sacrebleu expects a list of references
    # per prediction, hence the one-element inner lists.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)["score"]
    return {"bleu": round(bleu, 4)}

# Fine-tune

In [None]:
# training arguments
training_arguments = Seq2SeqTrainingArguments(
    output_dir="./models",
    eval_strategy="epoch",  # same spelling as the later fine-tuning cell
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500
)

# collator: pads inputs and labels per batch (label padding is -100, i.e. ignored by the loss)
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8 if training_arguments.fp16 else None,
)

# Trainer
# The model is an encoder-decoder checkpoint, so it is trained with
# Seq2SeqTrainer on the tokenized splits. The previous SFTTrainer call could
# not run here: `peft_config` is never defined in this notebook, and
# `preprocess` returns token ids rather than the strings a formatting
# function must produce.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=BLEU,
)



Map:   0%|          | 0/4503 [00:00<?, ? examples/s]



# Train

In [10]:
# Run the training loop, then write the resulting model to its own directory.
final_model_dir = "./model"
trainer.train()
trainer.save_model(final_model_dir)
print("Model trained and saved!")

# Training Script

In [2]:
%%writefile train.py

import argparse
from sacrebleu import corpus_bleu
from datasets import load_dataset, Dataset
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainingArguments
import evaluate
from huggingface_hub import login
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from peft import LoraConfig
from trl import SFTTrainer
import logging
import sys
import time


# logger: INFO-and-above records are written to stdout with a timestamped format
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Emitted once at import time.
logger.info("Hello World")



# preprocess
def preprocess(sentences):
    """Render a batch of sentence pairs as chat-formatted training strings.

    Args:
        sentences: Mapping with parallel ``"en"`` and ``"fr"`` sequences.

    Returns:
        One string per pair: the English sentence as the user turn, followed
        by the French sentence as the start of the chatbot turn. Pairing stops
        at the shorter of the two sequences.
    """
    user_open = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>"
    chatbot_open = "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
    return [
        f"{user_open}{source}{chatbot_open}{target}"
        for source, target in zip(sentences["en"], sentences["fr"])
    ]

def _generation_bleu(model, tokenizer, dataset, max_samples=200, max_new_tokens=128):
    """Corpus BLEU of greedy translations generated for the first rows of `dataset`.

    Each English sentence is wrapped in the same chat markup that `preprocess`
    uses for training, the model continues the prompt, and only the newly
    generated tokens are decoded and compared with the French reference.

    Args:
        model: Causal language model exposing `generate`, `eval` and `device`.
        tokenizer: Tokenizer that belongs to `model`.
        dataset: Dataset whose rows carry "en" and "fr" strings.
        max_samples: Upper bound on the number of rows that are scored.
        max_new_tokens: Generation budget per sentence.

    Returns:
        The sacreBLEU corpus score as a float (0.0 when there is nothing to score).
    """
    sample_size = min(max_samples, len(dataset))
    if sample_size == 0:
        return 0.0

    hypotheses, references = [], []
    model.eval()
    for row in dataset.select(range(sample_size)):
        prompt = (
            f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{row['en']}"
            "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        )
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=max_new_tokens, do_sample=False)

        # generate() returns prompt + continuation; score the continuation only.
        prompt_length = encoded["input_ids"].shape[1]
        continuation = generated[0][prompt_length:]
        hypotheses.append(tokenizer.decode(continuation, skip_special_tokens=True).strip())
        references.append(row["fr"].strip())

    return corpus_bleu(hypotheses, [references]).score


def train(args):
    """Fine-tune the Aya checkpoint with LoRA adapters and report a BLEU score.

    Args:
        args: Parsed command-line namespace providing `lora_alpha`, `lora_r`,
            `learning_rate`, `batch_size`, `grad_acc_steps`, `weight_decay`,
            `scheduler_type`, `fp16` and `bf16`.

    Side effects:
        Logs in to the Hugging Face Hub, downloads the model and dataset,
        writes checkpoints to "./models" and the final model and tokenizer
        to "models".
    """
    # Parse arguments
    lora_alpha = args.lora_alpha
    lora_r = args.lora_r
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    grad_acc_steps = args.grad_acc_steps
    weight_decay = args.weight_decay
    scheduler_type = args.scheduler_type
    fp16 = args.fp16
    bf16 = args.bf16

    # model and tokenizer
    # The Hub token is read from the environment only; never write it into source.
    login(token=os.getenv("HUGGINGFACE_TOKEN"))
    model_name = "CohereForAI/aya-23-8B"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # tokenizer.padding_side = 'right'
    logger.info(f"{model_name} model and its tokenizer have been loaded")

    # dataset
    dataset = load_dataset("presencesw/wmt15_fr_en")
    train_dataset = dataset['train']
    validation_dataset = dataset['validation']
    test_dataset = dataset['test']
    logger.info(f"train_dataset has {len(train_dataset)} rows")
    logger.info(f"validation_dataset has {len(validation_dataset)} rows")
    logger.info(f"test_dataset has {len(test_dataset)} rows")

    # LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    )

    # training arguments
    training_arguments = TrainingArguments(
        output_dir="./models",
        num_train_epochs=1,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=grad_acc_steps,
        gradient_checkpointing=True,
        save_steps=50,
        logging_steps=10,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=fp16,
        bf16=bf16,
        group_by_length=True,
        lr_scheduler_type=scheduler_type,
        report_to="none",
    )

    # trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
        formatting_func=preprocess,
    )

    # Logged before training begins, so the message says "started".
    logger.info(f"Fine-tuning started on {time.ctime(time.time())} !")

    trainer.train()
    logger.info(f"Training completed on {time.ctime(time.time())} !")

    trainer.save_model("models")
    tokenizer.save_pretrained("models")
    logger.info("Model and tokenizer saved!")

    # BLEU score
    # trainer.predict() returns (predictions, label_ids, metrics), and on the
    # raw text dataset it would yield logits rather than token ids. Translate
    # a bounded sample with generate() and score the decoded text instead.
    bleu_score = _generation_bleu(trainer.model, tokenizer, validation_dataset)
    logger.info(f"BLEU score: {bleu_score}")

def str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` treats every non-empty string as True, so ``--bf16 False``
    would enable bf16. This parser accepts the usual spellings instead.

    Args:
        value: A bool, or a string such as "true", "False", "1", "no".

    Returns:
        The parsed boolean. An empty string is False, as it was with ``bool``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_parser():
    """Build the command-line parser for the training script.

    Every multi-word flag is accepted with underscores and with hyphens
    (``--batch_size`` / ``--batch-size``); both store into the underscore
    attribute name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lora_alpha", "--lora-alpha", type=int, default=32)
    parser.add_argument("--lora_r", "--lora-r", type=int, default=32)
    parser.add_argument("--learning_rate", "--learning-rate", type=float, default=1e-3, help="Learning rate")
    parser.add_argument("--batch_size", "--batch-size", type=int, default=4, help="Training batch size")
    parser.add_argument("--grad_acc_steps", "--grad-acc-steps", type=int, default=4, help="Gradient accumulation steps")
    parser.add_argument("--weight_decay", "--weight-decay", type=float, default=0.001, help="Weight decay")
    parser.add_argument("--scheduler_type", "--scheduler-type", type=str, default="constant")
    parser.add_argument("--bf16", type=str2bool, default=True, help="Enable BF16 precision")
    parser.add_argument("--fp16", type=str2bool, default=False, help="Enable FP16 precision")
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    train(args)
    



Overwriting train.py


# SageMaker Estimator

In [5]:
# Settings shared by the estimator cells below.
region = 'us-east-2'
instance_type = 'ml.p3.2xlarge'
image_scope = 'training'

# Container requested for the PyTorch estimator.
framework = 'pytorch'
framework_version = '2.5'
py_version = 'py311'

# Versions passed to the HuggingFace estimator.
transformers_version = '4.6.1'
pytorch_version = '1.8.1'

# Resolve the training image URI for the framework settings above.
image_request = {
    "framework": framework,
    "region": region,
    "version": framework_version,
    "py_version": py_version,
    "instance_type": instance_type,
    "image_scope": image_scope,
}
pytorch_image_uri = sagemaker.image_uris.retrieve(**image_request)

print(f"PyTorch Image URI: {pytorch_image_uri}")

PyTorch Image URI: 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:2.5-gpu-py311


In [23]:

logging.basicConfig(level=logging.DEBUG)

# `sess` is not created anywhere else in this notebook; build the default
# session here so the cell runs on a fresh kernel.
sess = sagemaker.Session()

# NOTE(review): train.py is written to the working directory by the
# %%writefile cell, while source_dir points at ./scripts. Confirm the script
# is copied there before launching.
# NOTE(review): this role ARN has no "service-role/" segment, unlike the one
# used by the PyTorch estimator cell. Confirm which is correct.
estimator1 = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    role="arn:aws:iam::529088268202:role/SageMaker-ML-Engineer",
    instance_count=1,
    py_version='py36',
    pytorch_version=pytorch_version,
    instance_type=instance_type,
    transformers_version=transformers_version,
    # Each key is forwarded to the script as `--<key> <value>`, so the keys
    # use the underscore spelling that train.py declares (e.g. --batch_size).
    hyperparameters={
        "batch_size": 4,
        "learning_rate": 1e-3,
        "weight_decay": 0.001,
        "lora_alpha": 32,
        "lora_r": 32,
    },
    output_path="s3://translate-rag/models/",
    sagemaker_session=sess,
)

estimator1.fit(logs=True)

In [None]:
estimator = PyTorch(
    image_uri=pytorch_image_uri,
    entry_point="train.py",
    role="arn:aws:iam::529088268202:role/service-role/SageMaker-ML-Engineer",
    framework_version=framework_version,
    py_version=py_version,
    instance_count=1,
    instance_type=instance_type,
    # Each key is forwarded to the script as `--<key> <value>`, so the keys
    # use the underscore spelling that train.py declares (e.g. --batch_size).
    # No epoch count is sent: train.py fixes num_train_epochs=1 and has no
    # flag for it, so an extra key would make its argument parser exit.
    hyperparameters={
        "batch_size": 4,
        "learning_rate": 1e-3,
        "weight_decay": 0.001,
        "lora_alpha": 32,
        "lora_r": 32,
    },
    output_path="s3://translate-rag/models/",
)

estimator.fit(logs=True)


# Fine Tuning

In [None]:
# Dynamically pad sequences in a batch to the length of the longest sequence
# in that batch. Label padding is filled with -100 so the loss ignores it.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    save_steps=50,
    save_total_limit=2,
)

# Train on the tokenized splits produced in the "Preprocess and Tokenize"
# section; `train_data` / `validation_data` are not defined in this notebook.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer.train()


# Prompt Engineering

In [None]:
# Prompt to try against the model.
legal_prompt = "Translate this legal document from English to Canadian French: 'This contract is valid for two years.'"

# Encode the prompt, generate on the model's device, and decode the first
# (and only) returned sequence.
encoded_prompt = tokenizer(legal_prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(model.device)
output = model.generate(input_ids, max_length=100)
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Translated Text: {translated_text}")


#  Model Evaluation

In [None]:
bleu = evaluate.load("bleu")

# evaluation data for BLEU
def evaluate_bleu(eval_data, max_length=50):
    """Translate every row of `eval_data` and score the output with BLEU.

    Args:
        eval_data: Iterable of rows. Each row either holds "en" / "fr" strings
            directly or nests them under a "translation" key.
        max_length: Maximum generated length passed to `model.generate`.

    Returns:
        The dict returned by the `bleu` metric, or ``{"bleu": 0.0}`` when
        `eval_data` yields no rows.
    """
    preds, refs = [], []
    for sentence in eval_data:
        pair = sentence["translation"] if "translation" in sentence else sentence
        input_text = pair["en"]
        reference = pair["fr"]

        # Truncate over-long sources to the same 512-token cap used in preprocess().
        encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
        input_ids = encoded.input_ids.to(model.device)
        output = model.generate(input_ids, max_length=max_length)
        prediction = tokenizer.decode(output[0], skip_special_tokens=True)

        preds.append(prediction)
        refs.append([reference])

    if not preds:
        # Nothing to score; skip the metric call on empty input.
        return {"bleu": 0.0}

    score = bleu.compute(predictions=preds, references=refs)
    return score

# `validation_dataset` is the validation split loaded in the Dataset section.
print("BLEU score:", evaluate_bleu(validation_dataset)['bleu'])


In [18]:
# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available? {cuda_available}")

Is CUDA available? False
