In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import KFold
import torch
import os

# Download NLTK data: fetch the "punkt" resource through nltk's downloader.
# NOTE(review): presumably required by the imported `sent_tokenize` -- confirm it
# is still needed, since none of the cells visible here call `sent_tokenize`.
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
2024-08-15 01:43:55.071233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-15 01:43:55.077890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-15 01:43:55.086752: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-15 01:43:55.089453: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-15 01:43:55.0

True

In [3]:
# Load model and tokenizer
def load_model_and_tokenizer(model_name, device_map='auto'):
    """Instantiate the seq2seq model and its tokenizer from one checkpoint name.

    Args:
        model_name: identifier or path handed to both ``from_pretrained`` calls.
        device_map: accepted for interface compatibility; it is not forwarded
            to either ``from_pretrained`` call.

    Returns:
        ``(model, tokenizer)``. The model has ``config.use_cache`` set to
        ``False`` and the tokenizer has ``padding_side`` set to ``'right'``.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    setattr(seq2seq_model.config, "use_cache", False)

    seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    setattr(seq2seq_tokenizer, "padding_side", 'right')

    return seq2seq_model, seq2seq_tokenizer

# Load data
def load_data(train_path, test_path):
    """Read the training and held-out CSV files into pandas DataFrames.

    Args:
        train_path: location of the training CSV.
        test_path: location of the held-out CSV.

    Returns:
        ``(train, test)`` -- one DataFrame per file, in that order.
    """
    return pd.read_csv(train_path), pd.read_csv(test_path)

# Preprocess data
def preprocess_data(train, tokenizer):
    """Attach translation prompts and measure the longest tokenized source/target.

    The caller's DataFrame is left untouched: the ``prompt`` column is added to
    a new frame, which is what the returned dataset is built from.

    Args:
        train: pandas DataFrame with a ``dyu`` (source text) column and a
            ``fr`` (target text) column.
        tokenizer: callable tokenizer. It is called once with the list of
            prompts and once with ``text_target=<list of targets>``, both with
            ``truncation=True``, and must return a mapping with ``input_ids``.

    Returns:
        ``(train_ds_raw, max_source_length, max_target_length)`` where
        ``train_ds_raw`` is ``Dataset.from_pandas`` of the frame including the
        ``prompt`` column, and the two integers are the largest number of
        token ids produced for any prompt / any target respectively.

    Raises:
        ValueError: if ``train`` contains no rows, since no maximum length
            can be derived from an empty frame.
    """
    if len(train) == 0:
        raise ValueError("preprocess_data requires at least one training example")

    template = "translate from Dyula to French: {dyu}"
    # assign() builds a new frame, so the DataFrame passed in is not mutated.
    train_with_prompt = train.assign(
        prompt=train["dyu"].map(lambda dyu: template.format(dyu=dyu))
    )
    train_ds_raw = Dataset.from_pandas(train_with_prompt, split="train")

    # Lengths are only measured here, so tokenize both sides in one batched call
    # each instead of materialising an intermediate tokenized dataset.
    source_ids = tokenizer(train_with_prompt["prompt"].tolist(), truncation=True)["input_ids"]
    # Targets go through text_target so the measurement uses the same
    # tokenization path as preprocess_function.
    target_ids = tokenizer(text_target=train_with_prompt["fr"].tolist(), truncation=True)["input_ids"]

    max_source_length = max(len(ids) for ids in source_ids)
    max_target_length = max(len(ids) for ids in target_ids)

    return train_ds_raw, max_source_length, max_target_length

# Tokenize function
def preprocess_function(sample, tokenizer, max_source_length, max_target_length, padding="max_length"):
    """Tokenize a batch of prompts and French targets into model inputs.

    Args:
        sample: mapping with a ``prompt`` entry (source texts) and a ``fr``
            entry (target texts).
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_source_length: ``max_length`` used for the prompts.
        max_target_length: ``max_length`` used for the targets.
        padding: padding strategy forwarded to both tokenizer calls.

    Returns:
        The tokenizer output for the prompts with an added ``labels`` entry
        holding the target token ids. When ``padding`` is ``"max_length"``,
        every id equal to ``tokenizer.pad_token_id`` in the labels is replaced
        by ``-100``.
    """
    shared_kwargs = {"padding": padding, "truncation": True}
    model_inputs = tokenizer(sample["prompt"], max_length=max_source_length, **shared_kwargs)
    labels = tokenizer(text_target=sample["fr"], max_length=max_target_length, **shared_kwargs)

    target_ids = labels["input_ids"]
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in target_ids
        ]
        labels["input_ids"] = target_ids

    model_inputs["labels"] = target_ids
    return model_inputs

# Metric computation
def compute_metrics(eval_preds, tokenizer, metric):
    """Score generated token ids against reference token ids.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple, in which case its first element is used.
        tokenizer: object exposing ``pad_token_id`` and ``batch_decode``.
        metric: object whose ``compute(predictions=..., references=...)``
            returns a mapping with a ``"score"`` entry.

    Returns:
        Dict with ``"bleu"`` (the metric's score) and ``"gen_len"`` (mean count
        of non-pad ids per prediction), both rounded to 4 decimals.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    pad_id = tokenizer.pad_token_id

    def _decode(token_ids):
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    # -100 marks ignored positions; map it back to the pad id before decoding.
    generated = np.where(generated != -100, generated, pad_id)
    hypothesis_texts = [text.strip() for text in _decode(generated)]

    references = np.where(references != -100, references, pad_id)
    # Each prediction gets a one-element list of references.
    reference_texts = [[text.strip()] for text in _decode(references)]

    bleu = metric.compute(predictions=hypothesis_texts, references=reference_texts)["score"]
    mean_length = np.mean([np.count_nonzero(row != pad_id) for row in generated])

    return {"bleu": round(bleu, 4), "gen_len": round(mean_length, 4)}

# Create k-fold datasets
def create_kfold_datasets(dataset, n_splits=5, shuffle=True, random_state=42):
    """Split ``dataset`` into K-fold (train, validation) pairs.

    Args:
        dataset: object accepted by ``KFold.split`` that also provides
            ``select(indices)``.
        n_splits: number of folds.
        shuffle: forwarded to ``KFold``.
        random_state: forwarded to ``KFold``.

    Returns:
        List with one ``(train_fold, val_fold)`` tuple per fold, each element
        produced by ``dataset.select`` on that fold's indices.
    """
    splitter = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    return [
        (dataset.select(train_rows), dataset.select(val_rows))
        for train_rows, val_rows in splitter.split(dataset)
    ]

In [7]:

def main(
    model_name='google/mt5-base',
    train_path="/home/bello/workspace/works/final_train_df.csv",
    test_path="/home/bello/workspace/works/final_val_df.csv",
    local_save_dir="dyu_to_fr_model",
    batch_size=8,
    n_folds=5,
):
    """Fine-tune one model per cross-validation fold and keep the best by BLEU.

    For every fold a fresh copy of ``model_name`` is trained on the fold's
    training split, saved under ``<local_save_dir>/fold_<k>`` and evaluated on
    that fold's own validation split. The fold with the highest ``eval_bleu``
    is then copied to ``<local_save_dir>/best_model``.

    Each fold is scored only on rows it never trained on. Re-evaluating all
    fold checkpoints on a single fold's validation split would score most of
    them on data they were trained on, so the scores recorded right after
    training are used for the selection instead.

    Args:
        model_name: checkpoint name or path used for the tokenizer and for
            every fold's model.
        train_path: CSV with the training pairs (``dyu`` / ``fr`` columns).
        test_path: CSV passed to ``load_data`` as the second file; its frame
            is not used by this function.
        local_save_dir: directory under which fold and best-model outputs
            are written.
        batch_size: per-device batch size for training and evaluation.
        n_folds: number of cross-validation folds.
    """
    # Only the tokenizer from this call is used; every fold loads its own
    # fresh model below, so the model returned here is discarded.
    _, tokenizer = load_model_and_tokenizer(model_name)

    # Load and preprocess data (the second frame is loaded but unused here).
    train, _ = load_data(train_path, test_path)
    train_ds_raw, max_source_length, max_target_length = preprocess_data(train, tokenizer)

    # Tokenize dataset. Drop every raw column so that only tokenizer outputs
    # reach the collator, whatever extra columns the CSV may carry.
    tokenized_train_ds = train_ds_raw.map(
        lambda x: preprocess_function(x, tokenizer, max_source_length, max_target_length),
        batched=True,
        remove_columns=train_ds_raw.column_names
    )

    label_pad_token_id = -100
    metric = evaluate.load("sacrebleu")

    # Create k-fold datasets
    fold_datasets = create_kfold_datasets(tokenized_train_ds, n_splits=n_folds)

    # BLEU of each fold's model, measured on that fold's validation split.
    fold_bleu = {}

    for fold, (train_dataset, val_dataset) in enumerate(fold_datasets, 1):
        print(f".....................................Training fold {fold}/{n_folds}.....................................")

        fold_dir = f"{local_save_dir}/fold_{fold}"

        # Initialize a new model for each fold, with a collator bound to it.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        data_collator = DataCollatorForSeq2Seq(
            tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
        )

        training_args = Seq2SeqTrainingArguments(
            output_dir=fold_dir,
            learning_rate=5e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            generation_max_length=max_target_length,
            weight_decay=0.01,
            num_train_epochs=10,
            predict_with_generate=True,
            fp16=False,
            logging_strategy="steps",
            logging_steps=500,
            evaluation_strategy="steps",
            save_strategy="steps",
            save_total_limit=2,
            load_best_model_at_end=True
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric),
        )

        trainer.train()

        # Save the model for this fold
        trainer.save_model(fold_dir)

        # Evaluate on this fold's own validation split and remember the score.
        eval_results = trainer.evaluate()
        print(f"Evaluation results for fold {fold}:", eval_results)
        fold_bleu[fold] = eval_results["eval_bleu"]

        # Release this fold's model before the next one is loaded.
        del trainer, data_collator, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Select the best model based on BLEU score; ties keep the earliest fold.
    # NOTE: scores come from different validation splits, so they are only
    # approximately comparable.
    best_fold = max(fold_bleu, key=fold_bleu.get)
    best_metric = fold_bleu[best_fold]

    print(f"Best model is from fold {best_fold} with BLEU score: {best_metric}")

    # Save the best model
    best_model_path = f"{local_save_dir}/fold_{best_fold}"
    best_model = AutoModelForSeq2SeqLM.from_pretrained(best_model_path)
    best_tokenizer = AutoTokenizer.from_pretrained(best_model_path)
    best_model.save_pretrained(f"{local_save_dir}/best_model")
    best_tokenizer.save_pretrained(f"{local_save_dir}/best_model")

if __name__ == "__main__":
    main()

Map:   0%|          | 0/8065 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 8065/8065 [00:00<00:00, 95609.34 examples/s]
Map: 100%|██████████| 8065/8065 [00:00<00:00, 32492.75 examples/s]


.....................................Training fold 1/5.....................................


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
