In [None]:
!pip install tf-keras

In [None]:
!pip install transformers datasets scikit-learn evaluate sacrebleu rouge_score matplotlib seaborn tqdm

In [None]:
!pip install sacrebleu rouge_score

In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold
from datasets import Dataset
from evaluate import load
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Instalación de librerías (descomenta si es necesario)
# !pip install transformers datasets scikit-learn evaluate sacrebleu rouge_score matplotlib seaborn tqdm

In [None]:
import pandas as pd
from datasets import Dataset
from tqdm import tqdm  # Para barra de progreso

def load_and_prepare_data(csv_path):
    """Load the parallel corpus from a CSV file into a Hugging Face ``Dataset``.

    Only the ``source`` and ``target`` columns are read. Every kept row becomes
    ``{"spanish": <source>, "quechua": <target>}`` with surrounding whitespace
    removed. A row is skipped when either cell is missing or is empty after
    stripping, so whitespace-only cells never produce blank sentence pairs.

    Progress, a per-column count of missing values, the number of valid pairs
    and the average character length of each side are printed along the way.

    Args:
        csv_path: Path to a UTF-8 encoded CSV file that has ``source`` and
            ``target`` columns.

    Returns:
        ``Dataset.from_list`` of the kept pairs. If the file cannot be read,
        the error is printed and ``Dataset.from_list([])`` is returned instead
        of raising.
    """
    # Read only the two columns we need; malformed lines produce a warning.
    try:
        df = pd.read_csv(
            csv_path,
            usecols=['source', 'target'],
            encoding='utf-8',
            on_bad_lines='warn'
        )
    except Exception as e:
        # Deliberately best-effort: the error is reported and an empty dataset
        # is handed back so the caller can test len(...) instead of crashing.
        print(f"Error al leer el CSV: {e}")
        return Dataset.from_list([])

    # Report how many values are missing in each column.
    print("\nResumen de datos faltantes:")
    print(df.isnull().sum())

    # Walk both columns in lockstep. This avoids building a Series per row
    # (as iterrows() does) and keeps each cell's own dtype.
    data_pairs = []
    rows = zip(df['source'], df['target'])
    for source_text, target_text in tqdm(rows, total=len(df), desc="Procesando filas"):
        if pd.isna(source_text) or pd.isna(target_text):
            continue
        spanish = str(source_text).strip()
        quechua = str(target_text).strip()
        if not spanish or not quechua:
            # A whitespace-only cell is not NaN but carries no sentence.
            continue
        data_pairs.append({
            "spanish": spanish,  # Spanish (source column)
            "quechua": quechua   # Quechua (target column)
        })

    print(f"\nTotal de pares válidos cargados: {len(data_pairs)}")

    # Basic statistics: mean length in characters of each side.
    if data_pairs:
        avg_len_spanish = sum(len(pair['spanish']) for pair in data_pairs) / len(data_pairs)
        avg_len_quechua = sum(len(pair['quechua']) for pair in data_pairs) / len(data_pairs)
        print("\nLongitud promedio:")
        print(f"- Español: {avg_len_spanish:.1f} caracteres")
        print(f"- Quechua: {avg_len_quechua:.1f} caracteres")

    return Dataset.from_list(data_pairs)

# Path to the CSV corpus file
csv_file_path = r"C:\DATA\FERNANDOHC\EDUCACION\MAESTRIA\UNI_MAI\SEMESTRE_3\MIA-204ProyectoDeInvestigacion1\Proyecto_Traductor_Esp_Quechua\datos\corpus_trad.csv"

# Load the data
raw_dataset = load_and_prepare_data(csv_file_path)

# Detailed check of what was loaded
if len(raw_dataset) > 0:
    # First few pairs, in file order (at most three).
    print("\nEjemplos cargados (formato español -> quechua):")
    for i in range(min(3, len(raw_dataset))):
        print(f"\nEjemplo {i+1}:")
        print(f"Español: {raw_dataset[i]['spanish']}")
        print(f"Quechua: {raw_dataset[i]['quechua']}")

    # Random spot check. The sample size is capped at the dataset size because
    # select() rejects out-of-range indices, and the shuffle is seeded so that
    # re-running the notebook prints the same sample.
    sample_size = min(5, len(raw_dataset))
    sample = raw_dataset.shuffle(seed=42).select(range(sample_size))
    print("\nMuestra aleatoria:")
    for example in sample:
        print(f"\nEspañol: {example['spanish']}")
        print(f"Quechua: {example['quechua']}")
else:
    # Nothing usable was loaded: print a short troubleshooting checklist.
    print("\nNo se cargaron datos válidos. Verifica:")
    print("1. Que las columnas se llamen 'source' y 'target'")
    print("2. Que el archivo contenga datos")
    print("3. Que no haya caracteres especiales corruptos")

In [None]:
# Evaluation settings. NUM_FOLDS and RANDOM_STATE are the values later passed
# to KFold; MODEL_TYPE is a label that the visible code only assigns.
MODEL_TYPE = "BART"
NUM_FOLDS = 5
RANDOM_STATE = 42

# Metric objects obtained through evaluate.load (imported above as `load`);
# compute_metrics calls .compute(...) on both of them.
metric_bleu = load("bleu")
metric_rouge = load("rouge")

def compute_metrics(eval_preds, current_tokenizer=None):
    """Decode predicted and reference token ids and score them.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.
        current_tokenizer: Tokenizer used for decoding. When omitted, the
            module-level name ``tokenizer`` is used (the original behavior),
            so that name must be bound before the trainer calls this function.

    Returns:
        dict with the keys ``"bleu"`` and ``"rouge-l"``, taken from the
        ``"bleu"`` and ``"rougeL"`` entries of the two metric results.
    """
    active_tokenizer = tokenizer if current_tokenizer is None else current_tokenizer

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. Swap it for the pad token in BOTH arrays before decoding; the
    # pad token is then dropped by skip_special_tokens=True.
    pad_id = active_tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = active_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = active_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU receives a list of references per prediction (a single one here);
    # ROUGE receives the flat list of reference strings.
    references_bleu = [[label] for label in decoded_labels]

    bleu_results = metric_bleu.compute(predictions=decoded_preds, references=references_bleu)
    rouge_results = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_results["bleu"],
        "rouge-l": rouge_results["rougeL"],
    }

In [None]:
!pip install --upgrade transformers[torch] accelerate evaluate sacrebleu rouge_score
!pip install --force-reinstall accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold
import numpy as np
from datasets import Dataset
import torch

# --- Configuration ---
# NOTE: these three constants are assigned the same values in an earlier cell;
# NUM_FOLDS and RANDOM_STATE feed the KFold splitter below.
MODEL_TYPE = "BART"
NUM_FOLDS = 5
RANDOM_STATE = 42

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# --- Preprocessing function ---
def preprocess_function(examples, current_tokenizer, max_length=128):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    The ``"quechua"`` texts are encoded as the model input and the
    ``"spanish"`` texts as the target. Both sides are truncated and padded to
    ``max_length``. Padding positions in the labels are replaced by -100 so
    the loss ignores them instead of scoring the model on pad tokens.

    NOTE(review): the data-loading cell describes the corpus as
    "español -> quechua", while this function encodes Quechua as the input and
    Spanish as the target. Confirm which direction the evaluated checkpoints
    expect.

    Args:
        examples: Mapping with ``"quechua"`` and ``"spanish"`` entries (lists
            of strings when used with ``Dataset.map(batched=True)``).
        current_tokenizer: Callable tokenizer that accepts ``text_target=`` and
            exposes ``pad_token_id``.
        max_length: Length both sides are truncated/padded to (default 128).

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry.
    """
    inputs = current_tokenizer(
        examples["quechua"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    labels = current_tokenizer(
        text_target=examples["spanish"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    pad_id = current_tokenizer.pad_token_id

    def _mask_padding(sequence):
        # Replace pad ids with -100, the value compute_metrics already maps
        # back to the pad token before decoding.
        return [(-100 if token_id == pad_id else token_id) for token_id in sequence]

    label_ids = labels["input_ids"]
    if len(label_ids) > 0 and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of ids.
        inputs["labels"] = _mask_padding(label_ids)
    else:
        inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    return inputs

# --- K-Fold cross-validation ---
# Each fold evaluates the saved baseline BART checkpoint on that fold's
# validation split; the training indices are not used in this section.
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Per-fold scores, one entry per fold (0.0 is stored when a fold fails).
bleu_scores_baseline = []
rouge_l_scores_baseline = []

bleu_scores_refined = []
rouge_l_scores_refined = []

for fold, (train_index, val_index) in enumerate(kf.split(raw_dataset)):
    print(f"\n--- Iniciando Fold {fold + 1}/{NUM_FOLDS} ---")
    val_dataset_fold = raw_dataset.select(val_index)

    # --- Evaluation of the baseline BART model ---
    print(f"Evaluando Modelo BART Baseline (inicial) en Fold {fold + 1}...")
    try:
        model_path_baseline = r"C:\DATA\FERNANDOHC\EDUCACION\MAESTRIA\UNI_MAI\SEMESTRE_3\MIA-204ProyectoDeInvestigacion1\Proyecto_Traductor_Esp_Quechua\modelos\BART\v1"

        # Load the model and its tokenizer
        tokenizer_baseline = AutoTokenizer.from_pretrained(model_path_baseline)
        model_baseline = AutoModelForSeq2SeqLM.from_pretrained(model_path_baseline).to(device)

        # compute_metrics decodes with the module-level name `tokenizer`.
        # Bind it to the tokenizer of the model being evaluated; otherwise the
        # metric callback raises NameError on a fresh kernel and the except
        # branch below silently records 0.0 for the fold.
        tokenizer = tokenizer_baseline

        # Tokenize the validation split and drop the raw text columns
        tokenized_val_dataset_baseline = val_dataset_fold.map(
            lambda examples: preprocess_function(examples, tokenizer_baseline),
            batched=True,
            remove_columns=["quechua", "spanish"]
        )

        # Trainer configuration used for evaluation only
        training_args = Seq2SeqTrainingArguments(
            output_dir=r"./temp_eval_baseline",
            per_device_eval_batch_size=8,
            predict_with_generate=True,  # score generated text, not logits
            fp16=torch.cuda.is_available(),
            report_to="none"
        )

        # NOTE(review): recent transformers releases rename the `tokenizer`
        # argument to `processing_class` - confirm against the installed version.
        trainer_baseline = Seq2SeqTrainer(
            model=model_baseline,
            args=training_args,
            tokenizer=tokenizer_baseline,
            compute_metrics=compute_metrics
        )

        # Evaluation
        eval_results_baseline = trainer_baseline.evaluate(tokenized_val_dataset_baseline)

        # Store the results (the trainer prefixes metric names with "eval_")
        bleu_score = eval_results_baseline.get("eval_bleu", 0.0)
        rouge_score = eval_results_baseline.get("eval_rouge-l", 0.0)

        bleu_scores_baseline.append(bleu_score)
        rouge_l_scores_baseline.append(rouge_score)

        print(f"  BART Baseline - BLEU: {bleu_score:.4f}, ROUGE-L: {rouge_score:.4f}")

    except Exception as e:
        # Best-effort: report the failure and keep one entry per fold so the
        # score lists stay aligned with the fold numbers.
        print(f"  Error evaluando BART Baseline en Fold {fold + 1}: {str(e)}")
        bleu_scores_baseline.append(0.0)
        rouge_l_scores_baseline.append(0.0)