In [None]:
# Mount Google Drive at /content/drive; every dataset and model path used in
# this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Primer entrenamiento

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import os

# Paths: line-aligned source/target training files and the directory that
# receives the fine-tuned model (created up front if it does not exist).
train_input_path = "/content/drive/MyDrive/TFG/Dataset/train.txt.src"
train_output_path = "/content/drive/MyDrive/TFG/Dataset/train.txt.tgt.tagged"
output_dir = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v1"
os.makedirs(output_dir, exist_ok=True)

# Load the base "t5-base" model and its tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Load the parallel source/target files into a Dataset
def load_dataset(input_path, output_path):
    """Read two line-aligned UTF-8 text files into a ``Dataset``.

    Args:
        input_path: Path to the file holding one source text per line.
        output_path: Path to the file holding one target text per line.

    Returns:
        A ``Dataset`` with an "input" and an "output" column; every entry is
        the corresponding line with surrounding whitespace stripped.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(input_path, "r", encoding="utf-8") as source_file:
        source_lines = source_file.readlines()
    with open(output_path, "r", encoding="utf-8") as target_file:
        target_lines = target_file.readlines()

    # Examples are paired by line position, so both files must be equally long.
    assert len(source_lines) == len(target_lines), "❌ Input/Output no coinciden en longitud"

    columns = {
        "input": [line.strip() for line in source_lines],
        "output": [line.strip() for line in target_lines],
    }
    return Dataset.from_dict(columns)

# Build the raw (not yet tokenized) training dataset from the two training files.
raw_dataset = load_dataset(train_input_path, train_output_path)

# Tokenization with separate max lengths for source (192) and target (512)
def tokenize(example):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Intended for ``Dataset.map(tokenize, batched=True)``. Relies on the
    module-level ``tokenizer``.

    Args:
        example: Batch mapping with an "input" and an "output" entry, each a
            list of strings.

    Returns:
        The tokenizer output for the sources (padded/truncated to 192 tokens)
        with an added "labels" entry: the target token ids padded/truncated to
        512 tokens, where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=192)
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=example["output"], padding="max_length", truncation=True, max_length=512)

    # Padding must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores. Leaving the pad id in place makes the model
    # spend most of its training signal on predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset in batches and expose only the tensor columns
# as torch tensors.
train_dataset = raw_dataset.map(tokenize, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Training configuration: 2 epochs, batch size 1; no intermediate checkpoints
# (save_strategy="no") because the model is saved explicitly after training.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="no",
    logging_dir=f"{output_dir}/logs",
    remove_unused_columns=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Run training
trainer.train()

# Final save: model weights and tokenizer files go to the same directory so
# later cells can reload both from output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Entrenamiento finalizado. Modelo guardado en: {output_dir}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.3691
1000,0.3781
1500,0.3746
2000,0.3737
2500,0.3532
3000,0.3676
3500,0.3651
4000,0.3639
4500,0.3689
5000,0.3418


✅ Entrenamiento finalizado. Modelo guardado en: /content/drive/MyDrive/TFG/Resultados/T5_entrenado_v1


# Primera validación

In [None]:
import os

import evaluate
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path to the fine-tuned model under evaluation
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v1"

# Validation split: sources and tagged targets, paired by line position
val_input_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.src"
val_output_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.tgt.tagged"

# Only the first examples are scored so that validation stays quick
NUM_VAL_EXAMPLES = 300

# Fail early with an actionable message: a missing directory (Drive not
# mounted, training cell not run) otherwise surfaces as an opaque Hub
# repo-id error raised from inside from_pretrained.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No se encontró el directorio del modelo: {model_path}. "
        "Monta Google Drive y ejecuta antes la celda de entrenamiento correspondiente."
    )

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Read examples (one per line, surrounding whitespace stripped)
with open(val_input_path, "r", encoding="utf-8") as f:
    val_inputs = [line.strip() for line in f]
with open(val_output_path, "r", encoding="utf-8") as f:
    val_targets = [line.strip() for line in f]

# Predictions are compared with references by position, so misaligned files
# would silently produce meaningless scores.
if len(val_inputs) != len(val_targets):
    raise ValueError(
        "Los archivos de validación no tienen el mismo número de líneas: "
        f"{len(val_inputs)} entradas frente a {len(val_targets)} referencias."
    )

val_inputs = val_inputs[:NUM_VAL_EXAMPLES]
val_targets = val_targets[:NUM_VAL_EXAMPLES]

# Generate predictions one example at a time (beam search, 4 beams)
generated_outputs = []
for text in val_inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=192).input_ids.to(device)
    output_ids = model.generate(input_ids, max_length=512, num_beams=4)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    generated_outputs.append(decoded)

# ROUGE evaluation, reported as percentages rounded to two decimals
rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=generated_outputs,
    references=val_targets,
    use_stemmer=True
)
results = {k: round(v * 100, 2) for k, v in results.items()}

# Show results; the header reports the number of examples actually scored
print(f"MÉTRICAS ROUGE – VALIDACIÓN (val.txt – {len(val_inputs)} ejemplos):")
for k, v in results.items():
    print(f"{k.upper()}: {v}")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

MÉTRICAS ROUGE – VALIDACIÓN (val.txt – 300 ejemplos):
ROUGE1: 11.9
ROUGE2: 2.23
ROUGEL: 10.26
ROUGELSUM: 10.26


# Segundo entrenamiento

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import os

# Paths: the v1 model is the starting point, v2 is where the result is written
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v1"
output_dir = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v2"
os.makedirs(output_dir, exist_ok=True)

# Load the previously fine-tuned model and its tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Dataset files (input = .src / output = .tgt.tagged)
train_input = "/content/drive/MyDrive/TFG/Dataset/train.txt.src"
train_output = "/content/drive/MyDrive/TFG/Dataset/train.txt.tgt.tagged"

# Load the dataset
def load_dataset(input_path, output_path):
    """Build a ``Dataset`` from a source file and a target file aligned by line.

    Args:
        input_path: UTF-8 text file with one source text per line.
        output_path: UTF-8 text file with one target text per line.

    Returns:
        A ``Dataset`` whose "input" and "output" columns hold the
        whitespace-stripped lines of the respective files.

    Raises:
        AssertionError: If the files differ in their number of lines.
    """
    def read_stripped(path):
        # One entry per line of the file, without surrounding whitespace.
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    sources = read_stripped(input_path)
    targets = read_stripped(output_path)
    assert len(sources) == len(targets), "⚠️ Los archivos no tienen la misma cantidad de líneas"
    return Dataset.from_dict({"input": sources, "output": targets})

# Tokenization of sources and targets with loss-masked label padding
def tokenize(example):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Intended for ``Dataset.map(tokenize, batched=True)``. Relies on the
    module-level ``tokenizer``.

    Args:
        example: Batch mapping with an "input" and an "output" entry, each a
            list of strings.

    Returns:
        The tokenizer output ("input_ids", "attention_mask", "labels") in which
        every padding position of "labels" is replaced by -100.
    """
    encoded = tokenizer(
        example["input"],
        text_target=example["output"],
        padding="max_length",
        truncation=True,
        # In a single call this limit applies to the sources AND the targets.
        # NOTE(review): targets are therefore cut at 192 tokens — confirm this
        # is intended; tokenize them in a separate call to use another limit.
        max_length=192,
    )

    # Padding must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores. Leaving the pad id in place trains the model
    # to predict padding.
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in encoded["labels"]
    ]
    return encoded

# Load the training pairs, tokenize them in batches and expose only the
# tensor columns as torch tensors.
train_dataset = load_dataset(train_input, train_output)
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Training parameters (1 more epoch on top of the v1 model); checkpoints are
# requested once per epoch and limited to 2 on disk.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    remove_unused_columns=False,
    save_total_limit=2
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Continue training
trainer.train()

# Save model and tokenizer to the same directory so later cells can reload both
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Reentrenamiento completado. Modelo guardado en:\n{output_dir}")


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.8777
1000,0.8367
1500,0.8684
2000,0.8701
2500,0.8531
3000,0.8821
3500,0.8801
4000,0.8697
4500,0.8699
5000,0.8407


✅ Reentrenamiento completado. Modelo guardado en:
/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v2


# Segunda validación

In [None]:
import os

import evaluate
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path to the fine-tuned model under evaluation
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v2"

# Validation split: sources and tagged targets, paired by line position
val_input_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.src"
val_output_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.tgt.tagged"

# Only the first examples are scored so that validation stays quick
NUM_VAL_EXAMPLES = 300

# Fail early with an actionable message: a missing directory (Drive not
# mounted, training cell not run) otherwise surfaces as an opaque Hub
# repo-id error raised from inside from_pretrained.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No se encontró el directorio del modelo: {model_path}. "
        "Monta Google Drive y ejecuta antes la celda de entrenamiento correspondiente."
    )

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Read examples (one per line, surrounding whitespace stripped)
with open(val_input_path, "r", encoding="utf-8") as f:
    val_inputs = [line.strip() for line in f]
with open(val_output_path, "r", encoding="utf-8") as f:
    val_targets = [line.strip() for line in f]

# Predictions are compared with references by position, so misaligned files
# would silently produce meaningless scores.
if len(val_inputs) != len(val_targets):
    raise ValueError(
        "Los archivos de validación no tienen el mismo número de líneas: "
        f"{len(val_inputs)} entradas frente a {len(val_targets)} referencias."
    )

val_inputs = val_inputs[:NUM_VAL_EXAMPLES]
val_targets = val_targets[:NUM_VAL_EXAMPLES]

# Generate predictions one example at a time (beam search, 4 beams)
generated_outputs = []
for text in val_inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=192).input_ids.to(device)
    output_ids = model.generate(input_ids, max_length=512, num_beams=4)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    generated_outputs.append(decoded)

# ROUGE evaluation, reported as percentages rounded to two decimals
rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=generated_outputs,
    references=val_targets,
    use_stemmer=True
)
results = {k: round(v * 100, 2) for k, v in results.items()}

# Show results; the header reports the number of examples actually scored
print(f"MÉTRICAS ROUGE – VALIDACIÓN (val.txt – {len(val_inputs)} ejemplos):")
for k, v in results.items():
    print(f"{k.upper()}: {v}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

MÉTRICAS ROUGE – VALIDACIÓN (val.txt – 300 ejemplos):
ROUGE1: 11.63
ROUGE2: 1.96
ROUGEL: 10.28
ROUGELSUM: 10.25


# Tercer entrenamiento

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import os

# Paths: the v2 model is the starting point, v3 is where the result is written
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v2"
output_dir = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v3"
os.makedirs(output_dir, exist_ok=True)

# Load the previously fine-tuned model and its tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Dataset files (input = title + abstract, output = summary)
train_input = "/content/drive/MyDrive/TFG/Dataset/train.txt.src"
train_output = "/content/drive/MyDrive/TFG/Dataset/train.txt.tgt.tagged"

# Load the dataset from the source/target files
def load_dataset(input_path, output_path):
    """Pair the lines of a source file and a target file into a ``Dataset``.

    Args:
        input_path: UTF-8 text file with one source text per line.
        output_path: UTF-8 text file with one target text per line.

    Returns:
        A ``Dataset`` with the columns "input" and "output", each holding the
        lines of the respective file with surrounding whitespace removed.

    Raises:
        AssertionError: If the two files contain a different number of lines.
    """
    with open(input_path, "r", encoding="utf-8") as src, \
            open(output_path, "r", encoding="utf-8") as tgt:
        raw_inputs = src.readlines()
        raw_outputs = tgt.readlines()

    # Rows are matched by position, so the line counts have to agree.
    assert len(raw_inputs) == len(raw_outputs), "❌ Inputs y outputs no coinciden en tamaño"

    stripped_inputs = list(map(str.strip, raw_inputs))
    stripped_outputs = list(map(str.strip, raw_outputs))
    return Dataset.from_dict({"input": stripped_inputs, "output": stripped_outputs})

# Tokenization of sources and targets with loss-masked label padding
def tokenize(example):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Intended for ``Dataset.map(tokenize, batched=True)``. Relies on the
    module-level ``tokenizer``.

    Args:
        example: Batch mapping with an "input" and an "output" entry, each a
            list of strings.

    Returns:
        The tokenizer output ("input_ids", "attention_mask", "labels") in which
        every padding position of "labels" is replaced by -100.
    """
    batch_encoding = tokenizer(
        example["input"],
        text_target=example["output"],
        padding="max_length",
        truncation=True,
        # A single call applies this limit to the sources AND the targets.
        # NOTE(review): targets are therefore cut at 192 tokens — confirm this
        # is intended; tokenize them in a separate call to use another limit.
        max_length=192,
    )

    # -100 is the index the cross-entropy loss ignores; without this masking
    # the padded label positions are trained on like real tokens.
    pad_id = tokenizer.pad_token_id
    batch_encoding["labels"] = [
        [-100 if token == pad_id else token for token in label_ids]
        for label_ids in batch_encoding["labels"]
    ]
    return batch_encoding

# Preprocessing: load the training pairs, tokenize them in batches and expose
# only the tensor columns as torch tensors.
train_dataset = load_dataset(train_input, train_output)
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Training arguments (1 final epoch on top of the v2 model)
# NOTE(review): with save_strategy="no" no checkpoints should be written, so
# save_total_limit presumably has no effect here — confirm and drop it.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="no",
    logging_dir=f"{output_dir}/logs",
    remove_unused_columns=False,
    save_total_limit=2
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Run training
trainer.train()

# Final save: model and tokenizer go to the same directory so later cells can
# reload both from output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Entrenamiento FINAL completado. Modelo v3 guardado en:\n{output_dir}")


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.7923
1000,0.755
1500,0.7864
2000,0.7919
2500,0.7767
3000,0.8077
3500,0.8084
4000,0.8027
4500,0.8035
5000,0.7806


✅ Entrenamiento FINAL completado. Modelo v3 guardado en:
/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v3


In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=3a5930fc50197db0df8e35e7c6dc490269e04665581b230a26cc2b72781e817b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


# Tercera validación

In [None]:
import os

import evaluate
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path to the fine-tuned model under evaluation
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v3"

# Validation split: sources and tagged targets, paired by line position
val_input_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.src"
val_output_path = "/content/drive/MyDrive/TFG/Dataset/val.txt.tgt.tagged"

# Only the first examples are scored so that validation stays quick
NUM_VAL_EXAMPLES = 300

# Fail early with an actionable message: when the directory is missing (Drive
# not mounted, training cell not run) from_pretrained treats the path as a Hub
# repo id and fails with an HFValidationError that hides the real cause.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No se encontró el directorio del modelo: {model_path}. "
        "Monta Google Drive y ejecuta antes la celda de entrenamiento correspondiente."
    )

# Load model and tokenizer strictly from the local directory (no Hub access)
model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Read examples (one per line, surrounding whitespace stripped)
with open(val_input_path, "r", encoding="utf-8") as f:
    val_inputs = [line.strip() for line in f]
with open(val_output_path, "r", encoding="utf-8") as f:
    val_targets = [line.strip() for line in f]

# Predictions are compared with references by position, so misaligned files
# would silently produce meaningless scores.
if len(val_inputs) != len(val_targets):
    raise ValueError(
        "Los archivos de validación no tienen el mismo número de líneas: "
        f"{len(val_inputs)} entradas frente a {len(val_targets)} referencias."
    )

val_inputs = val_inputs[:NUM_VAL_EXAMPLES]
val_targets = val_targets[:NUM_VAL_EXAMPLES]

# Generate predictions one example at a time (beam search, 4 beams)
generated_outputs = []
for text in val_inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=192).input_ids.to(device)
    output_ids = model.generate(input_ids, max_length=512, num_beams=4)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    generated_outputs.append(decoded)

# ROUGE evaluation, reported as percentages rounded to two decimals
rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=generated_outputs,
    references=val_targets,
    use_stemmer=True
)
results = {k: round(v * 100, 2) for k, v in results.items()}

# Show results; the header reports the number of examples actually scored
print(f"MÉTRICAS ROUGE – VALIDACIÓN (val.txt – {len(val_inputs)} ejemplos):")
for k, v in results.items():
    print(f"{k.upper()}: {v}")

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v3'. Use `repo_type` argument if needed.

# Test

In [None]:
import os

import evaluate
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path to the fine-tuned model under evaluation
model_path = "/content/drive/MyDrive/TFG/Resultados/T5_entrenado_v3"

# Test split: sources and tagged targets, paired by line position
test_input_path = "/content/drive/MyDrive/TFG/Dataset/test.txt.src"
test_output_path = "/content/drive/MyDrive/TFG/Dataset/test.txt.tgt.tagged"

# Only the first examples are scored (limit kept from the original cell,
# which introduced it to avoid RAM problems)
NUM_TEST_EXAMPLES = 300

# Fail early with an actionable message: a missing directory (Drive not
# mounted, training cell not run) otherwise surfaces as an opaque Hub
# repo-id error raised from inside from_pretrained.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No se encontró el directorio del modelo: {model_path}. "
        "Monta Google Drive y ejecuta antes la celda de entrenamiento correspondiente."
    )

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data (one example per line, surrounding whitespace stripped)
with open(test_input_path, "r", encoding="utf-8") as f:
    test_inputs = [line.strip() for line in f]
with open(test_output_path, "r", encoding="utf-8") as f:
    test_targets = [line.strip() for line in f]

# Predictions are compared with references by position, so misaligned files
# would silently produce meaningless scores.
if len(test_inputs) != len(test_targets):
    raise ValueError(
        "Los archivos de test no tienen el mismo número de líneas: "
        f"{len(test_inputs)} entradas frente a {len(test_targets)} referencias."
    )

test_inputs = test_inputs[:NUM_TEST_EXAMPLES]
test_targets = test_targets[:NUM_TEST_EXAMPLES]

# Generate predictions one example at a time (beam search, 4 beams)
generated_outputs = []
for text in test_inputs:
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=192).input_ids.to(device)
    output_ids = model.generate(input_ids, max_length=512, num_beams=4)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    generated_outputs.append(decoded)

# ROUGE evaluation, reported as percentages rounded to two decimals
rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=generated_outputs,
    references=test_targets,
    use_stemmer=True
)
results = {k: round(v * 100, 2) for k, v in results.items()}

# Show results; the header reports the number of examples actually scored
print(f"MÉTRICAS ROUGE – TEST (test.txt – {len(test_inputs)} ejemplos):")
for k, v in results.items():
    print(f"{k.upper()}: {v}")
