In [None]:
import torch
# Release unused memory cached by PyTorch's CUDA allocator before the heavy
# cells below run (mainly useful when re-running the notebook in one kernel).
torch.cuda.empty_cache()

In [None]:
# Paso 2: Importaciones
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig
import torch
import json

In [None]:
# Step 3: load the custom JSONL dataset, one file per split.
from datasets import load_dataset

data_files = dict(
    train='dataset_train_filtrado.jsonl',
    validation='dataset_valid_filtrado.jsonl',
    test='dataset_test_filtrado.jsonl',
)
raw_datasets = load_dataset('json', data_files=data_files)

# For a quick smoke test, keep only the first 20 rows of each split:
# for split_name in ('train', 'validation', 'test'):
#     raw_datasets[split_name] = raw_datasets[split_name].select(range(20))

# Sanity check: show the first two training rows.
print(raw_datasets['train'][:2])

In [None]:
# Show the shape of every split in the loaded dataset.
raw_datasets.shape

In [None]:
# Display the summary (repr) of the train split.
raw_datasets['train']

In [None]:
# Peek at the first two values of 'code_clean', the column fed into the prompt.
raw_datasets['train']['code_clean'][:2]

In [None]:
# Peek at the first two values of 'code', the column used as the training target.
raw_datasets['train']['code'][:2]

In [None]:
# Step 4: tokenizer
# Checkpoint identifier; the same name is reused later to load the model weights.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Token budgets used by preprocess_function: prompts and targets are padded
# and truncated to exactly these lengths.
max_input_length = 512
max_target_length = 512


In [None]:
def preprocess_function(example):
    """Tokenize one dataset row into model inputs plus loss-ready labels.

    The instruction prompt is built around ``example['code_clean']`` and the
    target is ``example['code']``. Both are padded/truncated to
    ``max_input_length`` / ``max_target_length`` respectively.

    Padding positions in the labels are replaced by -100 so the
    cross-entropy loss ignores them; leaving the pad id in place would make
    the model spend most of its loss on predicting padding.

    Args:
        example: A single (unbatched) row with string fields ``'code_clean'``
            and ``'code'``. The function is meant to be used with
            ``Dataset.map(..., batched=False)``.

    Returns:
        The tokenizer output for the prompt with an added ``"labels"`` entry
        (a flat list of token ids where padding is -100).
    """
    # NOTE(review): truncation presumably cuts from the right, so for long
    # functions the trailing "### Return ..." marker may be dropped — confirm.
    prompt = (
        "You are a professional Python developer. Your task is to add a clear, concise docstring and relevant inline comments "
        "to the following Python function. Do not modify the structure or logic of the code. Keep formatting, indentation, and line breaks exactly as they are. "
        "Return only the complete commented function, as valid Python code:\n\n"
        f"{example['code_clean']}\n\n### Return the commented version below:"
    )

    # `model_inputs` instead of `input` to avoid shadowing the builtin.
    model_inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=max_input_length)
    target = tokenizer(example['code'], padding="max_length", truncation=True, max_length=max_target_length)

    # Mask padding so it does not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in target["input_ids"]
    ]
    return model_inputs


In [None]:
# Tokenize every split one row at a time: preprocess_function expects a single
# example (not a batch), hence batched=False.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=False)

In [None]:
# Step 6: load the base model that LoRA adapters will be attached to.
# NOTE: the original heading mentioned 8-bit loading, but no quantization
# config is passed here; the weights are loaded in float32.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,  # or whichever checkpoint you use
    device_map=None,  # alternatives: "auto", or {"": "cpu"} for CPU-only
    torch_dtype=torch.float32,  # float16 is no longer needed
) # optionally: .to("cpu")

In [None]:
# List the feed-forward and self-attention submodules by name, to help pick
# LoRA target modules.
wanted_fragments = ("DenseReluDense", "SelfAttention")
for module_name, _module in model.named_modules():
    if any(fragment in module_name for fragment in wanted_fragments):
        print(module_name)


In [None]:
# Step 7: configure LoRA for parameter-efficient training.
# LoRA trains only small adapter matrices attached to the listed modules.
# The names below are the attention projections (q, k, v, o) and the
# feed-forward layers (wi_0, wi_1, wo) targeted in flan-T5.
peft_config = LoraConfig(
    r=16,  # rank of the adapter matrices
    lora_alpha=32,  # LoRA scaling factor
    target_modules = ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],  # modules compatible with T5
    lora_dropout=0.05,
    bias="none",  # do not train bias terms
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Wrap the base model with the adapters; `model` now refers to the PEFT model.
model = get_peft_model(model, peft_config)

In [None]:
# Step 8: data collator for seq2seq tasks. It batches the tokenized examples;
# the model is passed so the collator can prepare decoder inputs from labels.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Step 9: training arguments

training_args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-large_model2",  # checkpoints are written here
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch (matches eval_strategy)
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=30,
    save_total_limit=2,  # cap on the number of checkpoints kept on disk
    # NOTE(review): no compute_metrics is given to the trainer below, so these
    # generation settings presumably have no effect during evaluation — confirm.
    predict_with_generate=True,
    generation_max_length=512,
    #report_to=["csv"], 
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=10,  # log the training loss every 10 steps
    fp16=False,
    push_to_hub=False,
    # Model selection: reload the checkpoint with the lowest eval_loss at the end.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
)


In [None]:
# Step 10: initialize the Trainer and train
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=collator,
    # Optional arguments left disabled:
    #label_names=["labels"],
    # tokenizer=tokenizer
)

# Run fine-tuning; the per-step logs are kept in trainer.state.log_history.
trainer.train()

## Guardando los logs

In [None]:
import pandas as pd

# Collect the trainer's log history (a list of per-step/per-epoch records)
log_history = trainer.state.log_history
log_df = pd.DataFrame(log_history)

# Save it as CSV so it can be plotted without re-running training
log_df.to_csv("training_log_2.csv", index=False)
print("Historial guardado en training_log_2.csv")


## Graficando la evolución de eval_loss

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the saved training log and keep only the rows that carry an
# evaluation loss (training-step rows have it empty).
log_df = pd.read_csv("training_log_2.csv")
eval_logs = log_df.dropna(subset=["eval_loss"])

# Draw eval loss against the global step using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(eval_logs["step"], eval_logs["eval_loss"], marker="o", label="Eval Loss")
ax.set_title("Evolución del Eval Loss por Step")
ax.set_xlabel("Step")
ax.set_ylabel("Eval Loss")
ax.grid(True)
ax.legend()
plt.show()


## Evaluar métricas con BLEU, ROUGE y METEOR

In [None]:
# Install the metric dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `pip` line relies on automagic).
%pip install evaluate nltk rouge-score

In [None]:
import evaluate
import pandas as pd
import nltk
nltk.download('punkt')  # needed for METEOR

# Load the dataset that contains the commented reference code
df = pd.read_json("dataset_train_filtrado.jsonl", lines=True)

# References and predictions
# NOTE(review): the "predictions" here are the uncommented inputs
# ('code_clean'), not model outputs, so the scores below measure a
# copy-the-input baseline on the training file — confirm this is intended.
references = df["code"].tolist()
predictions = df["code_clean"].tolist()

# Initialize the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Compute each metric over the whole file
bleu_score = bleu.compute(predictions=predictions, references=references)
rouge_score = rouge.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=references)

print("BLEU:", bleu_score)
print("ROUGE:", rouge_score)
print("METEOR:", meteor_score)


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): codebleu is not imported in any visible cell — confirm it is
# still needed.
%pip install codebleu


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
import nltk
import evaluate

nltk.download("punkt")  # needed for METEOR

# Load the fine-tuned model and its tokenizer.
# NOTE(review): this directory is written by the "save the model" cell further
# down the notebook; on a fresh run that cell must be executed first.
model_path = "flan-t5-best-model-2"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Evaluate on the held-out test split. Scoring the model on the file it was
# trained on would report inflated metrics.
df = pd.read_json("dataset_test_filtrado.jsonl", lines=True)
df = df[:100]  # cap the number of examples to keep generation time reasonable


def build_prompt(code_clean):
    """Return the exact instruction prompt used during fine-tuning.

    The model was trained with this wording, so inference has to use the same
    text; a different prompt is out of distribution for the adapters.
    """
    return (
        "You are a professional Python developer. Your task is to add a clear, concise docstring and relevant inline comments "
        "to the following Python function. Do not modify the structure or logic of the code. Keep formatting, indentation, and line breaks exactly as they are. "
        "Return only the complete commented function, as valid Python code:\n\n"
        f"{code_clean}\n\n### Return the commented version below:"
    )


# Generate one prediction per example.
preds = []
for code_clean in tqdm(df["code_clean"].tolist(), desc="Generando comentarios"):
    prompt = build_prompt(code_clean)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    outputs = model.generate(**inputs, max_length=512)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    preds.append(pred)

# References: the human-commented versions of the same functions.
refs = df["code"].tolist()

# Score the generations with the standard text-overlap metrics.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

bleu_score = bleu.compute(predictions=preds, references=refs)
rouge_score = rouge.compute(predictions=preds, references=refs)
meteor_score = meteor.compute(predictions=preds, references=refs)

print("BLEU:", bleu_score)
print("ROUGE:", rouge_score)
print("METEOR:", meteor_score)


## Guardar el modelo manualmente al final

In [None]:
# Save the trained model and the tokenizer to the same directory.
# NOTE(review): `model` is a PEFT-wrapped model, so this presumably writes the
# LoRA adapter weights rather than a full checkpoint — confirm.
trainer.save_model("flan-t5-best-model-2")  # model weights/config
tokenizer.save_pretrained("flan-t5-best-model-2")


## Cargar el modelo luego para hacer inferencias

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the saved model and tokenizer from disk for inference.
model = AutoModelForSeq2SeqLM.from_pretrained("flan-t5-best-model-2")
tokenizer = AutoTokenizer.from_pretrained("flan-t5-best-model-2")

model.eval()  # evaluation mode (disables dropout)


In [None]:
def generate_comment(code_clean, max_length=512):
    """Generate the commented version of a Python function.

    The prompt is the exact instruction text used during fine-tuning. The
    model learned to respond to that wording, so a differently phrased prompt
    at inference time would be out of distribution.

    Args:
        code_clean: Source code of the function without comments.
        max_length: Maximum length (in tokens) of the generated sequence.

    Returns:
        The decoded model output with special tokens removed.
    """
    prompt = (
        "You are a professional Python developer. Your task is to add a clear, concise docstring and relevant inline comments "
        "to the following Python function. Do not modify the structure or logic of the code. Keep formatting, indentation, and line breaks exactly as they are. "
        "Return only the complete commented function, as valid Python code:\n\n"
        f"{code_clean}\n\n### Return the commented version below:"
    )
    # Explicit max_length keeps the input budget identical to training (512).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: run the generator on a small, uncommented primality check.
test_code = """
def is_prime(n):
    if n <= 1:
        return False
    for i in range(2, int(n ** 0.5) + 1):
        if n % i == 0:
            return False
    return True"""
print(generate_comment(test_code))
