In [1]:
from unsloth import FastLanguageModel
import torch

SYSTEM_PROMPT = "Eres un modelo entrenado para generar resúmenes institucionales de actas parlamentarias. Los resúmenes deben estar redactados en lenguaje formal-administrativo, sin juicios de valor, y seguir una estructura clara."
INSTRUCTION = "Redacta un resumen institucional en español del siguiente documento. Mantén un lenguaje objetivo, enfocado en los hechos y acuerdos:"

model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name="meta-llama/Llama-3.2-1B-Instruct",
    #model_name="BSC-LT/salamandra-2b-instruct",
    #model_name="Qwen/Qwen3-0.6B",
    model_name="Qwen/Qwen3-8B",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True, # quantization
)
tokenizer.clean_up_tokenization_spaces = False

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-03 21:28:52 [__init__.py:243] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.9.0.1.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 1. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [24]:
from transformers import TextStreamer

import re

def extract_clean_assistant_response(full_text: str) -> str:
    # Buscar el último bloque <|assistant|>
    assistant_start = full_text.rfind("assistant")
    if assistant_start == -1:
        assistant_content = full_text
    else:
        assistant_content = full_text[assistant_start + len("assistant"):]

    # Eliminar los bloques <think>...</think> si existen
    assistant_content = re.sub(r"<think>.*?</think>", "", assistant_content, flags=re.DOTALL)

    # Eliminar espacios extra al principio y al final
    return assistant_content.strip()

def apply_chat_template(sample, tokenizer):
    """
    Apply a chat template to the sample.
    """
    # Define the chat template
    empty_prompt = f"{INSTRUCTION}\n ##Documento {{document}}\n ##Resumen:"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": empty_prompt.format(document=sample["document"])},
    ]
    
    # Format the chat template with the sample text
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


## generate a summary
def generate_summary_streamer(model, tokenizer, sample, max_new_tokens=100):
    FastLanguageModel.for_inference(model)
    if "inference_prompt" in sample:
        inputs = tokenizer(sample["inference_prompt"], return_tensors="pt").to(model.device)
    else:
        prompt = apply_chat_template(sample, tokenizer)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    text_streamer = TextStreamer(tokenizer)
    final_text = ""
    for token in model.generate(**inputs, streamer = text_streamer, max_new_tokens = max_new_tokens):
        print(token)
        pass

def generate_summary(model, tokenizer, sample, max_new_tokens=100):
    FastLanguageModel.for_inference(model)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if "inference_prompt" in sample:
        inputs = tokenizer(sample["inference_prompt"], return_tensors="pt").to(device)
    else:
        prompt = apply_chat_template(sample, tokenizer)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=max_new_tokens,

        )  # Adjust max_new_tokens as needed
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summary = generated_text.split("##Resumen:")[-1].strip()
    if tokenizer.chat_template:
        summary = extract_clean_assistant_response(summary)
    return summary

In [3]:
# More info about parameters: https://huggingface.co/docs/peft/v0.11.0/en/package_reference/lora#peft.LoraConfig
target_modules =  ["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj"]

# When adding special tokens
train_embeddings = False

if train_embeddings:
  target_modules = target_modules + ["lm_head"]

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # rank of lora matrices according to paper not much loss when set relatively low
    target_modules = target_modules,  # On which modules of the llm the lora weights are used
    lora_alpha = 2*16, # scales the weights of the adapters (more influence on base model), 16 was recommended on reddit
    lora_dropout = 0, # Default on 0.05 in tutorial but unsloth says 0 is better
    bias = "none",    # "none" is optimized
    use_gradient_checkpointing = "unsloth", #"unsloth" for very long context, decreases vram
    random_state = 3407,
    use_rslora = False,  # scales lora_alpha with 1/sqrt(r), huggingface says this works better
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:
import pandas as pd
from datasets import Dataset
FOLDER = "sum/test_summary_normal.xlsx"

df = pd.read_excel(FOLDER, engine='openpyxl')
# convert to dataset
dataset = Dataset.from_pandas(df) 
print(dataset)

Dataset({
    features: ['document', 'expected_summary', 'generated_summary', 'language', 'time'],
    num_rows: 20
})


In [5]:
EOS_TOKEN = tokenizer.eos_token

empty_prompt = f"{INSTRUCTION}\n{{document}}\n\nResumen:"

def formatting_prompts_instruction(examples):
  training_prompts = []
  inference_prompts = []
  summaries = []
  for doc, sum in zip(examples["document"] , examples["expected_summary"]):
      inference_prompt = empty_prompt.format(document=doc)
      real_sum = sum.strip()
      training_prompt = inference_prompt + sum + EOS_TOKEN
      training_prompt = training_prompt.replace("\n", " ")  # Remove newlines for better tokenization
      training_prompt = training_prompt.strip()  # Remove leading/trailing spaces
      training_prompts.append(training_prompt)
      inference_prompts.append(inference_prompt)
      summaries.append(real_sum)

  return { "text" : training_prompts, 
           "inference_prompt" : inference_prompts,
           "expected_summary" : summaries }

In [6]:
if tokenizer.chat_template:
    print("Using chat template for formatting prompts")
    def formatting_func(example):
        empty_prompt = f"{INSTRUCTION}\n{{document}}\n"
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": empty_prompt.format(document=example["document"])},
            {"role": "assistant", "content": example["expected_summary"]}
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False)
    dataset_train = dataset.map(lambda x: {"text": formatting_func(x)})
else:
    dataset_train = dataset.map(formatting_prompts_instruction, batched=True, remove_columns=dataset.column_names)
    print(dataset_train)

Using chat template for formatting prompts


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # process 4 batches before updating parameters (parameter update == step)
        num_train_epochs = 2, # between 1 - 3 to prevent overfitting
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        warmup_ratio = 0.03, # 3% of the total steps
        seed = 3407,
        output_dir = "outputs",
        report_to = "none"
    )

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    formatting_func=lambda x: x["text"],
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    args = args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/20 [00:00<?, ? examples/s]

In [8]:
def count_trainable_params(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percentage = 100 * trainable_params / total_params
    return trainable_params, total_params, percentage

trainable_params, total_params, percentage = count_trainable_params(model)
print(f"Trainable parameters: {trainable_params:,} / Total parameters: {total_params:,} ({percentage:.2f}%)")

Trainable parameters: 10,092,544 / Total parameters: 606,142,464 (1.67%)


In [9]:
import torch

torch.cuda.reset_peak_memory_stats()

trainer_stats = trainer.train()
peak_memory = torch.cuda.max_memory_allocated() / (1024**3)  # en GB

print(f"Memoria máxima GPU usada: {peak_memory:.2f} GB")

## clean the memory of GPU
torch.cuda.empty_cache()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20 | Num Epochs = 2 | Total steps = 4
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 10,092,544/606,142,464 (1.67% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.9503
2,2.8542
3,2.4789
4,2.6958


Memoria máxima GPU usada: 1.41 GB


In [10]:
# Tiempo total en segundos
print("Tiempo total de entrenamiento:", trainer_stats.metrics["train_runtime"], "segundos")
print("Velocidad:", trainer_stats.metrics["train_samples_per_second"], "ejemplos/segundo")

Tiempo total de entrenamiento: 9.8729 segundos
Velocidad: 4.051 ejemplos/segundo


In [11]:
stats_of_trainer = {
    "peak_memory (GB)": peak_memory,
    "train_runtime": trainer_stats.metrics["train_runtime"],
    "train_samples_per_second": trainer_stats.metrics["train_samples_per_second"],
    "trainable_params": trainable_params,
}

print("Estadísticas del entrenamiento:", stats_of_trainer)

Estadísticas del entrenamiento: {'peak_memory (GB)': 1.4087910652160645, 'train_runtime': 9.8729, 'train_samples_per_second': 4.051, 'trainable_params': 10092544}


In [12]:
sample = dataset_train[0]
print("Sample document:", sample)
excepted_summary = sample["expected_summary"]

Sample document: {'document': 'Esta sesión del parlamento se realizó el 14/05/2013. · 8L/PO/P-0916 Pregunta del señor diputado don Fabián Atamán Martín Martín, del Grupo Parlamentario Mixto, sobre los precios de las viviendas protegidas, dirigida a la señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda. El señor presidente: Siguiente pregunta: del señor diputado don Fabián Atamán Martín, del Grupo Mixto, también dirigida a la señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda. Don Fabián. El señor Martín Martín (Desde su escaño): Muchas gracias, señor presidente. Señora consejera, ¿qué valoración le merece que la evolución de los precios de las viviendas protegidas esté por encima de los de las viviendas libres? Muchas gracias. El señor presidente: Gracias, don Fabián. Señora consejera, doña Inés Rojas. La señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda (Rojas de León) (Desde su escaño): Gracias, señor presidente. Señoría, decirle

In [13]:
text = generate_summary_streamer(model, tokenizer, sample, max_new_tokens=4080)

<|im_start|>system
Eres un modelo entrenado para generar resúmenes institucionales de actas parlamentarias. Los resúmenes deben estar redactados en lenguaje formal-administrativo, sin juicios de valor, y seguir una estructura clara.<|im_end|>
<|im_start|>user
Redacta un resumen institucional en español del siguiente documento. Mantén un lenguaje objetivo, enfocado en los hechos y acuerdos:
 ##Documento Esta sesión del parlamento se realizó el 14/05/2013. · 8L/PO/P-0916 Pregunta del señor diputado don Fabián Atamán Martín Martín, del Grupo Parlamentario Mixto, sobre los precios de las viviendas protegidas, dirigida a la señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda. El señor presidente: Siguiente pregunta: del señor diputado don Fabián Atamán Martín, del Grupo Mixto, también dirigida a la señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda. Don Fabián. El señor Martín Martín (Desde su escaño): Muchas gracias, señor presidente. Señora consejera, ¿q

<think>
Okay, I need to create an institutional summary of this parliamentary act. The user wants it in formal, objective language, focusing on facts and agreements. Let me start by reading through the document carefully.

The session was on 14 May 2013, with two questions. First, the speaker from the Mixto group asked about the price of protected homes versus free homes. The Minister of Culture, Deportes, Social Policies, and Housing mentioned that the protected homes are now cheaper than free ones, and there's a reduction in prices. The government is planning to lower the price further, and there's a mention of a new price reduction target. The second question is from the same group, which is about the state's role in housing. The summary should include these points.

I need to structure it clearly. Start with the date and session details. Then mention the two speakers and their questions. The summary should highlight the key points: the current price differences, the government's pl

In [25]:
text = generate_summary(model, tokenizer, sample, max_new_tokens=4080)
print("Generated summary:", text)

Generated summary: **Resumen institucional**  

La sesión del parlamento se realizó el 14/05/2013, con la presencia de la señora consejera de Cultura, Deportes, Políticas Sociales y Vivienda, y del señor diputado don Fabián Atamán Martín (Grupo Parlamentario Mixto). El documento resume las opiniones y acuerdos sobre los precios de las viviendas protegidas en Canarias, comparados con las viviendas libres, y la respuesta del Gobierno regional.  

El precio de las viviendas protegidas (VPO) en Canarias ha bajado significativamente en comparación con las viviendas libres, lo cual se debe a la evolución del mercado. Según los datos del Ministerio de Fomento, en los últimos cinco años, la evolución de los precios de las viviendas protegidas en Canarias fue un 11%, mientras que la vivienda libre se desplomaba un 24%. Esta diferencia ha reducido drásticamente la distancia entre las viviendas protegidas y las viviendas libres, con el precio de la VPO actualmente siendo escasamente 200 euros por

In [None]:
tokenizer.save_pretrained("models/modelo_final_vllm")
model.save_pretrained("models/modelo_final_vllm")

('models/modelo_final_vllm/tokenizer_config.json',
 'models/modelo_final_vllm/special_tokens_map.json',
 'models/modelo_final_vllm/vocab.json',
 'models/modelo_final_vllm/merges.txt',
 'models/modelo_final_vllm/added_tokens.json',
 'models/modelo_final_vllm/tokenizer.json')