# Training

Vergelijking van de modellen


In [None]:
import os
import json
import glob
from collections import defaultdict
from transformers import pipeline
from tqdm import tqdm

# === CONFIG ===
DATA_PATH = "../data/Chats/"
OUTPUT_FILE = "samengevoegd_met_samenvattingen_local.jsonl"
MODEL_NAME = "philschmid/bart-large-cnn-samsum"  # Goed voor dialoogachtige samenvattingen

# === SAMENVATTINGSMODEL LADEN ===
summarizer = pipeline("summarization", model=MODEL_NAME, tokenizer=MODEL_NAME, device=0)  # device=0 voor GPU

# === BESTANDEN VERWERKEN ===
all_chat_data = []

for filepath in glob.glob(os.path.join(DATA_PATH, "*.json")):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    topics = defaultdict(list)
    for item in data:
        topics[item["topic_id"]].append(item["text"])

    for topic_id, messages in topics.items():
        combined_text = " ".join(messages)
        if len(combined_text) < 50:
            continue  # te kort om te verwerken

        # Samenvatting genereren
        try:
            summary = summarizer(combined_text, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
        except Exception as e:
            print(f"Fout bij samenvatten: {e}")
            summary = "Samenvatting niet beschikbaar."

        all_chat_data.append({
            "topic_id": topic_id,
            "chat": combined_text,
            "summary": summary
        })

# === OPSLAAN ALS JSONL ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for entry in all_chat_data:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Dataset opgeslagen als {OUTPUT_FILE} met {len(all_chat_data)} items.")


## Bart

In [None]:
from datasets import load_dataset, Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
from collections import defaultdict
import json

# Dataset laden of creëren
# Stap 1: JSONL of JSON-bestand laden
with open("..\data\Chats\chatlog_topic_001_20250521_011132.json", "r", encoding="utf-8") as f:
    data = json.load(f)  # of gebruik jsonlines als het .jsonl is

# Stap 2: Groeperen per topic_id
topics = defaultdict(list)
for item in data:
    topics[item["topic_id"]].append(item["text"])

# Stap 3: Conversaties samenvoegen
chat_data = []
for topic_id, messages in topics.items():
    combined_text = " ".join(messages)
    chat_data.append({
        "chat": combined_text,
        "summary": "De deelnemers bespreken het plan om windmolens net buiten het dorp te plaatsen. Er is brede instemming dat meer informatie nodig is voordat een oordeel geveld kan worden. Belangrijke zorgen zijn onder andere de impact op het landschap, mogelijke geluidsoverlast en de kosten. Iedereen benadrukt dat alle aspecten zorgvuldig moeten worden afgewogen voordat er een beslissing wordt genomen."
    })

# Stap 4: Dataset aanmaken
dataset = Dataset.from_list(chat_data)

# Tokenizer en model laden
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    inputs = tokenizer(example["chat"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=False)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./bart-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_steps=500,
    logging_dir="./logs",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()
model.save_pretrained("./bart-summarizer")
tokenizer.save_pretrained("./bart-summarizer")


  with open("..\data\Chats\chatlog_topic_001_20250521_011132.json", "r", encoding="utf-8") as f:
Map: 100%|██████████| 1/1 [00:00<00:00, 21.81 examples/s]
  trainer = Trainer(


Step,Training Loss




('./bart-summarizer\\tokenizer_config.json',
 './bart-summarizer\\special_tokens_map.json',
 './bart-summarizer\\vocab.json',
 './bart-summarizer\\merges.txt',
 './bart-summarizer\\added_tokens.json')

## T5-Base

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq


# Model en tokenizer laden
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./t5-summarizer")
tokenizer.save_pretrained("./t5-summarizer")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 1/1 [00:00<00:00, 98.25 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


('./t5-summarizer\\tokenizer_config.json',
 './t5-summarizer\\special_tokens_map.json',
 './t5-summarizer\\spiece.model',
 './t5-summarizer\\added_tokens.json')

## T5-Long

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq



# Model en tokenizer laden
model_name = "google/long-t5-tglobal-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=4096, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./long-t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./long-t5-summarizer")
tokenizer.save_pretrained("./long-t5-summarizer")


You are using a model of type longt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/long-t5-tglobal-base and are newly initialized: ['encoder.block.0.layer.0.SelfAttention.k.weight', 'encoder.block.0.layer.0.SelfAttention.o.weight', 'encoder.block.0.layer.0.SelfAttention.q.weight', 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'encoder.block.0.layer.0.SelfAttention.v.weight', 'encoder.block.1.layer.0.SelfAttention.k.weight', 'encoder.block.1.layer.0.SelfAttention.o.weight', 'encoder.block.1.layer.0.SelfAttention.q.weight', 'encoder.block.1.layer.0.SelfAttention.v.weight', 'encoder.block.10.layer.0.SelfAttention.k.weight', 'encoder.block.10.layer.0.SelfAttention.o.weight', 'encoder.block.10.layer.0.SelfAttention.q.weight', 'encoder.block.10.layer.0.SelfAttention.v.weight', 'encoder.block.11.layer.0.

Step,Training Loss


('./long-t5-summarizer\\tokenizer_config.json',
 './long-t5-summarizer\\special_tokens_map.json',
 './long-t5-summarizer\\spiece.model',
 './long-t5-summarizer\\added_tokens.json')

## T5-Flan

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq


# Model en tokenizer laden
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./flan-t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./flan-t5-summarizer")
tokenizer.save_pretrained("./flan-t5-summarizer")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 1/1 [00:00<00:00, 14.07 examples/s]
  trainer = Trainer(


Step,Training Loss


('./flan-t5-summarizer\\tokenizer_config.json',
 './flan-t5-summarizer\\special_tokens_map.json',
 './flan-t5-summarizer\\spiece.model',
 './flan-t5-summarizer\\added_tokens.json')

## PEGASUS

In [6]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Model en tokenizer laden
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./pegasus-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./pegasus-summarizer")
tokenizer.save_pretrained("./pegasus-summarizer")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:00<00:00, 325.52 examples/s]
  trainer = Trainer(


Step,Training Loss




('./pegasus-summarizer\\tokenizer_config.json',
 './pegasus-summarizer\\special_tokens_map.json',
 './pegasus-summarizer\\spiece.model',
 './pegasus-summarizer\\added_tokens.json')

## LED

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Model en tokenizer laden
model_name = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./longformer-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./longformer-summarizer")
tokenizer.save_pretrained("./longformer-summarizer")


Map: 100%|██████████| 1/1 [00:00<00:00, 12.22 examples/s]
  trainer = Trainer(
Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


Step,Training Loss


('./longformer-summarizer\\tokenizer_config.json',
 './longformer-summarizer\\special_tokens_map.json',
 './longformer-summarizer\\vocab.json',
 './longformer-summarizer\\merges.txt',
 './longformer-summarizer\\added_tokens.json',
 './longformer-summarizer\\tokenizer.json')

## Lammla ----> QWen

Dataset moet er anders uit zien met promt

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

# Tokenizer en model (Qwen)
model_name = "Qwen/Qwen1.5-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./qwen-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./qwen-summarizer")
tokenizer.save_pretrained("./qwen-summarizer")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previ

## XLNet ---> Niet meer

In [9]:
from transformers import AutoTokenizer, XLNetModel

# Voorbeeld dataset
dataset = Dataset.from_dict({
    "chat": [
        "Hoi, hoe is het met je? Ik had gisteren een drukke dag op werk.",
        "Kan je me helpen met het instellen van mijn router?"
    ],
    "summary": [
        "Persoon praat over een drukke werkdag.",
        "Persoon vraagt hulp bij instellen van een router."
    ]
})

# Model en tokenizer laden
model_name = "xlnet/xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLNetModel.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./xlnet-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./xlnet-summarizer")
tokenizer.save_pretrained("./xlnet-summarizer")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Map: 100%|██████████| 2/2 [00:00<00:00, 238.15 examples/s]
  trainer = Trainer(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state. For reference, the inputs it received are input_ids,token_type_ids,attention_mask,labels.

## Minstral

Dataset moet er anders uit zien met promt

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # vaak nodig bij decoder-only modellen

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./Mistral-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./Mistral-summarizer")
tokenizer.save_pretrained("./Mistral-summarizer")


## Gemma

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

# Tokenizer en model (Qwen)
model_name = "google/gemma-7b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./gemma-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./gemma-summarizer")
tokenizer.save_pretrained("./gemma-summarizer")


## OpenChat

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

model_name = "openchat/openchat-3.5-0106"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./openchat-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./openchat-summarizer")
tokenizer.save_pretrained("./openchat-summarizer")


# Evaluatie

In [8]:
from datasets import Dataset

test_data = Dataset.from_dict({
    "chat": [
        "Hoi allemaal! Ze willen windmolens bouwen buiten het dorp. Wat vinden jullie? Er zijn zorgen over geluidsoverlast en impact op het landschap.",
        "We hebben een bijeenkomst gepland over het nieuwe buurthuis. Mensen willen weten of er genoeg budget is en hoe de planning eruitziet."
    ],
    "summary": [
        "Er is discussie over de bouw van windmolens buiten het dorp, met zorgen over geluid en landschap.",
        "Er komt een bijeenkomst over het buurthuis, met vragen over budget en planning."
    ]
})


In [20]:
from transformers import pipeline
import evaluate
import pandas as pd
rouge = evaluate.load("rouge")
results = []
model_paths = [
    "./bart-summarizer",
    "./long-t5-summarizer",
    "./flan-t5-summarizer",
    "./longformer-summarizer",
    "./t5-summarizer",
    "./pegasus-summarizer",
]

for model_path in model_paths:
    print(f"\nEvaluating: {model_path}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

    predictions = [
        summarizer(text, max_length=64, min_length=10, do_sample=False)[0]["summary_text"]
        for text in test_data["chat"]
    ]

    scores = rouge.compute(predictions=predictions, references=test_data["summary"])
    results.append({
        "model": model_path,
        "rouge1": round(scores["rouge1"], 4),
        "rouge2": round(scores["rouge2"], 4),
        "rougeL": round(scores["rougeL"], 4),
        "rougeLsum": round(scores["rougeLsum"], 4)
    })

# Tabel tonen
df = pd.DataFrame(results)
print("\n=== ROUGE Vergelijking ===")
print(df.to_string(index=False))


Evaluating: ./bart-summarizer


Device set to use cpu
Your max_length is set to 64, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 64, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)



Evaluating: ./long-t5-summarizer


Device set to use cpu
Your max_length is set to 64, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)



Evaluating: ./flan-t5-summarizer


Device set to use cpu



Evaluating: ./longformer-summarizer


Device set to use cpu
Your max_length is set to 64, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Input ids are automatically padded from 52 to 1024 to be a multiple of `config.attention_window`: 1024
Your max_length is set to 64, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Input ids are automatically padded from 51 to 1024 to be a multiple of `config.attention_window`: 1024



Evaluating: ./t5-summarizer


Device set to use cpu



Evaluating: ./pegasus-summarizer


Device set to use cpu
Your max_length is set to 64, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 64, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)



=== ROUGE Vergelijking ===
                  model  rouge1  rouge2  rougeL  rougeLsum
      ./bart-summarizer  0.4879  0.1417  0.4337     0.4337
   ./long-t5-summarizer  0.0000  0.0000  0.0000     0.0000
   ./flan-t5-summarizer  0.3749  0.0816  0.3463     0.3463
./longformer-summarizer  0.4808  0.1399  0.4274     0.4274
        ./t5-summarizer  0.4647  0.0982  0.4020     0.4020
   ./pegasus-summarizer  0.0816  0.0213  0.0816     0.0816


In [21]:
df

Unnamed: 0,model,rouge1,rouge2,rougeL,rougeLsum
0,./bart-summarizer,0.4879,0.1417,0.4337,0.4337
1,./long-t5-summarizer,0.0,0.0,0.0,0.0
2,./flan-t5-summarizer,0.3749,0.0816,0.3463,0.3463
3,./longformer-summarizer,0.4808,0.1399,0.4274,0.4274
4,./t5-summarizer,0.4647,0.0982,0.402,0.402
5,./pegasus-summarizer,0.0816,0.0213,0.0816,0.0816
