# Training

Vergelijking van de modellen


In [5]:
import os
import json
import glob
from collections import defaultdict
from transformers import pipeline, AutoTokenizer
from tqdm import tqdm
import torch

# === CONFIG ===
DATA_PATH = r"C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data"
OUTPUT_FILE = "samengevoegd_met_samenvattingen_local.jsonl"
MODEL_NAME = "philschmid/bart-large-cnn-samsum"

# === SAMENVATTINGSMODEL EN TOKENIZER LADEN ===
device = 0 if torch.cuda.is_available() else -1
print(f"Device set to use: {'cuda' if device == 0 else 'cpu'}")

summarizer = pipeline("summarization", model=MODEL_NAME, tokenizer=MODEL_NAME, device=device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_INPUT_LENGTH = tokenizer.model_max_length

def split_text_by_tokens(text, tokenizer, max_input_tokens):
    """
    Splitst een lange tekst in stukken die elk maximaal max_input_tokens tokens bevatten.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), max_input_tokens):
        chunk_tokens = tokens[i:i+max_input_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
    return chunks

# === BESTANDEN VERWERKEN ===
all_chat_data = []

for filepath in glob.glob(os.path.join(DATA_PATH, "*.json")):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
        print(len(data), "items gevonden in", filepath)
    topics = defaultdict(list)
    for item in data:
        topics[item["topic_id"]].append(item["text"])

    for topic_id, messages in topics.items():
        combined_text = " ".join(messages)
        if len(combined_text.strip()) < 50:
            continue

        try:
            text_chunks = split_text_by_tokens(combined_text, tokenizer, MAX_INPUT_LENGTH - 5)
            summaries = []

            for chunk in text_chunks:
                if not chunk.strip():
                    continue
                summary_output = summarizer(
                    chunk,
                    max_length=100,
                    min_length=30,
                    do_sample=False,
                )
                summaries.append(summary_output[0]["summary_text"])

            full_summary = " ".join(summaries)
            if not full_summary.strip():
                full_summary = "No summary generated."

        except Exception as e:
            print(f"Error summarizing topic_id {topic_id}: {e}")
            full_summary = "Summary not available due to error."

        all_chat_data.append({
            "topic_id": topic_id,
            "chat": combined_text,
            "summary": full_summary
        })

# === OPSLAAN ALS JSONL ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for entry in all_chat_data:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Dataset opgeslagen als {OUTPUT_FILE} met {len(all_chat_data)} items.")

Device set to use: cpu


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (10872 > 1024). Running this sequence through the model will result in indexing errors


175 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_002_20250605_110618.json
165 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_003_20250605_111208.json
170 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_004_20250605_111802.json
166 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_005_20250605_112316.json
172 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_006_20250605_112912.json
161 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_007_20250605_113423.json
164 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_008_20250605_114010.json
165 items gevonden in C:\Users\caspe\Jaar 3\Datalab\Git\Datalab-lectoraat\Testing\data\chatlog_topic_009_20250605_1145

In [1]:
from datasets import load_dataset, Dataset
import json
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
from collections import defaultdict

# Laad je jsonl-bestand in als een lijst van dicts
with open("samengevoegd_met_samenvattingen_local.jsonl", "r", encoding="utf-8") as f:
    chat_data = [json.loads(line) for line in f]

# Maak een Huggingface Dataset
dataset = Dataset.from_list(chat_data)
# Tokenizer en model laden
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    inputs = tokenizer(example["chat"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=False)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 9/9 [00:00<00:00, 14.90 examples/s]


## Bart

In [2]:
from datasets import load_dataset, Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
from collections import defaultdict
import json



# Tokenizer en model laden
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    inputs = tokenizer(example["chat"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=False)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./bart-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    logging_dir="./logs",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
    report_to=[],

)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()
model.save_pretrained("./bart-summarizer")
tokenizer.save_pretrained("./bart-summarizer")


Map: 100%|██████████| 9/9 [00:00<00:00, 15.37 examples/s]
  trainer = Trainer(


Step,Training Loss




('./bart-summarizer\\tokenizer_config.json',
 './bart-summarizer\\special_tokens_map.json',
 './bart-summarizer\\vocab.json',
 './bart-summarizer\\merges.txt',
 './bart-summarizer\\added_tokens.json')

## T5-Base

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq


# Model en tokenizer laden
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to=[],

)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./t5-summarizer")
tokenizer.save_pretrained("./t5-summarizer")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 9/9 [00:00<00:00, 39.79 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,3.4194


('./t5-summarizer\\tokenizer_config.json',
 './t5-summarizer\\special_tokens_map.json',
 './t5-summarizer\\spiece.model',
 './t5-summarizer\\added_tokens.json')

## T5-Long

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq



# Model en tokenizer laden
model_name = "google/long-t5-tglobal-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./long-t5-summarizer",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,    
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to=[],
    fp16=True,
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./long-t5-summarizer")
tokenizer.save_pretrained("./long-t5-summarizer")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type longt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/long-t5-tglobal-base and are newly initialized: ['encoder.block.0.layer.0.SelfAttention.k.weight', 'encoder.block.0.layer.0.SelfAttention.o.weight', 'encoder.block.0.layer.0.SelfAttention.q.weight', 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'encoder.block.0.l

Step,Training Loss
10,26.1862


('./long-t5-summarizer\\tokenizer_config.json',
 './long-t5-summarizer\\special_tokens_map.json',
 './long-t5-summarizer\\spiece.model',
 './long-t5-summarizer\\added_tokens.json')

In [6]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Using cached https://download.pytorch.org/whl/pillow-11.0.0-cp312-cp312-win_amd64.whl.metadata (9.3 kB)
Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp312-cp312-win_amd64.whl (2817.2 MB)
   ---------------------------------------- 0.0/2.8 GB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 GB 46.3 MB/s eta 0:01:01
   ---------------------------------------- 0.0/2.8 GB 53.8 MB/s eta 0:00:52
    -----------------------

## T5-Flan

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq


# Model en tokenizer laden
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./flan-t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to=[],
    fp16=True,

)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./flan-t5-summarizer")
tokenizer.save_pretrained("./flan-t5-summarizer")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 9/9 [00:00<00:00, 37.66 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,0.0


('./flan-t5-summarizer\\tokenizer_config.json',
 './flan-t5-summarizer\\special_tokens_map.json',
 './flan-t5-summarizer\\spiece.model',
 './flan-t5-summarizer\\added_tokens.json')

## PEGASUS

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Model en tokenizer laden
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./pegasus-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to=[],
    fp16=True,

)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./pegasus-summarizer")
tokenizer.save_pretrained("./pegasus-summarizer")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 9/9 [00:00<00:00, 38.07 examples/s]
  trainer = Trainer(


Step,Training Loss
10,5.5731




('./pegasus-summarizer\\tokenizer_config.json',
 './pegasus-summarizer\\special_tokens_map.json',
 './pegasus-summarizer\\spiece.model',
 './pegasus-summarizer\\added_tokens.json')

## LED

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Model en tokenizer laden
model_name = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./longformer-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to=[],
    fp16=True,

)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./longformer-summarizer")
tokenizer.save_pretrained("./longformer-summarizer")


Map: 100%|██████████| 9/9 [00:00<00:00, 32.72 examples/s]
  trainer = Trainer(
Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


Step,Training Loss
10,2.5536


('./longformer-summarizer\\tokenizer_config.json',
 './longformer-summarizer\\special_tokens_map.json',
 './longformer-summarizer\\vocab.json',
 './longformer-summarizer\\merges.txt',
 './longformer-summarizer\\added_tokens.json',
 './longformer-summarizer\\tokenizer.json')

## Lammla ----> QWen

Dataset moet er anders uit zien met promt

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

# Tokenizer en model (Qwen)
model_name = "Qwen/Qwen1.5-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./qwen-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./qwen-summarizer")
tokenizer.save_pretrained("./qwen-summarizer")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previ

## XLNet ---> Niet meer

In [9]:
from transformers import AutoTokenizer, XLNetModel

# Voorbeeld dataset
dataset = Dataset.from_dict({
    "chat": [
        "Hoi, hoe is het met je? Ik had gisteren een drukke dag op werk.",
        "Kan je me helpen met het instellen van mijn router?"
    ],
    "summary": [
        "Persoon praat over een drukke werkdag.",
        "Persoon vraagt hulp bij instellen van een router."
    ]
})

# Model en tokenizer laden
model_name = "xlnet/xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLNetModel.from_pretrained(model_name)

# Preprocessing-functie
def preprocess_function(example):
    input_text = "summarize: " + example["chat"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(example["summary"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training parameters
training_args = TrainingArguments(
    output_dir="./xlnet-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Trainer opzetten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Training starten
trainer.train()

# Model opslaan
model.save_pretrained("./xlnet-summarizer")
tokenizer.save_pretrained("./xlnet-summarizer")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Map: 100%|██████████| 2/2 [00:00<00:00, 238.15 examples/s]
  trainer = Trainer(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state. For reference, the inputs it received are input_ids,token_type_ids,attention_mask,labels.

## Minstral

Dataset moet er anders uit zien met promt

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # vaak nodig bij decoder-only modellen

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./Mistral-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./Mistral-summarizer")
tokenizer.save_pretrained("./Mistral-summarizer")


## Gemma

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

# Tokenizer en model (Qwen)
model_name = "google/gemma-7b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./gemma-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./gemma-summarizer")
tokenizer.save_pretrained("./gemma-summarizer")


## OpenChat

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Dataset laden
dataset = load_dataset("json", data_files="data.jsonl")["train"]

# Combineer instructie, input, en output
def format_prompt(example):
    return {
        "text": f"""### Instructie:
{example['instruction']}

### Invoer:
{example['input']}

### Antwoord:
{example['output']}"""
    }

dataset = dataset.map(format_prompt)

model_name = "openchat/openchat-3.5-0106"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator voor causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training config
training_args = TrainingArguments(
    output_dir="./openchat-summarizer",
    per_device_train_batch_size=1,  # pas aan naar je GPU
    num_train_epochs=3,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Opslaan
model.save_pretrained("./openchat-summarizer")
tokenizer.save_pretrained("./openchat-summarizer")


# Evaluatie

In [4]:
from datasets import Dataset

test_data = Dataset.from_dict({
    "chat": [
        "Hoi allemaal! Ze willen windmolens bouwen buiten het dorp. Wat vinden jullie? Er zijn zorgen over geluidsoverlast en impact op het landschap.",
        "We hebben een bijeenkomst gepland over het nieuwe buurthuis. Mensen willen weten of er genoeg budget is en hoe de planning eruitziet."
    ],
    "summary": [
        "Er is discussie over de bouw van windmolens buiten het dorp, met zorgen over geluid en landschap.",
        "Er komt een bijeenkomst over het buurthuis, met vragen over budget en planning."
    ]
})


In [17]:
sumary = "Buurtbewoners bespreken het plan voor een nieuw parkeerterrein aan de dorpsrand. Arthur en Koen vinden extra parkeerplaatsen positief, maar Fatima maakt zich zorgen over de nabijheid van het speelveldje. Linda vreest voor het uitzicht vanuit haar woning. Peter meldt dat het een groen terrein wordt met bomen, wat positief ontvangen wordt. Koen wijst erop dat het aantal plekken teruggaat van 60 naar 45 om ruimte te maken voor groen. Fatima en Linda stellen voor een groenstrook aan te leggen tussen het speelveld en het parkeerterrein, voor veiligheid en minder zicht. Linda wil dat de gemeente een visualisatie deelt, en Peter stelt voor mee te denken over de inrichting. Allen zijn het erover eens dat bewoners betrokken moeten worden bij de plannen en gezamenlijk suggesties aan de gemeente moeten doen."


In [22]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import evaluate

# JSON bestand inladen
with open("eval/eval1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
df['summary'] = sumary  # Voeg de samenvatting toe aan de DataFrame
# Groeperen per gesprek (topic_id) en alle 'text' regels samenvoegen tot 1 tekst per gesprek
conversations = df.groupby("topic_id").agg({
    "text": lambda texts: " ".join(texts),  # alle berichten samenvoegen
    "summary": "first"  # neem de summary (die voor alle regels gelijk is)
}).reset_index()

rouge = evaluate.load("rouge")
results = []

model_paths = [
    "./bart-summarizer",
    "./long-t5-summarizer",
    "./flan-t5-summarizer",
    "./longformer-summarizer",
    "./t5-summarizer",
    "./pegasus-summarizer",
]

for model_path in model_paths:
    print(f"\nEvaluating: {model_path}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

    predictions = [
        summarizer(conversation_text, max_length=248, min_length=10, do_sample=False)[0]["summary_text"]
        for conversation_text in conversations["text"]
    ]

    for pred in predictions[:5]:
        print(f"Prediction: {pred}")

    scores = rouge.compute(predictions=predictions, references=conversations["summary"])
    results.append({
        "model": model_path,
        "rouge1": round(scores["rouge1"], 4),
        "rouge2": round(scores["rouge2"], 4),
        "rougeL": round(scores["rougeL"], 4),
        "rougeLsum": round(scores["rougeLsum"], 4)
    })

# Resultaten tonen
df_results = pd.DataFrame(results)
print("\n=== ROUGE Vergelijking ===")
print(df_results.to_string(index=False))



Evaluating: ./bart-summarizer


Device set to use cuda:0


Prediction: Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar Jullie meningen! Nou, ik vind het wel een goed ideae, toch?! 🤩 Ik snap het nut ervan, maar komt het niet pal naast het speelveldje? Dat zou ik echt jammer vinden. Gerrit, Jordy, Gerrit and Gerrit are all in onze suggesties. Jordy is in de zomer, Jordi is in the zomer van de verkeersdruk van de straat, Gerhard is in in the Zomer van der Gerrit van der Berrit. Gerret is in het zomer. Jordi, Jordie, Jorden, Gerret, Gerrito, Gerrel, Gerri, Gerrik and Gerrel are in de Zomervan van de Gerrit Van der Berth van der Parkeerrein. Ger

Evaluating: ./long-t5-summarizer


Device set to use cuda:0


Prediction: tetttetttetttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

Evaluating: ./flan-t5-summarizer


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (954 > 512). Running this sequence through the model will result in indexing errors


Prediction: Precies Koen! Een groenstrook zou om het te vinden voor een nieuw parkeerterrein aan de rand van de dorp? Goed idee or juist niet? Ben benieuwd naar jullie meningen!

Evaluating: ./longformer-summarizer


Device set to use cuda:0
Input ids are automatically padded from 794 to 1024 to be a multiple of `config.attention_window`: 1024


Prediction: Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar jullie meningen! Nou, ik vind het parkeerterrein aan de rand van het dorp. Goed idee of juist niet.

Evaluating: ./t5-summarizer


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (954 > 512). Running this sequence through the model will result in indexing errors


Prediction: jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Ben benieuwd naar juist niet? Nou, ik vind hehet wel en goed idee Fatima. Spelen is belangrijk.

Evaluating: ./pegasus-summarizer


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (749 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:
df

Unnamed: 0,model,rouge1,rouge2,rougeL,rougeLsum
0,./bart-summarizer,0.4879,0.1417,0.4337,0.4337
1,./long-t5-summarizer,0.0,0.0,0.0,0.0
2,./flan-t5-summarizer,0.4366,0.1417,0.3824,0.3824
3,./longformer-summarizer,0.4879,0.1417,0.4337,0.4337
4,./t5-summarizer,0.4495,0.1023,0.3824,0.3824
5,./pegasus-summarizer,0.109,0.0217,0.109,0.109


In [3]:
import json
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Laad model
model_path = "./bart-summarizer"  
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# JSON string met één bericht

# Laad het JSON-bestand 
with open("eval/eval1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Combineer alle tekstberichten
chat_text = " ".join([msg["text"] for msg in data])
print(chat_text)

# Genereer samenvatting
summary = summarizer(chat_text, max_length=128, min_length=10, do_sample=False)[0]["summary_text"]

# Toon output
print("📝 Samenvatting van de discussie:")
print(summary)


Device set to use cuda:0


Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar jullie meningen! Nou, ik vind het wel een goed idee! 🚗 Meer parkeerplekken is nooit verkeerd, toch?! 🤩 Ik snap het nut ervan, maar komt het niet pal naast het speelveldje? Dat zou ik echt jammer vinden voor de kinderen 😕 Daar heb je een goed punt Fatima. Spelen is belangrijk. Maar eerlijk gezegd vind ik het soms echt zoeken naar een parkeerplek als ik thuiskom van werk... 😓 Wat ik me afvraag: wordt het een open terrein of een parkeergarage? Want als het open wordt, dan kijk ik straks tegen auto's aan vanuit mijn keukenraam. 😐 Ik heb gelezen dat het een groen parkeerterrein wordt met bomen ertussen. Dat zou wel schelen qua aanzicht en hitte in de zomer 🌳🌳 Een groen terrein klinkt wel als een mooie middenweg. Maar betekent dat ook minder plekken dan oorspronkelijk gepland? Ja, volgens de gemeentepagina worden het 45 plekken i.p.v. 60. Minder, 

# COde voor mark. Hier gebruiken we onze gefinetuned model van bart

In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import json

def preprocces_json(json_file):
    """
    Laad een JSON-bestand en retourneer de tekstberichten.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Combineer alle tekstberichten met nieuwe regels ertussen
    chat_text = "\n".join([msg["text"] for msg in data])

    input_text = "Summarize the following conversation. Give mainly the opinions of the people:\n" + chat_text
    return input_text

def voormark(input_text, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    summary = summarizer(
        input_text,
        max_length=248,
        min_length=50,
        do_sample=True,
        num_beams=4
    )[0]["summary_text"]
    return summary
input_text = preprocces_json("eval/eval1.json")
summary = voormark(input_text, "./bart-summarizer")
print(summary)

Device set to use cuda:0


Summarize the following conversation. Give mainly the opinions of the people: “Linda”, “Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar Jullie meningen!” “Nou, ik vind het wel een goed Idee, toch?! 🤩”“Ik snap het nut ervan, maar komt het niet pal naast het speelveldje? Dat zou ik echt jammer vinden van de kinderen 😕“ “I’m a man who’s a woman who is a woman in her 20s.” 💪“We don’t know if we’re going to get to know her, but we do know that she’ll be a woman soon.“She is a man in her 30s!““What if she is


In [8]:
from transformers import pipeline
import json

summarizer = pipeline("summarization", model="allenai/led-base-16384")

# Laad het JSON-bestand
with open("eval/eval1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Combineer alle tekstberichten met nieuwe regels ertussen
chat_text = "\n".join([msg["text"] for msg in data])

# Voeg een instructie toe voor betere focus
input_text = "Summarize the following conversation. Give mainly the opinions of the people:\n" + chat_text

# Samenvatten met beam search en iets langere minimumlengte
summary = summarizer(
    input_text,
    max_length=248,
    min_length=50,
    do_sample=True,
    num_beams=4
)[0]["summary_text"]

print(summary)


Device set to use cuda:0
Input ids are automatically padded from 831 to 1024 to be a multiple of `config.attention_window`: 1024


Summarize the following conversation. Give mainly the opinions of the people: 🌱Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar jullie meningen! 🌱Hoi allemaal! 🌱Hoi allemaal! 🌱Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Ben benieuwd naar jullie meningen! 🌱Hoi allemaal! 🌱Hoi allemaal! 🌱Hoi allemaal! 🌱Hoi allemaal! 🌱Hoi snap het nut ervan, maar komt het niet pal naast het speelveldje? Dat zou ik echt jammer vinden voor de kinderen 😕Hoi allemaal! �


In [6]:
from transformers import pipeline
import json

summarizer = pipeline("summarization", model="facebook/bart-base")

# Laad het JSON-bestand
with open("eval/eval1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Combineer alle tekstberichten met nieuwe regels ertussen
chat_text = "\n".join([msg["text"] for msg in data])

# Voeg een instructie toe voor betere focus
input_text = "Summarize the following conversation. Give mainly the opinions of the people:\n" + chat_text

# Samenvatten met beam search en iets langere minimumlengte
summary = summarizer(
    input_text,
    max_length=248,
    min_length=50,
    do_sample=True,
    num_beams=4
)[0]["summary_text"]

print(summary)


Device set to use cuda:0


Summarize the following conversation. Give mainly the opinions of the people: ~~Hoi allemaal! Wat vinden jullie van die plannen voor een nieuw parkeerterrein aan de rand van het dorp? Goed idee of juist niet? Ben benieuwd naar jullies meningen! ~~Nou, ik vind het wel een goed idea! 🚗 Meer parkeersplekken is nooit verkeerd, toch?! 🤩 ~~Ik snap het nut ervan, maar komt het niet pal naast het speelveldje? Dat zou ik echt jammer vinden! 💪 ~~Fatima’s keușen is belangrijk. Maar eerlijk gezegd vind ik het soms echt zoeken naar een Parkeerplek als ik thuiskom van werk... 😕@@@@@@@@Daar heb je echt samenkomen met wat


In [5]:
from transformers import pipeline
import json

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Laad het JSON-bestand
with open("eval/eval1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Combineer alle tekstberichten met nieuwe regels ertussen
chat_text = "\n".join([msg["text"] for msg in data])

# Voeg een instructie toe voor betere focus
input_text = "Summarize the following conversation. Give mainly the opinions of the people:\n" + chat_text

# Samenvatten met beam search en iets langere minimumlengte
summary = summarizer(
    input_text,
    max_length=248,
    min_length=50,
    do_sample=True,
    num_beams=4
)[0]["summary_text"]

print(summary)


Device set to use cuda:0


A parkeerterrein aan de rand van het dorp is planned. The people of the dorp had a conversation about the plannen. They were asked what they thought of the idea of a nieuw parkeersdruk. They said it was a goed idee of juist niet.
