# Summarization of the legislative proposals for the completed and ongoing procedures in my dataset

## 1. LLM package install and import

In [1]:
!pip install --no-deps "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.29" "trl<0.9.0" peft accelerate bitsandbytes
!pip install --upgrade --no-deps "transformers>=4.47.0" "tokenizers>=0.21.0" "huggingface-hub>=0.26.0,<1.0"

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[kaggle-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-pgkq4oxp/unsloth_25650a19c5d74f87adf95accceb5bdf8
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-pgkq4oxp/unsloth_25650a19c5d74f87adf95accceb5bdf8
  Resolved https://github.com/unslothai/unsloth.git to commit 8ea5338154859ed25b50366cb1264ed4d933eae3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.12.9-py3-none-any.whl size=382658 sha256=0956f1bb09c1104fbb500c6ff206b17fe4191a5ce2d0470d528a451318145748
  Stored in directory: /tmp/pip-

In [None]:
import os
# Force the pure-Python protobuf implementation to avoid the 'MessageFactory' error on Kaggle
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# Silence TensorFlow's C++ logging (level '3' hides INFO, WARNING and ERROR messages).
# NOTE(review): this variable only affects log verbosity; it does not give Torch priority over TensorFlow on the GPU.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, BitsAndBytesConfig
import torch
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import concurrent.futures
import ast
import json
import re

# Make sure NLTK data is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. Model loading

In [None]:
model_name = "unsloth/phi-4-unsloth-bnb-4bit"

# Quantization configuration: 4-bit NF4 weights with double quantization, computing in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with the quantization configuration
# NOTE(review): the checkpoint name suggests it is already stored in bnb 4-bit form — confirm the explicit config is still needed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, 
    device_map="auto",
    trust_remote_code=True
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

E0000 00:00:1767108543.324966      33 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767108543.379229      33 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00003-of-00003.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.39G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

## 3. Data import

In [5]:
# To run this cell, first upload sample_cod_completed_2025.csv to Kaggle as a dataset named completed-procedures-2025-27-12.
# To summarize the ongoing procedures instead, replace this path with the dataset that contains them.
df = pd.read_csv("/kaggle/input/completed-procedures-2025-27-12/sample_cod_completed_2025.csv")  

In [6]:
# Each summarization takes a long time, so I run this notebook on one part of the dataset at a time.
# The slice below keeps the rows from position 15 onward, i.e. the last 3 procedures. Change the slice to summarize other procedures.
df = df[15:] 
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,url,reference,title,subjects,key_players,key_events,documentation_gateway,transparency,...,Mandatory consultation of other institutions,Other legal basis,legislative_proposal_url,legislative_proposal_text,decisions_url_list,resolution_text_1,position_text_1,text_adopted_others_1,final_act_text,final_act_url
15,109,109,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0022(COD),Securities settlement in the EU and central se...,"2.50.03 Securities and financial markets, stoc...",{'European Parliament': [{'Committee responsib...,{'Legislative proposal published': {'Date': '1...,"[{'Institution': 'European Commission', 'Docum...",,...,[{'text': 'European Economic and Social Commit...,Rules of Procedure EP 165,https://eur-lex.europa.eu/legal-content/EN/TXT...,"EUROPEAN COMMISSION Brussels, 12.2.2025 COM(...",['https://www.europarl.europa.eu/doceo/documen...,European Parliament legislative resolution of ...,Position of the European Parliament adopted at...,,REGULATION (EU) 2025/2075 OF THE EUROPEAN PARL...,https://eur-lex.europa.eu/legal-content/EN/TXT...
16,115,115,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0074(COD),Extension of the timeframe for the establishme...,"7.30.30 Action to combat crime, 7.40.04 Judici...",{'European Parliament': [{'Committee responsib...,{'Legislative proposal published': {'Date': '0...,"[{'Institution': 'European Commission', 'Docum...",,...,,Rules of Procedure EP 165,https://eur-lex.europa.eu/legal-content/EN/TXT...,"EUROPEAN COMMISSION Brussels, 2.4.2025 COM(2...",['https://www.europarl.europa.eu/doceo/documen...,European Parliament legislative resolution of ...,Position of the European Parliament adopted at...,,REGULATION (EU) 2025/2082 OF THE EUROPEAN PARL...,https://eur-lex.europa.eu/legal-content/EN/TXT...
17,116,116,https://oeil.europarl.europa.eu/oeil/en/proced...,2025/0056(COD),Common rules for imports: suspension of certai...,"6.20.02 Export/import control, trade defence, ...",{'European Parliament': [{'Committee responsib...,{'Legislative proposal published': {'Date': '0...,"[{'Institution': 'European Commission', 'Docum...",,...,,,https://eur-lex.europa.eu/legal-content/EN/TXT...,"EUROPEAN COMMISSION Brussels, 7.3.2025 COM(2...",['https://www.europarl.europa.eu/doceo/documen...,European Parliament legislative resolution of ...,Position of the European Parliament adopted at...,,,


## 4. Functions to summarize the procedures

In [7]:
# Wrap llm generation into a function
def generation(prompt, max_new_tokens=300, temperature=0.1):
  """Generate a chat completion for `prompt` with the globally loaded model.

  The prompt is sent as the user turn of a two-message conversation whose
  system turn is "You are a journalist". Only the newly generated tokens are
  decoded and returned; the prompt tokens are stripped from the output.

  Args:
    prompt: User message to send to the model.
    max_new_tokens: Upper bound on the number of generated tokens.
    temperature: Sampling temperature (sampling is always enabled).

  Returns:
    The decoded completion, without special tokens.
  """
  # Fall back to EOS when the tokenizer defines no dedicated padding token
  pad_token_id = tokenizer.pad_token_id
  if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
  model.generation_config.pad_token_id = pad_token_id

  messages = [
      {"role": "system", "content": "You are a journalist"},
      {"role": "user", "content": prompt},
  ]
  # return_dict=True also yields the attention mask, so generate() does not
  # have to guess which positions are padding
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_dict=True,
  ).to(model.device)
  prompt_length = inputs["input_ids"].shape[1]

  outputs = model.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      temperature=temperature,
      do_sample=True,
  )

  # Keep only the tokens produced after the prompt
  result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

  return result

In [8]:
# Function to prompt phi-4 to summarize the first page of a legislative proposal
def paste_global_summary_first_page(first_page, legislative_proposal_title):
    """Ask the model for a plain-language summary of a proposal's first page.

    Args:
        first_page: Text of the first segment of the legislative proposal.
        legislative_proposal_title: Title of the procedure, quoted in the prompt.

    Returns:
        The text returned by `generation` for the assembled prompt.
    """
    first_page_prompt = f"""
You are a journalist explaining European legislation to the general public.

Below is the first page of the legislative proposal for the procedure "{legislative_proposal_title}" that has been submitted to the European Parliament:

---
{first_page}
---

Summarize it in one or a few coherent paragraphs.
Your summary should:
- Explain the adopted proposal's main points.
- Remain concise, neutral, and clear. Do not repeat yourself.
- Avoid quoting specific articles or amendments.  
- Clarify technical or institutional terms in simple language.
- Exclude legal boilerplate: Systematically remove standard legal clauses that do not provide specific content to this law.

Base your explanation only on the provided text. Do not take sides. Do not defend nor attack the adopted text.
Output: a summary, in one or a few coherent paragraphs.
"""

    # Hand the assembled prompt straight to the shared generation helper
    return generation(first_page_prompt)


In [9]:
# Function prompting phi-4 to iteratively summarize the next page of a legislative proposal with its previous summary
def paste_global_summary(legislative_proposal_title, previous_summary, new_text, i):
    """Ask the model to fold the next page of a proposal into the running summary.

    Args:
        legislative_proposal_title: Title of the procedure, quoted in the prompt.
        previous_summary: Summary covering the first `i` pages.
        new_text: Text of the next page (segment) to integrate.
        i: Number of pages already covered by `previous_summary`; the prompt
            asks for a summary covering `i + 1` pages.

    Returns:
        The text returned by `generation` for the assembled prompt.
    """
    prompt = f"""
You are a journalist explaining European legislation to the general public.

Below is the summary written so far, covering the first {i} pages of the proposal for the legislative procedure "{legislative_proposal_title}" that has been submitted to the European Parliament:

---
{previous_summary}
---

Here is the next page of the proposal:

---
{new_text}
---

Update and rewrite the summary so it now covers everything up to this point ({i+1} pages), in one or a few consistent paragraphs.
Your summary should:
- Explain the proposal's main points.
- Remain concise, neutral, and clear. Do not repeat yourself.
- Avoid quoting specific articles or amendments.  
- Clarify technical or institutional terms in simple language. 
- Exclude legal boilerplate: Systematically remove standard legal clauses that do not provide specific content to this law.


Base your summary only on the provided text. Do not take sides. Do not defend nor attack the adopted text.
Output: a summary, in one or a few consistent paragraphs.
"""

    result = generation(prompt)
    return result

In [10]:
# Function to split the legislative texts into chunks of 1000 words
def text_into_segments(text, max_words=1000):
    """Group the sentences of `text` into consecutive segments.

    Sentences are accumulated until adding the next one would push the
    running `word_tokenize` count above `max_words`; the next sentence then
    opens a new segment. A single sentence longer than `max_words` ends up
    alone in its own segment.

    Args:
        text: Text to split.
        max_words: Maximum token count per segment, as counted by `word_tokenize`.

    Returns:
        List of segment strings, each stripped of surrounding whitespace.
    """
    segments = []
    buffer = ""
    buffer_words = 0

    for sentence in sent_tokenize(text):
        n_words = len(word_tokenize(sentence))

        # The sentence still fits: extend the open segment and move on
        if buffer_words + n_words <= max_words:
            buffer = f"{buffer} {sentence}"
            buffer_words += n_words
            continue

        # Otherwise close the open segment (if it holds anything) and restart from this sentence
        if buffer:
            segments.append(buffer.strip())
        buffer, buffer_words = sentence, n_words

    # Flush whatever is left once every sentence has been consumed
    if buffer:
        segments.append(buffer.strip())

    return segments

In [11]:
def clean_and_parse_json(json_str):
    """Parse a JSON-like string into Python objects, repairing common defects.

    Parsing is attempted in three stages, from least to most invasive:
      1. strict JSON (`json.loads`);
      2. Python literal syntax (`ast.literal_eval`), which handles
         single-quoted strings, apostrophes inside values and `None`;
      3. the legacy textual repair (quote replacement, key quoting, trailing
         comma removal) followed by `json.loads`.

    Args:
        json_str: Raw cell content. Anything that is not a non-blank string
            (None, NaN, numbers, ...) is treated as missing.

    Returns:
        The parsed object, or None when the input is missing or cannot be parsed.
    """
    # Return None if the value is missing, not a string (e.g. NaN) or blank
    if not isinstance(json_str, str) or json_str.strip() == "":
        return None

    # Fix misplaced brackets like COM(2025]0513 so that it becomes COM(2025)0513
    repaired = re.sub(r'\((\d+)\]', r'(\1)', json_str)

    # Convert parentheses into square brackets only when wrapping objects
    repaired = re.sub(r'\(\s*({.*?})\s*,\s*({.*?})\s*\)', r'[\1, \2]', repaired)

    # Stage 1: the string may already be valid JSON
    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        pass

    # Stage 2: the string may be a Python literal; literal_eval only
    # evaluates literals, never arbitrary expressions
    try:
        return ast.literal_eval(repaired)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Stage 3: legacy textual repair for strings that are neither valid JSON nor a Python literal
    # Replace single quotes with double quotes (for valid JSON)
    legacy = repaired.replace("'", '"')

    # Add missing quotes around JSON keys
    legacy = re.sub(r'(?<={|,)\s*([a-zA-Z_]\w*)\s*(?=:)', r'"\1"', legacy)

    # Remove trailing commas before closing braces or brackets
    legacy = re.sub(r',\s*([}\]])', r'\1', legacy)

    try:
        return json.loads(legacy)
    except json.JSONDecodeError as e:
        # Print error details if parsing fails
        print(f"Parsing error after cleaning: {e}")
        print(f"Problematic string: {repr(legacy)}")
        return None

In [12]:
def get_first_date_index(df, index):
    """Return the 'Date' of the first entry in a row's 'documentation_gateway'.

    Args:
        df: DataFrame holding a 'documentation_gateway' column of JSON-like strings.
        index: Row label to look up.

    Returns:
        The 'Date' value of the first parsed entry, or None when the row is
        absent, the cell is missing or unparsable, the parsed value is not a
        non-empty list, or its first entry is not a dict / has no 'Date'.
    """
    # Check that the index exists in the DataFrame
    if index not in df.index:
        return None

    # Retrieve the 'documentation_gateway' field for the given index
    json_str = df.at[index, 'documentation_gateway']

    # Missing cells come back as NaN (a float), which cannot be parsed
    if not isinstance(json_str, str):
        return None

    list_dicts = clean_and_parse_json(json_str)

    # If the parsed value is empty, invalid or not a sequence of entries, return None
    if not isinstance(list_dicts, (list, tuple)) or not list_dicts:
        return None

    first_entry = list_dicts[0]
    if not isinstance(first_entry, dict):
        return None

    # Date of the first entry (None when that entry has no 'Date' key)
    return first_entry.get('Date')

In [13]:
def process_text(idx, df, output_path='cod_completed_proposal_general_summary_15_17.csv'):
    """Summarize one procedure's legislative proposal and store the result in `df`.

    The proposal text is split into segments; the first segment is summarized
    on its own and each following segment is folded into the running summary.
    The final summary is written to `df.at[idx, 'proposal_summary']` and the
    whole DataFrame is saved to `output_path`.

    Args:
        idx: Row label of the procedure in `df`.
        df: DataFrame with 'reference', 'title' and 'legislative_proposal_text'
            columns; it is modified in place.
        output_path: CSV file the DataFrame is written to after the summary.

    Returns:
        None on success, or a dict with 'title' and 'error' keys when the text
        is empty or an exception occurred.
    """
    # Resolve the title up front so the error branch can report it without raising again
    title = df.at[idx, 'title'] if (idx in df.index and 'title' in df.columns) else None

    try:
        reference = df.at[idx, 'reference']
        print(f"Processing procedure {reference}")

        # Retrieve the raw legislative text (missing values become an empty string)
        raw_text = df.at[idx, 'legislative_proposal_text']
        text = str(raw_text) if pd.notna(raw_text) else ""

        if text.strip() == "":
            print(f"Error : 'legislative_proposal_text' is empty for procedure {reference}")
            return {"title": title, "error": "'legislative_proposal_text' is empty"}

        # Split the legislative text into segments for processing
        segments = text_into_segments(text)

        # Initialize the summary text
        summary = ""

        # Loop through each text segment and build the global summary incrementally
        for i, segment in enumerate(segments):
            if i == 0:
                # Generate the first page summary
                summary = paste_global_summary_first_page(segment, title)
            else:
                # Continue the summary with additional segments
                updated_summary = paste_global_summary(title, summary, segment, i)
                if not updated_summary:
                    # Keep the previous summary when the model returns nothing
                    print(f"Summary failed for segment {i} of {title}")
                else:
                    summary = updated_summary

        # Save the generated summary back into the DataFrame
        df.at[idx, 'proposal_summary'] = summary
        # Save the updated DataFrame to CSV after each summary
        # NOTE(review): the row index is written as an extra unnamed column on every save
        df.to_csv(output_path)
        print("Summary done")

    except Exception as e:
        # Report the failure with enough context to find the row, then hand it back to the caller
        print(f"Error while processing index {idx}: {e}")
        return {
            "title": title,
            "error": str(e)
        }

In [14]:
# Iterate over all of the dataset's rows and process each legislative text
# process_text returns None on success and an error dict otherwise;
# keep the error dicts so failures are reported instead of silently dropped
failed_procedures = []
for idx in df.index:
    outcome = process_text(idx, df)
    if outcome is not None:
        failed_procedures.append(outcome)
    # Clear GPU memory after each iteration to prevent memory overflow
    torch.cuda.empty_cache()

print('Summarization finished')
if failed_procedures:
    print(f"{len(failed_procedures)} procedure(s) could not be summarized:")
    for failure in failed_procedures:
        print(f"- {failure.get('title')}: {failure.get('error')}")

Processing procedure 2025/0022(COD)
Summary done
Processing procedure 2025/0074(COD)
Summary done
Processing procedure 2025/0056(COD)
Summary done
Summarization finished
