In [7]:
from pathlib import Path

BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"

print(LAW)

C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law


In [12]:
import fitz  # PyMuPDF
import re
import pandas as pd
from pathlib import Path

# ---------------------------------------------------
# FIXED PATHS (original plan)
# ---------------------------------------------------
BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"

PDF1 = LAW / "ai_act_2021_proposal.pdf"
PDF2 = LAW / "ai_act_2024_ep_position.pdf"
PDF3 = LAW / "ai_act_2024_final_oj.pdf"

PDFS = {
    "proposal_2021": PDF1,
    "ep_2024": PDF2,
    "final_2024": PDF3
}

# ---------------------------------------------------
# STEP 1 — PDF → Raw Text
# ---------------------------------------------------
def pdf_to_text(path: Path) -> str:
    doc = fitz.open(path)
    pages = []
    for page in doc:
        pages.append(page.get_text("text"))
    doc.close()
    return "\n".join(pages)

# ---------------------------------------------------
# STEP 2 — Split Into Articles
# ---------------------------------------------------
ARTICLE_RE = re.compile(
    r"(Article\s+(\d+)[^\n]*)([\s\S]*?)(?=(Article\s+\d+)|\Z)",
    re.IGNORECASE
)

def split_by_article(txt: str, source: str):
    out = []
    for m in ARTICLE_RE.finditer(txt):
        out.append({
            "source": source,
            "article_number": m.group(2),
            "article_header": (m.group(1) or "").strip(),
            "article_text": (m.group(3) or "").strip()
        })
    return out

# ---------------------------------------------------
# STEP 3 — Remove Characters Illegal for Excel
# ---------------------------------------------------
def strip_illegal(s):
    if not isinstance(s, str):
        return s
    return "".join(ch for ch in s if ch in ("\n", "\t") or ord(ch) >= 32)

# ---------------------------------------------------
# MAIN RUN — EXACT ORIGINAL PLAN
# ---------------------------------------------------
rows = []

for label, pdf in PDFS.items():
    print(f"Extracting: {label}")
    raw = pdf_to_text(pdf)
    print(f"Text length: {len(raw)}")
    parts = split_by_article(raw, label)
    print(f"Articles found: {len(parts)}")
    rows.extend(parts)

df = pd.DataFrame(rows)

for col in ["article_header", "article_text"]:
    df[col] = df[col].astype(str).apply(strip_illegal)

# ---------------------------------------------------
# STEP 4 — Save One Clean CSV With Semicolon
# ---------------------------------------------------
OUT = LAW / "ai_act_articles_clean.csv"
df.to_csv(OUT, sep=";", index=False, encoding="utf-8-sig")

print("DONE. Saved to:", OUT)
print("Rows:", len(df))
print(df.head())


Extracting: proposal_2021
Text length: 315642
Articles found: 254
Extracting: ep_2024
Text length: 592676
Articles found: 522
Extracting: final_2024
Text length: 599918
Articles found: 528
DONE. Saved to: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles_clean.csv
Rows: 1304
          source article_number  \
0  proposal_2021            225   
1  proposal_2021            114   
2  proposal_2021             16   
3  proposal_2021            288   
4  proposal_2021              1   

                                      article_header  \
0                             Article 225 TFEU, this   
1                   Article 114 of the Treaty on the   
2                            Article 16 of the TFEU.   
3  Article 288 TFEU, will reduce legal fragmentat...   
4  Article 1), respect for private life and prote...   

                                        article_text  
0  3 \nEuropean Council, Special meeting of the E...  
1  Functioning of the European U

In [15]:
import fitz
import re
import pandas as pd
from pathlib import Path

BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"
OUT = LAW / "ai_act_articles.xlsx"

def clean_text_for_excel(text):
    if text is None:
        return None
    text = str(text)
    return re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", text)

def extract_articles(pdf_path, source_name):
    print("Processing:", pdf_path.name)
    doc = fitz.open(str(pdf_path))
    raw_text = " ".join(page.get_text() for page in doc)
    raw_text = raw_text.replace("\r", " ").replace("\n", " ")
    parts = re.split(r"(Article\s+\d+)", raw_text, flags=re.IGNORECASE)

    articles = []
    for i in range(1, len(parts), 2):
        number = parts[i].strip()
        text = parts[i+1].strip() if i+1 < len(parts) else ""
        articles.append({
            "source": source_name,
            "article_number": number,
            "article_text": text
        })
    return articles

pdf_files = {
    "Proposal_2021": LAW / "ai_act_2021_proposal.pdf",
    "EP_Position_2024": LAW / "ai_act_2024_ep_position.pdf",
    "Final_OJ_2024": LAW / "ai_act_2024_final_oj.pdf"
}

records = []
for label, pdf_path in pdf_files.items():
    records.extend(extract_articles(pdf_path, label))

df = pd.DataFrame(records)
df = df.applymap(clean_text_for_excel)
df.to_excel(OUT, index=False)
print("Saved:", OUT)


Processing: ai_act_2021_proposal.pdf
Processing: ai_act_2024_ep_position.pdf
Processing: ai_act_2024_final_oj.pdf
Saved: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles.xlsx


  df = df.applymap(clean_text_for_excel)


In [16]:
import re
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd

# -------------------------------------------------------------------
# Paths
# -------------------------------------------------------------------
BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"

pdf_files = {
    "Proposal_2021": LAW / "ai_act_2021_proposal.pdf",
    "EP_Position_2024": LAW / "ai_act_2024_ep_position.pdf",
    "Final_OJ_2024": LAW / "ai_act_2024_final_oj.pdf",
}

# -------------------------------------------------------------------
# Helpers
# -------------------------------------------------------------------

def strip_illegal(s: str) -> str:
    """Remove characters Excel / openpyxl rejects."""
    if not isinstance(s, str):
        return s
    return re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]", "", s)

def extract_articles_strict(pdf_path: Path, source_name: str):
    """
    Extract articles from a single AI Act PDF.

    Logic:
    - read page by page
    - normalize newlines and simple hyphenation
    - a line is an article heading iff:
        * it starts with 'Article <number>'
        * AND the first non-space character after the number is
          uppercase or a digit (so 'Article 62 shall apply...' is ignored)
    - everything until the next heading is that article's text
    """
    print(f"Processing: {pdf_path.name}")

    doc = fitz.open(str(pdf_path))
    lines = []

    for page in doc:
        text = page.get_text("text")
        text = text.replace("\r", "\n")
        # simple de-hyphenation
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        for ln in text.splitlines():
            lines.append(ln.rstrip())

    records = []
    current_art = None
    current_title = None
    buffer = []

    # heading: "Article 5", "Article 5 Subject matter", "Article 5 - Subject matter", etc.
    art_re = re.compile(r"^\s*Article\s+(\d+[A-Z]?)\b(?:\s*(.*))?$", re.IGNORECASE)

    for ln in lines:
        m = art_re.match(ln)
        if m:
            tail = (m.group(2) or "").strip(" .–-")

            # Guard: ignore cross-references like "Article 62 shall apply …"
            if tail and not (tail[0].isupper() or tail[0].isdigit()):
                if current_art is not None:
                    buffer.append(ln)
                continue

            # Flush previous article
            if current_art is not None:
                text_block = "\n".join(buffer).strip()
                if text_block:
                    records.append(
                        {
                            "source": source_name,
                            "article_number": current_art,
                            "article_title": strip_illegal(current_title),
                            "article_text": strip_illegal(text_block),
                        }
                    )

            # Start new article
            current_art = m.group(1).strip()
            current_title = tail if tail else None
            buffer = []
        else:
            if current_art is not None:
                buffer.append(ln)

    # Flush last article
    if current_art is not None and buffer:
        text_block = "\n".join(buffer).strip()
        if text_block:
            records.append(
                {
                    "source": source_name,
                    "article_number": current_art,
                    "article_title": strip_illegal(current_title),
                    "article_text": strip_illegal(text_block),
                }
            )

    return records

# -------------------------------------------------------------------
# Run extraction for all three PDFs
# -------------------------------------------------------------------
all_records = []
for label, pdf_path in pdf_files.items():
    all_records.extend(extract_articles_strict(pdf_path, label))

articles_df = pd.DataFrame(all_records)

# Optional: sort for sanity
articles_df = articles_df.sort_values(["source", "article_number"], key=lambda s: s.astype(str))

print(articles_df.groupby("source")["article_number"].nunique())
print(articles_df.head())

# -------------------------------------------------------------------
# Save to Excel
# -------------------------------------------------------------------
OUT_XLSX = LAW / "ai_act_articles_clean.xlsx"
articles_df.to_excel(OUT_XLSX, index=False)
print("Saved:", OUT_XLSX)


Processing: ai_act_2021_proposal.pdf
Processing: ai_act_2024_ep_position.pdf
Processing: ai_act_2024_final_oj.pdf
source
EP_Position_2024    113
Final_OJ_2024       113
Proposal_2021        85
Name: article_number, dtype: int64
               source article_number article_title  \
89   EP_Position_2024              1          None   
99   EP_Position_2024             10          None   
194  EP_Position_2024            100          None   
195  EP_Position_2024            101          None   
196  EP_Position_2024            102          None   

                                          article_text  
89   Subject matter`\n1.\nThe purpose of this Regul...  
99   Data and data governance\n1.\nHigh-risk AI sys...  
194  Administrative fines on Union institutions, bo...  
195  Fines for providers of general-purpose AI mode...  
196  Amendment to Regulation (EC) No 300/2008\nIn A...  
Saved: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles_clean.xlsx


In [17]:
import fitz  # PyMuPDF
import re
import pandas as pd
from pathlib import Path

# ---------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------
BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"
OUT = LAW / "ai_act_articles_clean.xlsx"

pdf_files = {
    "proposal_2021": LAW / "ai_act_2021_proposal.pdf",
    "ep_position_2024": LAW / "ai_act_2024_ep_position.pdf",
    "final_oj_2024": LAW / "ai_act_2024_final_oj.pdf",
}

# ---------------------------------------------------------------------
# Utility: Clean text to avoid Excel corruption
# ---------------------------------------------------------------------
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    # Remove illegal Excel characters
    s = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", s)
    # Normalize whitespace
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    # Fix hyphenation across line breaks
    s = re.sub(r"-\s*\n\s*", "", s)
    return s.strip()


# ---------------------------------------------------------------------
# Heading detector (bulletproof)
# ---------------------------------------------------------------------
def is_article_heading(line: str) -> re.Match:
    """
    TRUE heading examples:
        Article 5
        Article 5 Definitions
        Article 5 – Definitions
        Article 5: Definitions

    FALSE headings (should be ignored):
        In accordance with Article 5…
        Article 14 shall apply…
        Article 3 may be used…

    Rule:
    - Must start at line beginning
    - Title must start with capital letter OR be empty
    - Next word must NOT be: shall, may, does, is, are, must, should
    """

    pattern = r"^(Article)\s+(\d+)\s*(.*)$"
    m = re.match(pattern, line.strip())
    if not m:
        return None

    title = m.group(3).strip()

    # If title begins with lowercase verb → reject
    forbidden_starts = ("shall", "may", "does", "is", "are", "must", "should", "be", "has", "have")
    if title:
        first = title.split()[0].lower()
        if first in forbidden_starts:
            return None

        # If starts with lowercase letter → reject
        if first[0].islower():
            return None

    return m


# ---------------------------------------------------------------------
# Extraction function
# ---------------------------------------------------------------------
def extract_articles_from_pdf(pdf_path: Path, source_name: str):
    print(f"\nExtracting from: {pdf_path.name}")
    doc = fitz.open(pdf_path)

    all_text = ""
    for page in doc:
        all_text += page.get_text()

    raw_lines = all_text.split("\n")

    articles = []
    current_article = None
    current_title = ""
    body_lines = []

    for line in raw_lines:

        m = is_article_heading(line)
        if m:
            # Save previous article before opening new one
            if current_article is not None:
                articles.append({
                    "source": source_name,
                    "article_number": current_article,
                    "article_title": current_title,
                    "article_text": clean_text("\n".join(body_lines))
                })

            current_article = m.group(2)
            current_title = m.group(3).strip()
            body_lines = []
            continue

        # Otherwise: normal body text
        if current_article is not None:
            body_lines.append(line)

    # Close final article
    if current_article is not None:
        articles.append({
            "source": source_name,
            "article_number": current_article,
            "article_title": current_title,
            "article_text": clean_text("\n".join(body_lines))
        })

    return articles


# ---------------------------------------------------------------------
# RUN EXTRACTION
# ---------------------------------------------------------------------
records = []
for label, pdf_path in pdf_files.items():
    records.extend(extract_articles_from_pdf(pdf_path, label))

articles_df = pd.DataFrame(records)

# Safety cleaning for Excel
articles_df["article_text"] = articles_df["article_text"].apply(clean_text)
articles_df["article_title"] = articles_df["article_title"].apply(clean_text)

# ---------------------------------------------------------------------
# SAVE AS EXCEL
# ---------------------------------------------------------------------
articles_df.to_excel(OUT, index=False)
print("\nSaved to:", OUT)
print("Rows extracted:", len(articles_df))

# Quick check
print(articles_df.groupby("source")["article_number"].nunique())



Extracting from: ai_act_2021_proposal.pdf

Extracting from: ai_act_2024_ep_position.pdf

Extracting from: ai_act_2024_final_oj.pdf

Saved to: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles_clean.xlsx
Rows extracted: 375
source
ep_position_2024    113
final_oj_2024       113
proposal_2021        85
Name: article_number, dtype: int64


In [18]:
import pandas as pd
from pathlib import Path

# Paths
BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"
IN_XLSX = LAW / "ai_act_articles_clean.xlsx"
OUT_XLSX = LAW / "ai_act_articles_collapsed.xlsx"

# Load
df = pd.read_excel(IN_XLSX, dtype=str)

# Ensure no NaNs and compute text length
df["article_text"] = df["article_text"].fillna("")
df["article_title"] = df["article_title"].fillna("")
df["text_len"] = df["article_text"].str.len()

# Sort so the longest text per (source, article_number) comes first
df_sorted = df.sort_values(["source", "article_number", "text_len"], ascending=[True, True, False])

# Collapse: take the first (i.e. longest) row per (source, article_number)
collapsed = (
    df_sorted
    .groupby(["source", "article_number"], as_index=False)
    .agg({
        "article_title": "first",
        "article_text": "first"
    })
)

# Optional: sort for sanity
collapsed = collapsed.sort_values(["source", "article_number"], key=lambda s: s.astype(str))

# Save
collapsed.to_excel(OUT_XLSX, index=False)
print("Saved:", OUT_XLSX)
print("Rows before:", len(df), "→ after collapse:", len(collapsed))
print(collapsed.groupby("source")["article_number"].nunique())


Saved: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles_collapsed.xlsx
Rows before: 375 → after collapse: 311
source
ep_position_2024    113
final_oj_2024       113
proposal_2021        85
Name: article_number, dtype: int64


In [19]:
def guess_selective(text):
    text_l = text.lower()
    if "risk" in text_l: return "Risk Classification"
    if "conform" in text_l or "technical" in text_l: return "Technical Requirements"
    if "oversight" in text_l or "human" in text_l: return "Human Oversight"
    if "authority" in text_l or "board" in text_l: return "Governance & Oversight"
    if "fundamental" in text_l or "rights" in text_l: return "Fundamental Rights & Ethics"
    if "transparen" in text_l or "information" in text_l: return "Transparency & Information"
    if "provider" in text_l or "deployer" in text_l: return "Market Access & Obligations"
    if "sandbox" in text_l or "innovation" in text_l: return "Innovation & Exceptions"
    if "penalt" in text_l or "enforcement" in text_l: return "Enforcement & Penalties"
    return "Scope & Definitions"

df["selective_code"] = df["article_text"].fillna("").apply(guess_selective)
df.to_excel(OUTFILE, index=False)


In [20]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis")
LAW = BASE / "data" / "law"
INFILE = LAW / "ai_act_articles_collapsed.xlsx"
OUTFILE = LAW / "ai_act_articles_coding_template.xlsx"

df = pd.read_excel(INFILE, dtype=str)

# Add coding columns
df["selective_code"] = ""
df["axial_problem"] = ""
df["axial_mechanism"] = ""
df["axial_actor"] = ""
df["axial_outcome"] = ""
df["notes"] = ""

df.to_excel(OUTFILE, index=False)
print("Coding template saved to:", OUTFILE)


Coding template saved to: C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\law\ai_act_articles_coding_template.xlsx
