In [38]:
import re
import json
import pandas as pd
from IPython.display import display, HTML
import PyPDF2
from pdfminer.high_level import extract_text
import os

In [30]:
def load_raw_text(file_path):
    """
    Load raw text from the gdpr text and display basic statistics
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    print(f"Loaded text: {len(text)} characters, {len(text.split())} words")
    print(f"First 200 Characters: {text[:200]}...")
    return text

In [31]:
def clean_gdpr_text(raw_text):
    """
    Clean the GDPPR text by removing headers, footers, and normalising structure
    """

    # Remove headers and footers
    cleaned_text = re.sub(r'\d+\.\d+\.\d+\s*L\s*\d+/\d+\s*Official Journal of the European Union\s*EN', '', raw_text)

    # Remove page numbers
    cleaned_text = re.sub(r'\n\s*\d+\s*\n', '\n', cleaned_text)

    # Normalise whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Make sure chapter and article headings stand out
    cleaned_text = re.sub(r'(CHAPTER [IVX]+)', r'\n\n\1', cleaned_text)
    cleaned_text = re.sub(r'(ARTICLE \d+)', r'\n\n\1', cleaned_text)

    print(f"Cleaned text: {len(cleaned_text)} characters")
    print(f"Sample of cleaned text: {cleaned_text[1000: 1200]}...")

    return cleaned_text

In [32]:
def parse_gdpr_text(cleaned_text):
    """
    Parse the GDPR text line by line to build the heirarchical structure
    """
    lines = cleaned_text.split('\n')
    gdpr_data = {
        "title": "General Data Protection Regulation",
        "chapters": {}
    }

    current_chapter = None
    current_article = None
    aritcle_title = ""
    article_content = []

    for line in lines:
        line = line.strip()

        if not line or "Official Journal" in line:
            continue

        # Check for chapter heading
        chapter_match = re.match(r'CHAPTER ([IVX]+)\s*(.*)', line)
        if chapter_match:
            chapter_num = chapter_match.group(1)
            chapter_title = chapter_match.group(2).strip()

            # Save the current chapter
            current_chapter = chapter_num

            gdpr_data["chapters"][chapter_num] = {
                "title": chapter_title,
                "articles": {}
            }

            continue

        # Check for article heading
        article_match = re.match(r'Article (\d+)\s*(.*)', line)
        if article_match:
            # Save previous article if exists
            if current_article and current_chapter and article_content:
                gdpr_data["chapters"][current_chapter]["articles"][current_article] = {
                    "title": aritcle_title,
                    "content": "\n".join(article_content),
                    "paragraphs": []
                }

            # Start new article 
            current_article = article_match.group(1)
            aritcle_title = article_match.group(2).strip()
            article_content = []
            continue

        # Add content to current article
        if current_article and current_chapter:
            article_content.append(line)

    # Save the last article
    if current_article and current_chapter and article_content:
        gdpr_data["chapters"][current_chapter]["articles"][current_article] = {
            "title": aritcle_title,
            "content": "\n".join(article_content),
            "paragraphs": []
        }

    # Extract paragraphs for each article
    for chapter in gdpr_data["chapters"].values():
        for article in chapter["articles"].values():
            content = article["content"]
            paragraph_matches = re.finditer(r'(\d+)\.[\s]+(.*?)(?=\n\d+\.|\Z)', content, re.DOTALL)
            paragraphs = []
            for match in paragraph_matches:
                paragraph_num = match.group(1)
                paragraph_content = match.group(2).strip()
                paragraphs.append({
                    "number": paragraph_num,
                    "text": paragraph_content
                })

            article["paragraphs"] = paragraphs

    return gdpr_data  

In [33]:
# Step 4: Alternative parsing approach (in case the first one doesn't work well)
def parse_gdpr_alternative(cleaned_text):
    """Alternative parsing method that focuses on article extraction first"""
    gdpr_data = {
        "title": "General Data Protection Regulation",
        "chapters": {}
    }
    
    # Extract all articles first
    article_pattern = r'Article (\d+)\s*([^\n]*)\n(.*?)(?=Article \d+|$)'
    articles = re.finditer(article_pattern, cleaned_text, re.DOTALL)
    
    all_articles = {}
    for match in articles:
        article_num = match.group(1)
        article_title = match.group(2).strip()
        article_content = match.group(3).strip()
        
        all_articles[article_num] = {
            "title": article_title,
            "content": article_content,
            "paragraphs": []
        }
        
        # Extract paragraphs
        paragraph_pattern = r'(\d+)\.\s+(.*?)(?=\n\d+\.\s+|\Z)'
        paragraphs = re.finditer(paragraph_pattern, article_content, re.DOTALL)
        
        for para_match in paragraphs:
            para_num = para_match.group(1)
            para_text = para_match.group(2).strip()
            
            all_articles[article_num]["paragraphs"].append({
                "number": para_num,
                "text": para_text
            })
    
    # Try to determine chapter structure
    chapter_pattern = r'CHAPTER ([IVX]+)\s*([^\n]*)'
    chapters = re.finditer(chapter_pattern, cleaned_text)
    
    # Create a list of chapter start positions
    chapter_positions = []
    for match in chapters:
        chapter_num = match.group(1)
        chapter_title = match.group(2).strip()
        position = match.start()
        
        chapter_positions.append((position, chapter_num, chapter_title))
    
    # Sort by position
    chapter_positions.sort()
    
    # If no chapters found, use fallback structure
    if not chapter_positions:
        gdpr_data["articles"] = all_articles
        return gdpr_data
    
    # Assign articles to chapters based on their position in the text
    for i, (position, chapter_num, chapter_title) in enumerate(chapter_positions):
        next_position = float('inf')
        if i + 1 < len(chapter_positions):
            next_position = chapter_positions[i + 1][0]
        
        gdpr_data["chapters"][chapter_num] = {
            "title": chapter_title,
            "articles": {}
        }
        
        # Find articles that belong to this chapter
        for article_num, article_data in all_articles.items():
            # Find position of this article in the text
            article_match = re.search(f'Article {article_num}\\s', cleaned_text)
            if article_match and article_match.start() >= position and article_match.start() < next_position:
                gdpr_data["chapters"][chapter_num]["articles"][article_num] = article_data
    
    return gdpr_data

In [34]:
# Step 5: Save the structured data to JSON
def save_json(data, file_path):
    """Save structured data to a JSON file"""
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    
    print(f"Data saved to {file_path}")
    
    # Check data structure
    if "chapters" in data:
        chapter_count = len(data["chapters"])
        article_count = sum(len(chapter["articles"]) for chapter in data["chapters"].values())
        
        print(f"Structure contains {chapter_count} chapters and {article_count} articles")
        
        # Show a sample chapter and article
        if chapter_count > 0:
            sample_chapter_key = list(data["chapters"].keys())[0]
            sample_chapter = data["chapters"][sample_chapter_key]
            
            print(f"\nSample Chapter {sample_chapter_key}: {sample_chapter['title']}")
            
            if sample_chapter["articles"]:
                sample_article_key = list(sample_chapter["articles"].keys())[0]
                sample_article = sample_chapter["articles"][sample_article_key]
                
                print(f"Sample Article {sample_article_key}: {sample_article['title']}")
                print(f"Content preview: {sample_article['content'][:150]}...")
    else:
        article_count = len(data.get("articles", {}))
        print(f"Structure contains {article_count} articles (no chapters found)")
    
    return data

In [35]:
# Step 6: Create a search function
def search_gdpr(gdpr_data, query):
    """Search the GDPR data for a specific query term"""
    query = query.lower()
    results = []
    
    if "chapters" in gdpr_data:
        for chapter_key, chapter in gdpr_data["chapters"].items():
            for article_key, article in chapter["articles"].items():
                content = article["content"].lower()
                
                if query in content or query in article["title"].lower():
                    # Find context around the match
                    index = content.find(query)
                    if index >= 0:
                        start = max(0, index - 50)
                        end = min(len(content), index + len(query) + 50)
                        preview = "..." + content[start:end] + "..."
                    else:
                        preview = content[:100] + "..."
                    
                    results.append({
                        "chapter": chapter_key,
                        "chapter_title": chapter["title"],
                        "article": article_key,
                        "article_title": article["title"],
                        "preview": preview
                    })
    else:
        # Fallback for structure without chapters
        for article_key, article in gdpr_data.get("articles", {}).items():
            content = article["content"].lower()
            
            if query in content or query in article["title"].lower():
                index = content.find(query)
                if index >= 0:
                    start = max(0, index - 50)
                    end = min(len(content), index + len(query) + 50)
                    preview = "..." + content[start:end] + "..."
                else:
                    preview = content[:100] + "..."
                
                results.append({
                    "chapter": "N/A",
                    "chapter_title": "N/A",
                    "article": article_key,
                    "article_title": article["title"],
                    "preview": preview
                })
    
    return results

In [36]:
# Step 7: Display search results in a tabular format
def display_search_results(results):
    """Display search results in a formatted table"""
    if not results:
        return "No results found."
    
    df = pd.DataFrame(results)
    
    # Format the DataFrame
    df = df[['chapter', 'chapter_title', 'article', 'article_title', 'preview']]
    df.columns = ['Chapter', 'Chapter Title', 'Article', 'Article Title', 'Context']
    
    return df

In [37]:
# Step 8: Main processing function
def process_gdpr_document(input_file_path, output_file_path):
    """Process the GDPR document from raw text to structured JSON"""
    # Load raw text
    print("Step 1: Loading raw text...")
    raw_text = load_raw_text(input_file_path)
    
    # Clean the text
    print("\nStep 2: Cleaning text...")
    cleaned_text = clean_gdpr_text(raw_text)
    
    # Parse the text using both methods
    print("\nStep 3: Parsing text (method 1)...")
    gdpr_data1 = parse_gdpr_text(cleaned_text)
    
    print("\nStep 3 (alt): Parsing text (method 2)...")
    gdpr_data2 = parse_gdpr_alternative(cleaned_text)
    
    # Determine which method worked better
    articles1 = sum(len(chapter["articles"]) for chapter in gdpr_data1["chapters"].values()) if "chapters" in gdpr_data1 else len(gdpr_data1.get("articles", {}))
    articles2 = sum(len(chapter["articles"]) for chapter in gdpr_data2["chapters"].values()) if "chapters" in gdpr_data2 else len(gdpr_data2.get("articles", {}))
    
    print(f"\nMethod 1 found {articles1} articles")
    print(f"Method 2 found {articles2} articles")
    
    # Use the method that found more articles
    gdpr_data = gdpr_data1 if articles1 >= articles2 else gdpr_data2
    
    # Save the structured data
    print("\nStep 4: Saving structured data...")
    save_json(gdpr_data, output_file_path)
    
    return gdpr_data

In [40]:
def extract_with_pypdf2(pdf_path):
    """ 
    Extract text from PDF using PyPDF2
    """
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

In [41]:
def extract_with_pdfminer(pdf_path):
    """ 
    Extract text from PDF using pdfminer.six (usually better quality)
    """
    return extract_text(pdf_path)

In [42]:
def extract_with_ocr(pdf_path):
    """
    Extract text from PDF using OCR (for scanned documents)
    """
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError:
        print("Please install pdf2image and pytesseract")
        return None
    
    print("Converting PDF to images (this may take a while)...")
    images = convert_from_path(pdf_path)

    print("Extracting text from images...")
    text = ""
    for i, image in enumerate(images):
        print(f"Processing image {i+1}/{len(images)}...")
        text += pytesseract.image_to_string(image)

    return text




In [43]:
def extract_pdf_text(pdf_path, output_path=None, method='pdfminer'):
    """
    Extract text from PDF file using specified method and save to text file.
    
    Args:
        pdf_path: Path to the PDF file
        output_path: Path to save the text output (default: same as PDF with .txt extension)
        method: Extraction method ('pypdf2', 'pdfminer', or 'ocr')
    
    Returns:
        Path to the saved text file
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    
    # Default output path
    if output_path is None:
        output_path = os.path.splitext(pdf_path)[0] + '.txt'
    
    print(f"Extracting text from {pdf_path} using {method} method...")
    
    # Extract text using selected method
    if method == 'pypdf2':
        text = extract_with_pypdf2(pdf_path)
    elif method == 'pdfminer':
        text = extract_with_pdfminer(pdf_path)
    elif method == 'ocr':
        text = extract_with_ocr(pdf_path)
    else:
        raise ValueError(f"Unknown extraction method: {method}")
    
    # Save extracted text
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    
    print(f"Extraction complete! Text saved to {output_path}")
    print(f"Extracted {len(text)} characters, {len(text.split())} words")
    
    return output_path

In [44]:
def try_all_methods(pdf_path):
    """Try all extraction methods and save the best result"""
    base_name = os.path.splitext(pdf_path)[0]
    
    # Try all methods
    methods = ['pypdf2', 'pdfminer', 'ocr']
    outputs = {}
    
    for method in methods:
        try:
            output_path = f"{base_name}_{method}.txt"
            extract_pdf_text(pdf_path, output_path, method)
            
            # Get file size
            size = os.path.getsize(output_path)
            outputs[method] = (output_path, size)
        except Exception as e:
            print(f"Error with {method} method: {e}")
    
    # Find the method that produced the largest file (likely contains most content)
    if outputs:
        best_method = max(outputs.items(), key=lambda x: x[1][1])[0]
        best_path = outputs[best_method][0]
        
        # Rename to standard file
        final_path = f"{base_name}.txt"
        os.replace(best_path, final_path)
        
        print(f"\nBest extraction method was {best_method}, saved as {final_path}")
        return final_path
    else:
        print("All extraction methods failed!")
        return None

In [53]:
pdf_file = "GDPR_EN.pdf"
extract_pdf_text(pdf_file, method='pypdf2', output_path='gdpr_raw_text.txt')

Extracting text from GDPR_EN.pdf using pypdf2 method...
Extraction complete! Text saved to gdpr_raw_text.txt
Extracted 366545 characters, 61941 words


'gdpr_raw_text.txt'

In [54]:
input_file_path = 'gdpr_raw_text.txt'
output_file_path = 'gdpr_structured.json'

In [55]:
gdpr_data = process_gdpr_document(input_file_path, output_file_path)

Step 1: Loading raw text...
Loaded text: 366545 characters, 61941 words
First 200 Characters: I 
(Legislativ e acts) 
REGUL ATIONS 
REGUL ATION (EU) 2016/679 OF THE EUR OPEAN PARLIAMENT AND OF THE COUNCIL 
of 27 Apr il 2016 
on the protection of natural persons with regard to the processing of...

Step 2: Cleaning text...
Cleaned text: 361981 characters
Sample of cleaned text: e 8(1) of the Char ter of Fundamental Rights of the European Union (the ‘Char ter’) and Article 16(1) of the Treaty on the Functioning of the European Union (TFEU) provide that ever yone has the right...

Step 3: Parsing text (method 1)...

Step 3 (alt): Parsing text (method 2)...

Method 1 found 0 articles
Method 2 found 7 articles

Step 4: Saving structured data...
Data saved to gdpr_structured.json
Structure contains 9 chapters and 7 articles

Sample Chapter I: Gener al provisions Article 1 Subject-matter and objectiv es 1. This Regulation lays down rules relating to the prote ction of natural persons with reg