In [2]:
%pip install requests beautifulsoup4 pandas numpy

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy
  Downloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25hDownloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
[2K   [38;2;114;1

In [11]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time

def download_submissions():
    """Download submission documents linked from the consultation listing pages.

    Requests the listing at ``base_url`` page by page (``?page=0``,
    ``?page=1``, ...) and saves every linked ``.pdf``, ``.doc`` or ``.docx``
    file into the local ``submissions`` directory, which is created if it is
    missing. A file whose name already exists on disk is not downloaded again,
    so the function can be re-run to resume an interrupted crawl.

    The crawl ends at the first page that cannot be fetched, that returns a
    non-200 status, or that links to no file name not already seen during
    this run.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Imported locally so the function is self-contained; only ``urljoin`` is
    # imported at the top of the cell.
    from urllib.parse import urlparse

    output_dir = 'submissions'
    document_extensions = ('.pdf', '.doc', '.docx')
    # Without a timeout a stalled server would hang the cell indefinitely.
    timeout_seconds = 30

    os.makedirs(output_dir, exist_ok=True)

    base_url = 'https://www.infrastructure.gov.au/have-your-say/new-acma-powers-combat-misinformation-and-disinformation'
    page = 0
    downloaded_files = set()  # File names already seen during this run

    while True:
        url = f"{base_url}?page={page}"
        print(f"Processing page {page}")

        try:
            response = requests.get(url, timeout=timeout_seconds)
        except requests.RequestException as e:
            # Treat a network failure like a bad status: report it and stop.
            print(f"Failed to access page {page}: {str(e)}")
            break
        if response.status_code != 200:
            print(f"Failed to access page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        found_files = False

        for link in soup.find_all('a', href=True):
            href = link['href']

            # Look only at the URL path: a substring test on the whole href
            # would match hosts or directories that merely contain ".doc",
            # and a query string would otherwise leak into the file name.
            try:
                url_path = urlparse(href).path
            except ValueError:
                continue  # Malformed href; nothing to download
            filename = os.path.basename(url_path)
            if not filename.lower().endswith(document_extensions):
                continue

            # Skip if we've already seen this file
            if filename in downloaded_files:
                continue

            found_files = True
            downloaded_files.add(filename)

            file_url = urljoin(base_url, href)
            file_path = os.path.join(output_dir, filename)

            # Skip if file already exists
            if os.path.exists(file_path):
                print(f"Skipping existing file: {filename}")
                continue

            try:
                print(f"Downloading: {filename}")
                file_response = requests.get(file_url, timeout=timeout_seconds)

                if file_response.status_code == 200:
                    # Write under a temporary name and rename afterwards, so an
                    # interrupted write never leaves a truncated file that the
                    # "already exists" check above would trust on the next run.
                    partial_path = file_path + '.part'
                    try:
                        with open(partial_path, 'wb') as f:
                            f.write(file_response.content)
                        os.replace(partial_path, file_path)
                    finally:
                        if os.path.exists(partial_path):
                            os.remove(partial_path)
                    print(f"Successfully downloaded: {filename}")
                    time.sleep(1)  # Be polite to the server between downloads
                else:
                    print(f"Failed to download {filename}")

            except Exception as e:
                # Best effort: one failed file must not abort the whole crawl.
                print(f"Error downloading {filename}: {str(e)}")

        if not found_files:
            print(f"No new files found on page {page}. Ending search.")
            break

        page += 1
        time.sleep(2)  # Pause between listing pages

# Start the crawl when this cell is executed. In a notebook kernel (or when
# the code is run as a script) __name__ is "__main__", so the guard passes.
if __name__ == "__main__":
    download_submissions()

Processing page 0
Skipping existing file: acma2023-32023-a-george.pdf
Skipping existing file: acma2023-28035-a-new-approach.pdf
Skipping existing file: acma2023-e3496-abby.pdf
Skipping existing file: acma2023-e3669-abc.pdf
Skipping existing file: acma2023-e2409-accan.pdf
Skipping existing file: acma2023-e2404-adam.pdf
Skipping existing file: acma2023-e248-adam-c-smith.pdf
Skipping existing file: acma2023-33256-adam-kachwalla.pdf
Skipping existing file: acma2023-e450-adam-sughito.pdf
Skipping existing file: acma2023-33320-adrian-adair.pdf
Processing page 1
Skipping existing file: acma2023-29852-adrian-bertino-clarke.pdf
Skipping existing file: acma2023-24668-adrian-giacobetti.pdf
Skipping existing file: acma2023-e3037-adrian-lee.pdf
Skipping existing file: acma2023-34086-aawaa.pdf
Skipping existing file: acma2023-e1805-ailsa-williams.pdf
Skipping existing file: acma2023-e2250-aisling-monahan.pdf
Skipping existing file: acma2023-21929-aj-hayes.pdf
Skipping existing file: acma2023-21789-a

In [8]:
%pip install pymupdf4llm --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
from pathlib import Path
import pymupdf4llm

def process_pdfs():
    """Convert each PDF in ``submissions`` to a Markdown file beside it.

    For ``<name>.pdf`` (extension matched case-insensitively) the converted
    text is written to ``<name>.md`` in the same directory. PDFs that already
    have a Markdown file are skipped, and a failure on one file is reported
    with ``print`` without stopping the remaining conversions.
    """
    source_dir = 'submissions'
    pdf_names = (
        entry for entry in os.listdir(source_dir)
        if entry.lower().endswith('.pdf')
    )

    for filename in pdf_names:
        stem, _extension = os.path.splitext(filename)
        markdown_path = os.path.join(source_dir, f"{stem}.md")

        # An existing Markdown file marks the PDF as already converted.
        if os.path.exists(markdown_path):
            print(f"Skipping already processed file: {filename}")
            continue

        print(f"Processing: {filename}")

        try:
            markdown = pymupdf4llm.to_markdown(os.path.join(source_dir, filename))

            # Encode before opening, so a failed encode never creates the file.
            payload = markdown.encode()
            with open(markdown_path, 'wb') as out:
                out.write(payload)
            print(f"Successfully processed {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Convert the downloaded PDFs when this cell is executed. In a notebook kernel
# (or when the code is run as a script) __name__ is "__main__".
if __name__ == "__main__":
    process_pdfs()

Processing: acma2023-26053-kaye-powell.pdf
Processing submissions/acma2023-26053-kaye-powell.pdf...
Successfully processed acma2023-26053-kaye-powell.pdf
Processing: acma2023-e3676-christine-mckinlay.pdf
Processing submissions/acma2023-e3676-christine-mckinlay.pdf...
Successfully processed acma2023-e3676-christine-mckinlay.pdf
Processing: acma2023-32808-peter-coventry.pdf
Processing submissions/acma2023-32808-peter-coventry.pdf...
Successfully processed acma2023-32808-peter-coventry.pdf
Processing: acma2023-22115-ray-gibson.pdf
Processing submissions/acma2023-22115-ray-gibson.pdf...
Successfully processed acma2023-22115-ray-gibson.pdf
Processing: acma2023-31473-susanne-mccully.pdf
Processing submissions/acma2023-31473-susanne-mccully.pdf...
Successfully processed acma2023-31473-susanne-mccully.pdf
Processing: acma2023-34609-brenton-moss.pdf
Processing submissions/acma2023-34609-brenton-moss.pdf...
Successfully processed acma2023-34609-brenton-moss.pdf
Processing: acma2023-13662-leon-ro

In [13]:
%pip install openai

Collecting openai
  Downloading openai-1.54.3-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.7.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-

In [6]:
import os
from pathlib import Path
import json
from openai import OpenAI
import time

def analyze_submission(client, text, model="gpt-4o-2024-08-06"):
    """Ask the chat model for a structured JSON analysis of one submission.

    Args:
        client: Object exposing ``chat.completions.create(...)``; it is called
            with the system prompt below and ``text`` as the user message.
        text: The submission text to analyze.
        model: Model identifier passed to the API. Defaults to the model the
            notebook was originally run with.

    Returns:
        The parsed JSON object returned by the model, or ``None`` when the
        text is empty, the API call fails, the response has no content, or
        the content is not valid JSON. Failures are reported with ``print``.
    """
    system_prompt = """You are analyzing submissions about the Australian Government's proposed legislation for new ACMA powers to combat misinformation and disinformation.
    For the given submission text, please analyze:
    1. Overall Position: Whether they support or oppose the legislation (or neutral/mixed)
    2. Key Arguments: Main points made in the submission
    3. Main Themes: Recurring themes or areas of focus
    4. Notable Quotes: Any particularly significant quotes that support the analysis
    5. Stakeholder Type: What type of stakeholder made this submission (e.g., tech company, civil society, academic, individual)
    
    Return your analysis in JSON format."""

    # Nothing to analyze (e.g. a PDF that yielded no extractable text): do not
    # spend an API call that could only return an invented analysis.
    if not text or not text.strip():
        print("Skipping analysis: submission text is empty")
        return None

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"},
            temperature=0.1,  # Lower temperature for more consistent analysis
            seed=1
        )
        content = response.choices[0].message.content
        if not content:
            # json.loads(None) would raise a confusing TypeError instead.
            print("Error in OpenAI API call: response contained no content")
            return None
        return json.loads(content)
    except json.JSONDecodeError as e:
        # The call itself succeeded; the returned text was not parseable JSON.
        print(f"Error parsing OpenAI response as JSON: {str(e)}")
        return None
    except Exception as e:
        print(f"Error in OpenAI API call: {str(e)}")
        return None

def process_submissions():
    """Analyze every Markdown submission and store the result as JSON.

    For each ``<name>.md`` in the ``submissions`` directory (processed in
    sorted order) the text is passed to ``analyze_submission`` and a truthy
    result is saved to ``<name>.json`` together with the source file name, a
    local timestamp and the model name. Files that already have a ``.json``
    are skipped, so the function can be re-run to resume. A failure on one
    file is reported with ``print`` and does not stop the remaining files.
    """
    client = OpenAI()

    # Process each markdown file in submissions directory
    submissions_dir = 'submissions'
    for filename in sorted(os.listdir(submissions_dir)):
        if not filename.lower().endswith('.md'):
            continue
        md_path = os.path.join(submissions_dir, filename)
        base_name = os.path.splitext(filename)[0]
        analysis_file = os.path.join(submissions_dir, f"{base_name}.json")

        # Skip if analysis already exists
        if os.path.exists(analysis_file):
            print(f"Skipping already analyzed file: {filename} ({analysis_file} exists)")
            continue

        print(f"Analyzing: {filename}")

        try:
            # Read markdown content
            with open(md_path, 'r', encoding='utf-8') as f:
                md_text = f.read()

            # Analyze with OpenAI
            analysis = analyze_submission(client, md_text)

            if analysis:
                # Save analysis with timestamp and model info
                output = {
                    'filename': filename,
                    'analysis_metadata': {
                        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                        'model': 'gpt-4o-2024-08-06'
                    },
                    'analysis': analysis
                }

                # Write to a temporary file and rename it into place. The
                # existence of the .json file is what marks a submission as
                # done, so a partially written file must never carry that name.
                tmp_file = analysis_file + '.tmp'
                try:
                    with open(tmp_file, 'w', encoding='utf-8') as f:
                        json.dump(output, f, ensure_ascii=False, indent=2)
                    os.replace(tmp_file, analysis_file)
                finally:
                    # Only still present if the write or rename failed.
                    if os.path.exists(tmp_file):
                        os.remove(tmp_file)
                print(f"Successfully analyzed {filename}, wrote to {analysis_file}")

            # Rate limiting
            time.sleep(1)

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {filename}: {str(e)}")

# Run the analysis when this cell is executed. In a notebook kernel (or when
# the code is run as a script) __name__ is "__main__".
if __name__ == "__main__":
    process_submissions()

Analyzing: acma2023-13421-darcy-schenk.md
Successfully analyzed acma2023-13421-darcy-schenk.md, wrote to submissions/acma2023-13421-darcy-schenk.json
Analyzing: acma2023-13438-anonymous.md
Successfully analyzed acma2023-13438-anonymous.md, wrote to submissions/acma2023-13438-anonymous.json
Analyzing: acma2023-13578-anonymous.md
Successfully analyzed acma2023-13578-anonymous.md, wrote to submissions/acma2023-13578-anonymous.json
Analyzing: acma2023-13579-anonymous.md
Successfully analyzed acma2023-13579-anonymous.md, wrote to submissions/acma2023-13579-anonymous.json
Analyzing: acma2023-13635-peter-cunningham.md
Successfully analyzed acma2023-13635-peter-cunningham.md, wrote to submissions/acma2023-13635-peter-cunningham.json
Analyzing: acma2023-13641-anonymous.md
Successfully analyzed acma2023-13641-anonymous.md, wrote to submissions/acma2023-13641-anonymous.json
Analyzing: acma2023-13644-rowan-karrer.md
Successfully analyzed acma2023-13644-rowan-karrer.md, wrote to submissions/acma202