In [2]:
%pip install requests beautifulsoup4 pandas numpy openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Download submission files (PDF and Word documents) into the submissions/ directory

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time

def download_submissions():
    """Download every PDF/Word submission linked from the consultation pages.

    Walks the paginated listing (``?page=0``, ``?page=1``, ...) and saves each
    linked ``.pdf``/``.doc``/``.docx`` file into ``submissions/``. Paging stops
    at the first page that cannot be fetched, or that links to no document
    which has not already been seen during this run.

    Files already present on disk are not downloaded again. A failure on an
    individual file is reported and skipped so one bad link does not abort the
    crawl.

    Returns:
        None. Progress is reported on stdout.
    """
    # Local import: only this function needs these URL helpers.
    from urllib.parse import unquote, urlsplit

    os.makedirs('submissions', exist_ok=True)

    base_url = 'https://www.infrastructure.gov.au/have-your-say/new-acma-powers-combat-misinformation-and-disinformation'
    document_suffixes = ('.pdf', '.doc', '.docx')
    timeout_seconds = 30  # without a timeout a stalled connection blocks forever
    page = 0
    downloaded_files = set()  # filenames already seen during this run

    while True:
        url = f"{base_url}?page={page}"
        print(f"Processing page {page}")

        try:
            response = requests.get(url, timeout=timeout_seconds)
        except requests.RequestException as e:
            print(f"Failed to access page {page}: {str(e)}")
            break
        if response.status_code != 200:
            print(f"Failed to access page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        found_files = False

        for link in soup.find_all('a', href=True):
            href = link['href']

            # Decide on the decoded URL *path* only: this ignores query strings
            # and fragments, and turns "%20" back into the space that the
            # published filename actually contains.
            try:
                url_path = unquote(urlsplit(href).path)
            except ValueError:
                continue  # malformed href; nothing downloadable here
            if not url_path.lower().endswith(document_suffixes):
                continue

            file_url = urljoin(base_url, href)
            filename = os.path.basename(url_path)
            file_path = os.path.join('submissions', filename)

            # Skip if we've already seen this file
            if filename in downloaded_files:
                continue

            found_files = True
            downloaded_files.add(filename)

            # Skip if file already exists
            if os.path.exists(file_path):
                print(f"Skipping existing file: {filename}")
                continue

            try:
                print(f"Downloading: {filename}")
                file_response = requests.get(file_url, timeout=timeout_seconds)

                if file_response.status_code == 200:
                    # Write under a temporary name and rename afterwards, so an
                    # interrupted write never leaves a truncated file that the
                    # "already exists" check above would treat as complete.
                    partial_path = file_path + '.part'
                    with open(partial_path, 'wb') as f:
                        f.write(file_response.content)
                    os.replace(partial_path, file_path)
                    print(f"Successfully downloaded: {filename}")
                    time.sleep(1)  # pause between downloads to go easy on the server
                else:
                    print(f"Failed to download {filename}")

            except Exception as e:
                # Best effort: report and carry on with the next link.
                print(f"Error downloading {filename}: {str(e)}")

        if not found_files:
            print(f"No new files found on page {page}. Ending search.")
            break

        page += 1
        time.sleep(2)  # pause between listing pages

# Run the submissions download only when this code is executed as the main program.
if __name__ == "__main__":
    download_submissions()

In [4]:
# Download the tranche Excel files into the tranches/ directory

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time

def download_tranches():
    """Download the tranche spreadsheets (``.xlsx``) linked from the consultation page.

    Fetches the consultation landing page once and saves every linked
    ``.xlsx`` file into ``tranches/``. Files already present on disk are left
    alone. A failure on an individual file is reported and skipped.

    Returns:
        None. Progress is reported on stdout.
    """
    # Local import: only this function needs these URL helpers.
    from urllib.parse import unquote, urlsplit

    os.makedirs('tranches', exist_ok=True)

    base_url = 'https://www.infrastructure.gov.au/have-your-say/new-acma-powers-combat-misinformation-and-disinformation'
    timeout_seconds = 30  # without a timeout a stalled connection blocks forever

    try:
        response = requests.get(base_url, timeout=timeout_seconds)
    except requests.RequestException as e:
        print(f"Failed to access page: {str(e)}")
        return
    if response.status_code != 200:
        print(f"Failed to access page. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a', href=True):
        href = link['href']

        # Decide on the decoded URL *path* only, so query strings, fragments
        # and percent-encoding never end up in the saved filename.
        try:
            url_path = unquote(urlsplit(href).path)
        except ValueError:
            continue  # malformed href; nothing downloadable here
        if not url_path.lower().endswith('.xlsx'):
            continue

        file_url = urljoin(base_url, href)
        filename = os.path.basename(url_path)
        file_path = os.path.join('tranches', filename)

        # Skip if file already exists
        if os.path.exists(file_path):
            print(f"Skipping existing file: {filename}")
            continue

        try:
            print(f"Downloading: {filename}")
            file_response = requests.get(file_url, timeout=timeout_seconds)

            if file_response.status_code == 200:
                # Write under a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated spreadsheet that
                # the "already exists" check above would treat as complete.
                partial_path = file_path + '.part'
                with open(partial_path, 'wb') as f:
                    f.write(file_response.content)
                os.replace(partial_path, file_path)
                print(f"Successfully downloaded: {filename}")
                time.sleep(1)  # pause between downloads to go easy on the server
            else:
                print(f"Failed to download {filename}")

        except Exception as e:
            # Best effort: report and carry on with the next link.
            print(f"Error downloading {filename}: {str(e)}")

# Run the tranche download only when this code is executed as the main program.
if __name__ == "__main__":
    download_tranches()

Downloading: acma2023-tranche-1-15sep23.xlsx
Successfully downloaded: acma2023-tranche-1-15sep23.xlsx
Downloading: acma2023-tranche-2-22sep23.xlsx
Successfully downloaded: acma2023-tranche-2-22sep23.xlsx
Downloading: acma2023-tranche-3-28sep23.xlsx
Successfully downloaded: acma2023-tranche-3-28sep23.xlsx
Downloading: acma2023-tranche-4-5oct23_0.xlsx
Successfully downloaded: acma2023-tranche-4-5oct23_0.xlsx
Downloading: acma2023-tranche-5-23oct23.xlsx
Successfully downloaded: acma2023-tranche-5-23oct23.xlsx
Downloading: acma2023-tranche-6-13nov23.xlsx
Successfully downloaded: acma2023-tranche-6-13nov23.xlsx
Downloading: acma2023-tranche-7-11dec23.xlsx
Successfully downloaded: acma2023-tranche-7-11dec23.xlsx
Downloading: acma2023-tranche-8-18dec23.xlsx
Successfully downloaded: acma2023-tranche-8-18dec23.xlsx
Downloading: acma2023-tranche-9-21dec23.xlsx
Successfully downloaded: acma2023-tranche-9-21dec23.xlsx


In [9]:
# Convert the tranche Excel files to JSON

import pandas as pd
import os
import json
from datetime import datetime
import re

def parse_tranche_date(filename):
    """Extract the date embedded in a tranche filename as an ISO ``YYYY-MM-DD`` string.

    The date is expected in the form ``-<day><month abbreviation><year>``, for
    example ``-15sep23`` or ``-5oct23``. Any English three-letter month
    abbreviation is accepted (case-insensitively), and the year may have two
    digits (interpreted as 20xx) or four.

    Args:
        filename: Tranche filename, e.g. ``"acma2023-tranche-1-15sep23.xlsx"``.

    Returns:
        The date as ``"YYYY-MM-DD"``, or ``None`` when the filename contains no
        such date or the matched day/month/year is not a real calendar date.
    """
    month_numbers = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }
    # (?!\d) stops a longer digit run being cut short, so "15sep2023" is read
    # as the year 2023 and not as the two-digit year "20".
    pattern = r'-(\d{1,2})(' + '|'.join(month_numbers) + r')(\d{4}|\d{2})(?!\d)'
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    day_text, month_text, year_text = match.groups()
    day = int(day_text)
    month = month_numbers[month_text.lower()]
    year = int(year_text)
    if len(year_text) == 2:
        year += 2000

    # Let datetime reject impossible dates such as 31 September.
    try:
        datetime(year, month, day)
    except ValueError:
        return None

    return f"{year:04d}-{month:02d}-{day:02d}"

def process_tranches_to_json():
    """Write a JSON summary next to every tranche spreadsheet in ``tranches/``.

    For each ``.xlsx`` file the first two columns are read (no header row is
    assumed): column 0 is taken as the submitter and column 1 as the
    submission filename. Rows where either cell is empty are ignored. The
    resulting ``filename -> submitter`` map is saved, together with some
    metadata, as ``<spreadsheet name>.json`` in the same directory.

    Spreadsheets whose JSON file already exists are skipped. An error on one
    spreadsheet is printed and does not stop the others. If the ``tranches``
    directory is missing the function prints a message and returns.
    """
    tranches_dir = 'tranches'

    # Nothing to convert when the download step has not been run.
    if not os.path.exists(tranches_dir):
        print("No tranches directory found!")
        return

    for filename in os.listdir(tranches_dir):
        is_spreadsheet = filename.lower().endswith('.xlsx')
        if not is_spreadsheet:
            continue

        stem = os.path.splitext(filename)[0]
        xlsx_path = os.path.join(tranches_dir, filename)
        json_path = os.path.join(tranches_dir, stem + '.json')

        # An existing JSON file means this tranche was converted earlier.
        if os.path.exists(json_path):
            print(f"Skipping existing JSON: {json_path}")
            continue

        try:
            print(f"Processing: {filename}")

            tranche_date = parse_tranche_date(filename)

            # header=None: every row, including the first, is treated as data.
            sheet = pd.read_excel(xlsx_path, header=None)

            # Maps submission filename -> submitter name.
            submissions_map = {}
            for _, record in sheet.iterrows():
                submitter = record[0]
                submission_file = record[1]
                if pd.isna(submitter) or pd.isna(submission_file):
                    continue
                key = str(submission_file).strip()
                submissions_map[key] = str(submitter).strip()

            metadata = {
                'processed_timestamp': datetime.now().isoformat(),
                'tranche_date': tranche_date,
                'submission_count': len(submissions_map)
            }
            json_data = {
                'filename': filename,
                'metadata': metadata,
                'submissions': submissions_map
            }

            with open(json_path, 'w', encoding='utf-8') as out_file:
                json.dump(json_data, out_file, ensure_ascii=False, indent=2)

            print(f"Successfully created: {json_path} with {len(submissions_map)} submissions")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Run the spreadsheet-to-JSON conversion only when this code is executed as the main program.
if __name__ == "__main__":
    process_tranches_to_json()

Processing: acma2023-tranche-8-18dec23.xlsx
Successfully created: tranches/acma2023-tranche-8-18dec23.json with 338 submissions
Processing: acma2023-tranche-7-11dec23.xlsx
Successfully created: tranches/acma2023-tranche-7-11dec23.json with 384 submissions
Processing: acma2023-tranche-6-13nov23.xlsx
Successfully created: tranches/acma2023-tranche-6-13nov23.json with 150 submissions
Processing: acma2023-tranche-9-21dec23.xlsx
Successfully created: tranches/acma2023-tranche-9-21dec23.json with 130 submissions
Processing: acma2023-tranche-2-22sep23.xlsx
Successfully created: tranches/acma2023-tranche-2-22sep23.json with 178 submissions
Processing: acma2023-tranche-5-23oct23.xlsx
Successfully created: tranches/acma2023-tranche-5-23oct23.json with 297 submissions
Processing: acma2023-tranche-4-5oct23_0.xlsx
Successfully created: tranches/acma2023-tranche-4-5oct23_0.json with 299 submissions
Processing: acma2023-tranche-1-15sep23.xlsx
Successfully created: tranches/acma2023-tranche-1-15sep23.

In [8]:
%pip install pymupdf4llm --upgrade

In [12]:
import os
from pathlib import Path
import pymupdf4llm

def process_pdfs():
    """Convert every PDF in ``submissions/`` to a Markdown file beside it.

    ``<name>.pdf`` becomes ``<name>.md`` in the same directory. PDFs that
    already have a Markdown file are skipped, and a conversion error on one
    PDF is printed without stopping the remaining files.
    """
    submissions_dir = 'submissions'

    for filename in os.listdir(submissions_dir):
        if not filename.lower().endswith('.pdf'):
            continue

        stem = os.path.splitext(filename)[0]
        pdf_path = os.path.join(submissions_dir, filename)
        md_file = os.path.join(submissions_dir, stem + '.md')

        # The Markdown file doubles as the "already converted" marker.
        if os.path.exists(md_file):
            print(f"Skipping already processed file: {filename}")
            continue

        print(f"Processing: {filename}")

        try:
            md_text = pymupdf4llm.to_markdown(pdf_path)

            # Encode before touching the file so a failure here leaves no
            # empty .md behind.
            encoded = md_text.encode()
            Path(md_file).write_bytes(encoded)
            print(f"Successfully processed {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Run the PDF-to-Markdown conversion only when this code is executed as the main program.
if __name__ == "__main__":
    process_pdfs()

In [13]:
%pip install openai

In [6]:
import os
from pathlib import Path
import json
from openai import OpenAI
import time

def analyze_submission(client, text):
    """Ask the chat model for a structured analysis of one submission.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the OpenAI
            client in this notebook).
        text: Full text of the submission, sent as the user message.

    Returns:
        The model's reply parsed from JSON, or ``None`` if the API call or the
        JSON parsing raised (the error is printed).
    """
    system_prompt = """You are analyzing submissions about the Australian Government's proposed legislation for new ACMA powers to combat misinformation and disinformation.
    For the given submission text, please analyze:
    1. Overall Position: Whether they support or oppose the legislation (or neutral/mixed)
    2. Key Arguments: Main points made in the submission
    3. Main Themes: Recurring themes or areas of focus
    4. Notable Quotes: Any particularly significant quotes that support the analysis
    5. Stakeholder Type: What type of stakeholder made this submission (e.g., tech company, civil society, academic, individual)
    
    Return your analysis in JSON format."""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    request_options = {
        "model": "gpt-4o-2024-08-06",
        "messages": conversation,
        "response_format": {"type": "json_object"},
        "temperature": 0.1,  # low temperature for more consistent analysis
        "seed": 1,
    }

    try:
        completion = client.chat.completions.create(**request_options)
        reply_text = completion.choices[0].message.content
        return json.loads(reply_text)
    except Exception as e:
        print(f"Error in OpenAI API call: {str(e)}")
        return None

def process_submissions():
    """Analyse every Markdown submission and store the result as JSON.

    Each ``submissions/<name>.md`` is sent to ``analyze_submission`` and the
    reply is written to ``submissions/<name>.json`` with the source filename,
    a timestamp and the model name. Files that already have a JSON result are
    skipped. When the analysis comes back empty or ``None`` nothing is
    written. Errors on one file are printed and the loop moves on.
    """
    client = OpenAI()

    submissions_dir = 'submissions'
    markdown_names = [
        name for name in sorted(os.listdir(submissions_dir))
        if name.lower().endswith('.md')
    ]

    for filename in markdown_names:
        stem = os.path.splitext(filename)[0]
        md_path = os.path.join(submissions_dir, filename)
        analysis_file = os.path.join(submissions_dir, stem + '.json')

        # The JSON file doubles as the "already analysed" marker.
        if os.path.exists(analysis_file):
            print(f"Skipping already analyzed file: {filename} ({analysis_file} exists)")
            continue

        print(f"Analyzing: {filename}")

        try:
            with open(md_path, 'r', encoding='utf-8') as source:
                md_text = source.read()

            analysis = analyze_submission(client, md_text)

            if analysis:
                # Record when and with which model the analysis was produced.
                provenance = {
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                    'model': 'gpt-4o-2024-08-06'
                }
                output = {
                    'filename': filename,
                    'analysis_metadata': provenance,
                    'analysis': analysis
                }

                with open(analysis_file, 'w', encoding='utf-8') as sink:
                    json.dump(output, sink, ensure_ascii=False, indent=2)
                print(f"Successfully analyzed {filename}, wrote to {analysis_file}")

            # Pause after every attempted analysis to stay under rate limits.
            time.sleep(1)

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Run the submission analysis only when this code is executed as the main program.
if __name__ == "__main__":
    process_submissions()

In [19]:
import os
import json
from pathlib import Path
import re

def natural_sort_key(s):
    """Return the integer after ``tranche-`` in ``s``, or 0 when there is none.

    Used as a sort key so that tranche 10 orders after tranche 9 rather than
    after tranche 1.
    """
    found = re.search(r'tranche-(\d+)', s)
    if not found:
        return 0
    return int(found.group(1))

def get_position_emoji(analysis, filename):
    """Map an analysis' free-text "Overall Position" to a single emoji.

    Args:
        analysis: The analysis dict (or ``None``); its ``'Overall Position'``
            value is expected to be a string written by the model.
        filename: Name used in warning messages only.

    Returns:
        ``"✅"`` for support, ``"❌"`` for oppose, ``"↔️"`` for mixed/neutral
        (including texts that both support and oppose), and ``"❓"`` when the
        position is missing, not a string, or unrecognised. A warning is
        printed for every ``"❓"`` result.
    """
    if not isinstance(analysis, dict) or 'Overall Position' not in analysis:
        print(f"⚠️ Missing Overall Position in analysis for {filename}")
        return "❓"  # Unknown

    raw_position = analysis['Overall Position']
    if not isinstance(raw_position, str):
        # The model occasionally returns null or a nested structure; do not
        # crash the whole summary over one malformed analysis.
        print(f"⚠️ Unknown position value: {raw_position} for {filename}")
        return "❓"  # Unknown
    position = raw_position.lower()

    # Explicit mixed/neutral wording wins over incidental "support"/"oppose",
    # e.g. "Mixed, with qualified support".
    if 'mixed' in position or 'neutral' in position:
        return "↔️"  # Mixed/Neutral

    # "does not support", "doesn't fully support", "unsupportive", ... must not
    # be read as support just because the word appears.
    support_negated = re.search(
        r"(?:\bnot|\bno|n't|\bnever|\bcannot)\s+(?:\w+ly\s+)?support|unsupport",
        position,
    ) is not None
    in_favour = 'support' in position and not support_negated
    against = 'oppos' in position or support_negated  # oppose/opposed/opposition

    if in_favour and against:
        return "↔️"  # Mixed/Neutral
    if against:
        return "❌"  # Oppose
    if in_favour:
        return "✅"  # Support

    print(f"⚠️ Unknown position value: {position} for {filename}")
    return "❓"  # Unknown

def get_stakeholder_emoji(analysis, filename=None):
    """Map an analysis' free-text "Stakeholder Type" to a category emoji.

    Categories are tried in a fixed order and the first one with a keyword
    contained in the (lower-cased) stakeholder text wins.

    Args:
        analysis: The analysis dict (or ``None``); its ``'Stakeholder Type'``
            value is expected to be a string written by the model.
        filename: Optional name appended to warning messages so the offending
            analysis can be located.

    Returns:
        One of ``"🏢"``, ``"🎓"``, ``"👤"``, ``"🌐"``, ``"🏛️"``, or ``"❓"`` when
        the type is missing, not a string, ``n/a``/``unknown``, or
        unrecognised.
    """
    context = f" for {filename}" if filename else ""

    if not isinstance(analysis, dict) or 'Stakeholder Type' not in analysis:
        print(f"⚠️ Missing Stakeholder Type in analysis{context}")
        return "❓"

    raw_stakeholder = analysis['Stakeholder Type']
    if not isinstance(raw_stakeholder, str):
        # The model occasionally returns null or a list; do not crash the
        # whole summary over one malformed analysis.
        print(f"⚠️ Unknown stakeholder type: {raw_stakeholder}{context}")
        return "❓"
    stakeholder = raw_stakeholder.strip().lower()

    # Explicitly "no answer" values are not worth a warning.
    if stakeholder in ('n/a', 'unknown'):
        return "❓"

    # Order matters: earlier categories take precedence when several match.
    categories = (
        # Business/Industry/Think Tank
        ("🏢", ('company', 'business', 'industry association',
                'media outlet', 'think tank')),
        # Academic/Research
        ("🎓", ('academic',)),
        # Individual
        ("👤", ('individual',)),
        # Civil Society/NGO/International/Religious
        ("🌐", ('civil society', 'ngo', 'international organization',
                'religious organization', 'political party',
                'educational institution', 'media organization',
                'industry body', 'multistakeholder organization')),
        # Government/Legal
        ("🏛️", ('government', 'legal', 'legislative')),
    )
    for emoji, keywords in categories:
        if any(keyword in stakeholder for keyword in keywords):
            return emoji

    # If we get here, it's an unhandled type
    print(f"⚠️ Unknown stakeholder type: {stakeholder}{context}")
    return "❓"

def generate_summary_markdown():
    """Write ``submissions_summary.md``: one table per tranche plus a legend.

    For every tranche JSON file in ``tranches/`` (ordered by tranche number) a
    table is written with one row per submission that has an analysis JSON in
    ``submissions/``. Submissions without an analysis file are reported on
    stdout and left out of the table, so the ``#`` column (the submission's
    position within the tranche) can have gaps.

    If the ``tranches`` directory is missing the function prints a message and
    returns without creating the summary file.
    """
    # Local import: only this function needs URL quoting.
    from urllib.parse import quote

    tranches_dir = 'tranches'
    submissions_dir = 'submissions'
    output_file = 'submissions_summary.md'

    if not os.path.isdir(tranches_dir):
        print("No tranches directory found!")
        return

    # Order by tranche number; the filename breaks ties so the result does not
    # depend on directory listing order.
    tranche_files = [f for f in os.listdir(tranches_dir) if f.endswith('.json')]
    tranche_files.sort(key=lambda name: (natural_sort_key(name), name))

    with open(output_file, 'w', encoding='utf-8') as out:
        out.write("# Submissions Summary\n\n")

        for tranche_file in tranche_files:
            with open(os.path.join(tranches_dir, tranche_file), 'r', encoding='utf-8') as f:
                tranche_data = json.load(f)

            # Write tranche header (the date is omitted when it could not be parsed)
            tranche_date = tranche_data['metadata']['tranche_date']
            date_suffix = f" ({tranche_date})" if tranche_date else ""
            out.write(f"## {os.path.splitext(tranche_file)[0]}{date_suffix}\n\n")

            # Write table header
            out.write("| # | Submitter | PDF | Analysis | Position | Type |\n")
            out.write("|---|-----------|-----|----------|----------|------|\n")

            # Write table rows
            for i, (pdf_file, submitter) in enumerate(tranche_data['submissions'].items(), 1):
                # The analysis lives next to the PDF under the same base name.
                base_name = os.path.splitext(pdf_file)[0]
                json_file = f"{base_name}.json"
                json_path = os.path.join(submissions_dir, json_file)

                if not os.path.exists(json_path):
                    print(f"⚠️ Missing analysis file: {json_file}")
                    continue

                analysis = None
                try:
                    with open(json_path, 'r', encoding='utf-8') as f:
                        analysis_data = json.load(f)
                    analysis = analysis_data.get('analysis', {})
                except Exception as e:
                    # Keep the row (with unknown markers) even if the file is unreadable.
                    print(f"⚠️ Error reading analysis file {json_file}: {str(e)}")

                # Percent-encode link targets: filenames contain spaces and
                # apostrophes, which would otherwise break the Markdown link.
                pdf_target = quote(f"{submissions_dir}/{pdf_file}")
                json_target = quote(f"{submissions_dir}/{json_file}")
                pdf_exists = os.path.exists(os.path.join(submissions_dir, pdf_file))
                pdf_link = f"[PDF]({pdf_target})" if pdf_exists else "❌"
                json_link = f"[Analysis]({json_target})"

                position_emoji = get_position_emoji(analysis, json_path)
                stakeholder_emoji = get_stakeholder_emoji(analysis)

                # Collapse whitespace and escape pipes so a submitter name
                # cannot break the table layout.
                submitter_cell = ' '.join(str(submitter).split()).replace('|', '\\|')

                out.write(f"| {i} | {submitter_cell} | {pdf_link} | {json_link} | {position_emoji} | {stakeholder_emoji} |\n")

            out.write("\n")  # Add space between tables

        # Write legend
        out.write("\n## Legend\n\n")
        out.write("### Position\n")
        out.write("- ✅ Support\n")
        out.write("- ❌ Oppose\n")
        out.write("- ↔️ Mixed/Neutral\n")
        out.write("- ❓ Unknown\n\n")
        out.write("### Stakeholder Type\n")
        out.write("- 🏢 Company/Business\n")
        out.write("- 🎓 Academic\n")
        out.write("- 👤 Individual\n")
        out.write("- 🌐 Civil Society/NGO\n")
        out.write("- 🏛️ Government\n")
        out.write("- ❓ Unknown\n")

# Generate the summary only when this code is executed as the main program.
if __name__ == "__main__":
    generate_summary_markdown()

⚠️ Missing analysis file: acma2023-17518-anonymous.json
⚠️ Missing analysis file: acma2023-18557-anonymous .json
⚠️ Missing analysis file: acma2023-21187-anthony-alaveras .json
⚠️ Missing analysis file: acma2023-21878-noah-abbey.json
⚠️ Missing analysis file: acma2023-27270-suri-ratnapala-and-adrian-ratnapala.json
⚠️ Missing analysis file: acma2023-27957-regional-development-australia-southern-inland.json
⚠️ Missing analysis file: acma2023-28523-democratic-labour-party-south-australia.json
⚠️ Missing analysis file: acma2023-33079-rob-d'ermilo.json
⚠️ Missing analysis file: acma2023-34662-feminist-legal-clinic-inc..json
⚠️ Missing analysis file: acma2023-e2410-michael-de stoop.json
⚠️ Missing analysis file: acma2023-e2420-murray-may.json
⚠️ Missing analysis file: acma2023-e2910-alexander-cornell stewart.json
⚠️ Missing analysis file: acma2023-e3675-lynda-crawford.json
⚠️ Unknown stakeholder type: multistakeholder organization (global network initiative)
⚠️ Missing analysis file: acma202