PURPOSE: Complete missing metadata for bills without issue tags or governance

WHAT THIS DOES:
- Identifies bills missing issue tags or governance (~126 bills)
- Uses an LLM to assign issue tags (1-3 per bill)
- Uses an LLM to assign one governance category per bill
- Validates against existing human-tagged bills
- Merges with complete bills to create full dataset

OUTPUT: bill_data_completed.csv with all 146 bills fully tagged

In [2]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import yaml
from anthropic import Anthropic
from collections import Counter

# Notebook-wide plotting defaults: seaborn "whitegrid" style with the
# "colorblind" palette, plus a larger default figure size and font.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

print("✓ Libraries loaded")

✓ Libraries loaded


In [3]:
# Load the Anthropic API key from the project-level config file.
# Any failure prints a short hint and re-raises, so later cells never run
# without a usable `api_key`.
try:
    with open('../config.yml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # An empty file parses to None and a bare scalar/list parses to a non-dict;
    # both would otherwise fail on .get() with an unhelpful AttributeError.
    if not isinstance(config, dict):
        raise ValueError("config.yml must be a YAML mapping containing ANTHROPIC_API_KEY")

    api_key = config.get('ANTHROPIC_API_KEY')
    if isinstance(api_key, str):
        # Whitespace-only or padded values are treated as missing/trimmed.
        api_key = api_key.strip()
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY not found in config.yml")

    print("✓ API key loaded from config.yml")

except FileNotFoundError:
    print("✗ config.yml not found in parent directory")
    print("  Create ../config.yml with:")
    print("  ANTHROPIC_API_KEY: sk-ant-your-key-here")
    raise
except Exception as e:
    print(f"✗ Error loading config: {e}")
    raise

✓ API key loaded from config.yml


In [4]:
# Read the US-state child-safety sheet from the policy tracker workbook.
df_bills = pl.read_excel(
    "../data/policy_tracker.xlsx",
    sheet_name="4. US State - Current Child",
)

total_bills = len(df_bills)
print(f"✓ Loaded {total_bills:,} bills")

# Coverage of the two metadata fields this notebook is meant to complete.
tag_present = pl.col('Issue Tag').is_not_null()
governance_present = pl.col('Governance').is_not_null()

has_tags = df_bills.filter(tag_present)
has_governance = df_bills.filter(governance_present)
fully_complete = df_bills.filter(tag_present & governance_present)

for label, subset in (("issue tags", has_tags), ("governance", has_governance)):
    print(f"  Bills with {label}: {len(subset)} ({len(subset)/total_bills*100:.1f}%)")
print(f"  Bills fully complete (both tags + governance): {len(fully_complete)}")

Could not determine dtype for column 14, falling back to string


✓ Loaded 146 bills
  Bills with issue tags: 20 (13.7%)
  Bills with governance: 112 (76.7%)
  Bills fully complete (both tags + governance): 20


In [6]:
# Controlled vocabulary of issue tags offered to the model, one per line,
# kept in alphabetical order.
VALID_ISSUE_TAGS = [
    "Accessibility",
    "Algorithms",
    "Antitrust",
    "Artificial intelligence",
    "Authorisation, registration and licensing",
    "Children",
    "Civil rights",
    "Competition",
    "Content moderation/Free Speech",
    "Cybersecurity",
    "Democracy",
    "Design and testing standards",
    "Discrimination",
    "Disinformation",
    "Education and research",
    "Employment",
    "Extremism",
    "Foreign direct investment",
    "Free speech",
    "Hate and harassment",
    "Human rights",
    "Instrument unspecified",
    "Intellectual Property",
    "International trade",
    "Internet access",
    "Labour law",
    "Liability",
    "National Security",
    "News and journalism",
    "Online safety",
    "Other operating conditions",
    "Polarization",
    "Privacy",
    "Public health",
    "Public procurement",
    "Subsidies and industrial policy",
    "Taxation",
    "Transparency",
]

tag_count = len(VALID_ISSUE_TAGS)
print(f"✓ Loaded {tag_count} valid issue tags")

✓ Loaded 38 valid issue tags


In [8]:
# Prompt template sent once per incomplete bill. Single-brace placeholders
# ({state}, {name}, {description}, {tag1}..{tag3}, {governance},
# {issue_tag_list}) are filled with str.format(); the doubled braces around
# the JSON example become literal braces after formatting.
# NOTE(review): {issue_tag_list} is built from every entry in
# VALID_ISSUE_TAGS, including "Children", which the constraints below tell the
# model not to select — confirm that offering it is intentional.
DATA_COMPLETION_PROMPT = """You are analyzing US state child online safety legislation to complete missing metadata tags.

**Bill Information:**
State: {state}
Name: {name}
Description: {description}

**Existing Tags:**
Issue Tag 1: {tag1}
Issue Tag 2: {tag2}
Issue Tag 3: {tag3}
Governance: {governance}

---

## TASK: Complete Missing Tags

### Issue Tags
Select 1-3 tags from this list that best describe the bill's PRIMARY focus areas:

{issue_tag_list}

**IMPORTANT CONSTRAINTS:**
- DO NOT select "Children" - this is a child safety bill dataset, so that tag is redundant
- Prioritize SPECIFICITY over generality
- Choose tags that distinguish THIS bill from other child safety bills
- If the bill focuses on online safety broadly, use more specific tags like "Content moderation/Free Speech", "Privacy", "Cybersecurity", etc.

### Governance Category
Select ONE category that best describes the bill's governance approach:

- Content Moderation/Free Speech: Bills regulating harmful content, age verification for adult content
- Privacy/Data Rights: Bills restricting data collection/use from minors
- Platforms + Democracy: Bills requiring platform design changes, impact assessments
- Government Surveillance: Bills creating task forces, studies, or government monitoring programs

If none fit well, leave as null.

---

## OUTPUT FORMAT

Respond with ONLY valid JSON (no markdown, no explanation):

{{
  "issue_tags": ["tag1", "tag2"],
  "governance": "category name or null",
  "reasoning": "Brief explanation of tag choices"
}}
"""

# Render the allowed tags as a bullet list, one "- <tag>" entry per line,
# ready to be substituted into the template above.
issue_tag_list = "\n".join([f"- {tag}" for tag in VALID_ISSUE_TAGS])
print("✓ Prompt template ready")

✓ Prompt template ready


In [9]:
# A bill needs completion unless it already carries BOTH an issue tag and a
# governance value.
is_complete = pl.col('Issue Tag').is_not_null() & pl.col('Governance').is_not_null()
bills_needing_completion = df_bills.filter(~is_complete)

pending_count = len(bills_needing_completion)
print(f"\nBills needing completion: {pending_count}")
print(f"Bills already complete: {len(df_bills) - pending_count}")


Bills needing completion: 126
Bills already complete: 20


In [10]:
print("\n=== RUNNING DATA COMPLETION ===")
print(f"Processing {len(bills_needing_completion)} incomplete bills\n")


def _extract_json_object(raw_text):
    """Parse the JSON object embedded in a model reply and return it as a dict.

    Only the span from the first '{' to the last '}' is parsed, so replies
    wrapped in Markdown code fences or surrounded by stray prose still work.

    Raises:
        ValueError: if the reply contains no '{...}' span, or the span is not
            valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    text = raw_text.strip()
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        raise ValueError("No JSON object found in model response")
    return json.loads(text[start:end + 1])


def _normalize_completion(completion):
    """Clean the model's choices so downstream cells can trust their shape.

    Args:
        completion: dict parsed from the model reply.

    Returns:
        (tags, governance, rejected) where
        - tags is a list of at most three entries from VALID_ISSUE_TAGS
          (matched case-insensitively, original order, no duplicates, and
          never "Children", which the prompt forbids);
        - governance is the returned string, or None when the model answered
          null / "null" / "none" / an empty string;
        - rejected lists every raw tag that was dropped.
    """
    raw_tags = completion.get('issue_tags') or []
    if isinstance(raw_tags, str):
        # A single tag returned as a bare string instead of a list.
        raw_tags = [raw_tags]

    canonical = {tag.lower(): tag for tag in VALID_ISSUE_TAGS}
    tags, rejected = [], []
    for raw in raw_tags:
        tag = canonical.get(str(raw).strip().lower())
        if tag is None or tag == 'Children' or tag in tags:
            rejected.append(raw)
        else:
            tags.append(tag)
    # The prompt asks for 1-3 tags; keep the first three and report the rest.
    rejected.extend(tags[3:])
    tags = tags[:3]

    governance = completion.get('governance')
    if isinstance(governance, str):
        governance = governance.strip()
        if governance.lower() in ('', 'null', 'none'):
            governance = None

    return tags, governance, rejected


client = Anthropic(api_key=api_key)

data_completion_results = []
completion_errors = []
total_to_complete = len(bills_needing_completion)

for i, bill in enumerate(bills_needing_completion.iter_rows(named=True), 1):
    # Null-safe labels: this print runs outside the try block, so a missing
    # state or name must not abort the whole run.
    state_label = str(bill['US State'] or '')
    name_label = str(bill['Name'] or '')
    print(f"[{i:3}/{total_to_complete}] {state_label:15} {name_label[:45]}...")

    # Create data completion prompt
    prompt = DATA_COMPLETION_PROMPT.format(
        state=bill['US State'],
        name=bill['Name'],
        description=bill['Description'],
        tag1=bill.get('Issue Tag') or 'None',
        tag2=bill.get('Issue Tag_1') or 'None',
        tag3=bill.get('Issue Tag_2') or 'None',
        governance=bill.get('Governance') or 'None',
        issue_tag_list=issue_tag_list
    )

    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=800,
            temperature=0.0,
            messages=[{"role": "user", "content": prompt}]
        )

        completion = _extract_json_object(message.content[0].text)
        llm_tags, llm_governance, rejected_tags = _normalize_completion(completion)

        # Store results with metadata. Everything that can raise has already
        # run, so a bill is recorded either as a result or as an error.
        data_completion_results.append({
            'state': bill['US State'],
            'name': bill['Name'],
            'description': bill['Description'],
            'existing_tag_1': bill.get('Issue Tag'),
            'existing_tag_2': bill.get('Issue Tag_1'),
            'existing_tag_3': bill.get('Issue Tag_2'),
            'existing_governance': bill.get('Governance'),
            'llm_tags': llm_tags,
            'llm_governance': llm_governance,
            'reasoning': completion.get('reasoning', ''),
            'rejected_tags': rejected_tags
        })

        tags_str = ', '.join(llm_tags)
        gov_str = llm_governance if llm_governance is not None else 'null'
        print(f"     ✓ Tags: {tags_str[:50]}")
        print(f"     ✓ Gov:  {gov_str}")
        if rejected_tags:
            print(f"     ⚠ Dropped tags: {rejected_tags}")

    except Exception as e:
        print(f"     ✗ Error: {e}")
        completion_errors.append({
            'bill': f"{bill['US State']} - {bill['Name']}",
            'error': str(e)
        })

print(f"\n✓ Data completion: {len(data_completion_results)}/{len(bills_needing_completion)} bills")
print(f"  Errors: {len(completion_errors)}")

# Save results
with open('../data/data_completion_results.json', 'w') as f:
    json.dump(data_completion_results, f, indent=2)
print("✓ Saved: ../data/data_completion_results.json")


=== RUNNING DATA COMPLETION ===
Processing 126 incomplete bills

[  1/126] Georgia         Student Technology Protection Act...
     ✓ Tags: Education and research, Cybersecurity, Online safe
     ✓ Gov:  Platforms + Democracy
[  2/126] Idaho           PROTECTION OF MINORS – Adds to existing law t...
     ✓ Tags: Content moderation/Free Speech, Design and testing
     ✓ Gov:  Content moderation/Free Speech
[  3/126] Idaho           PROTECTION OF MINORS – Adds to existing law t...
     ✓ Tags: Content moderation/Free Speech, Design and testing
     ✓ Gov:  Content moderation/Free Speech
[  4/126] Illinois        Creates the Children's Privacy Protection and...
     ✓ Tags: Privacy, Online safety, Design and testing standar
     ✓ Gov:  Platforms + Democracy
[  5/126] Illinois        Creates the Minor Online Data Privacy Act....
     ✓ Tags: Privacy, Online safety
     ✓ Gov:  Privacy/Data Rights
[  6/126] Indiana         Internet safety curricula for schools...
     ✓ Tags: Education a

In [11]:
print("\n\n=== CREATING COMPLETED DATASET ===\n")

# Create lookup for LLM results
llm_lookup = {
    (r['state'], r['name']): r
    for r in data_completion_results
}

# Bills sharing the same (state, name) collapse onto one lookup entry; surface
# that instead of silently reusing one bill's answer for another.
key_collisions = len(data_completion_results) - len(llm_lookup)
if key_collisions:
    print(f"⚠ {key_collisions} LLM result(s) share a (state, name) key with another bill; the last result is used")

# Merge all bills (complete + newly completed).
# Human-entered values always win: the LLM only fills fields that were empty
# in the source sheet.
rows = []
for bill in df_bills.iter_rows(named=True):
    key = (bill['US State'], bill['Name'])
    existing_tags = [bill.get('Issue Tag'), bill.get('Issue Tag_1'), bill.get('Issue Tag_2')]
    existing_governance = bill.get('Governance')

    # Same completeness rule used to select bills for the LLM pass. Complete
    # bills never consult the lookup, even if their key matches another bill.
    needs_completion = existing_tags[0] is None or existing_governance is None
    llm_result = llm_lookup.get(key) if needs_completion else None

    if llm_result is not None:
        if existing_tags[0] is None:
            # Pad/truncate to exactly three slots; tolerate a null tag list.
            llm_tags = list(llm_result.get('llm_tags') or [])
            tags = (llm_tags + [None, None, None])[:3]
        else:
            tags = existing_tags

        if existing_governance is not None:
            governance = existing_governance
        else:
            governance = llm_result.get('llm_governance')
            # The model may answer with the literal text "null" instead of JSON null.
            if isinstance(governance, str) and governance.strip().lower() in ('', 'null', 'none'):
                governance = None
        source = 'LLM'
    else:
        # Either the bill was already complete, or no LLM result is available
        # for it (e.g. the API call failed); keep the sheet values as-is.
        tags = existing_tags
        governance = existing_governance
        source = 'Original'

    rows.append({
        'US State': bill['US State'],
        'Name': bill['Name'],
        'Description': bill['Description'],
        'Issue_Tag_1': tags[0],
        'Issue_Tag_2': tags[1],
        'Issue_Tag_3': tags[2],
        'Governance': governance,
        'Source': source
    })

# Scan every row when inferring dtypes so a column that is null in the first
# rows but populated later is still typed correctly.
df_completed = pl.DataFrame(rows, infer_schema_length=None)

print(f"✓ Created completed dataset: {df_completed.shape}")
print(f"  Bills from original data: {len(df_completed.filter(pl.col('Source') == 'Original'))}")
print(f"  Bills completed by LLM: {len(df_completed.filter(pl.col('Source') == 'LLM'))}")

still_incomplete = len(df_completed.filter(
    pl.col('Issue_Tag_1').is_null() | pl.col('Governance').is_null()
))
print(f"  Bills still missing a tag or governance: {still_incomplete}")

# Save
df_completed.write_csv('../data/bill_data_completed.csv')
print("✓ Saved: ../data/bill_data_completed.csv")



=== CREATING COMPLETED DATASET ===

✓ Created completed dataset: (146, 8)
  Bills from original data: 20
  Bills completed by LLM: 126
✓ Saved: ../data/bill_data_completed.csv
