In [1]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime
from tqdm import tqdm
import os
import json
import pickle
from dotenv import load_dotenv
import logging
from typing import Dict, List, Optional

# Logging: every record goes both to a log file and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('118th_congress_extraction.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Environment: load variables via python-dotenv, then read the API key.
# The key is None when the variable is not set; the entry point checks for that.
load_dotenv()
CONGRESS_API_KEY = os.environ.get('CONGRESS_API_KEY')

# Tunables shared by the functions below.
CACHE_DIR = '118th_congress_cache'                  # directory for cached JSON responses
CHECKPOINT_FILE = '118th_congress_checkpoint.pkl'   # pickled extraction progress
RATE_LIMIT_DELAY = 0.75                             # seconds passed to time.sleep between steps
MAX_RETRIES = 3                                     # attempts per URL in fetch_with_retry

# Make sure the cache directory exists before anything tries to write to it.
os.makedirs(CACHE_DIR, exist_ok=True)

def save_checkpoint(state: Dict):
    """Persist extraction progress to ``CHECKPOINT_FILE``.

    The state is pickled to a temporary sibling file first and then moved
    over the real checkpoint with ``os.replace``.  Writing in place would
    leave a truncated pickle behind if the process is interrupted mid-dump,
    and that truncated file would then break the next resume.

    Args:
        state: Progress dictionary; must contain ``'total_processed'``,
            which is reported in the log line.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(state, f)
    # Swap the finished file into place so readers never see a partial write.
    os.replace(tmp_path, CHECKPOINT_FILE)
    logging.info(f"Checkpoint saved: {state['total_processed']} bills processed")

def load_checkpoint() -> Optional[Dict]:
    """Load previously saved progress from ``CHECKPOINT_FILE``.

    Returns:
        The unpickled state dictionary, or ``None`` when no checkpoint file
        exists or the file cannot be read back as a dictionary.  An
        unreadable checkpoint is logged and ignored instead of aborting the
        run.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return None

    # NOTE: pickle.load can execute arbitrary code; only ever point this at a
    # checkpoint file written by save_checkpoint on a trusted machine.
    try:
        with open(CHECKPOINT_FILE, 'rb') as f:
            state = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError) as e:
        # These are the failures the pickle docs list for damaged input.
        logging.warning(f"Ignoring unreadable checkpoint {CHECKPOINT_FILE}: {e}")
        return None

    if not isinstance(state, dict):
        logging.warning(f"Ignoring checkpoint {CHECKPOINT_FILE}: unexpected content")
        return None

    logging.info(f"Resuming from checkpoint: {state.get('total_processed', 0)} bills already processed")
    return state

def _redact_api_key(text: str) -> str:
    """Mask the value of every ``api_key=`` query parameter found in *text*."""
    import re  # local import: `re` is not part of the file's import block

    return re.sub(r'(api_key=)[^&\s]+', r'\1***', text)


def fetch_with_retry(url: str, use_cache: bool = True) -> Optional[Dict]:
    """Fetch a JSON document over HTTP with retries and an optional disk cache.

    Args:
        url: Full request URL (including any query string).
        use_cache: When true, a cached copy under ``CACHE_DIR`` is returned
            if one can be read, and successful responses are written there.

    Returns:
        The decoded JSON payload, or ``None`` once ``MAX_RETRIES`` attempts
        have failed.  This function does not raise for network or HTTP
        errors; they are logged instead.
    """
    # The URLs built in this file carry the API key, so never log them raw.
    safe_url = _redact_api_key(url)

    cache_file = None
    if use_cache:
        # NOTE(review): the key is derived from the full URL, so the API key
        # ends up in cache file names.  The format is kept unchanged so that
        # existing cache directories stay valid.
        cache_key = url.replace('/', '_').replace(':', '').replace('?', '_').replace('&', '_')
        cache_file = os.path.join(CACHE_DIR, f"{cache_key}.json")

        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r') as f:
                    return json.load(f)
            except (OSError, ValueError):
                # Unreadable or corrupted entry (JSONDecodeError is a
                # ValueError): fall through and fetch a fresh copy.
                logging.warning(f"Ignoring unreadable cache entry for {safe_url}")

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                data = response.json()
                if cache_file is not None:
                    # A cache failure must not throw away a good response.
                    try:
                        with open(cache_file, 'w') as f:
                            json.dump(data, f)
                    except OSError as cache_error:
                        logging.warning(
                            f"Could not write cache entry for {safe_url}: "
                            f"{_redact_api_key(str(cache_error))}"
                        )
                return data
            elif response.status_code == 429:
                # Retry-After may also be an HTTP date; fall back to 60s then.
                try:
                    wait_time = max(0, int(response.headers.get('Retry-After', 60)))
                except ValueError:
                    wait_time = 60
                logging.warning(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                logging.warning(f"HTTP {response.status_code} for {safe_url}")
        except Exception as e:
            # Deliberately broad: this is a best-effort retry boundary.  The
            # message is redacted because request errors often embed the URL.
            logging.error(f"Error on attempt {attempt + 1}: {_redact_api_key(str(e))}")

        # Linear back-off between attempts (5s, 10s, ...), none after the last.
        if attempt < MAX_RETRIES - 1:
            time.sleep(5 * (attempt + 1))

    return None

def get_all_118th_congress_bills(bill_type: str = 'hr') -> List[Dict]:
    """Get ALL bills of a specific type from the 118th Congress.

    Pages through the bill list endpoint ``limit`` entries at a time until
    the reported total is reached, an empty page comes back, or a page
    cannot be fetched.

    Args:
        bill_type: Bill type path segment used in the URL (e.g. ``'hr'``).

    Returns:
        The list entries collected from the ``'bills'`` field of each page.
        If a page fetch fails the list is returned as collected so far and
        an error is logged, so a partial result is visible in the log.
    """
    congress = 118
    all_bills = []
    offset = 0
    limit = 250

    logging.info(f"Fetching all {bill_type.upper()} bills from 118th Congress")

    while True:
        url = f"https://api.congress.gov/v3/bill/{congress}/{bill_type}?api_key={CONGRESS_API_KEY}&limit={limit}&offset={offset}"

        data = fetch_with_retry(url)
        if not data:
            # Previously this ended pagination silently, which made a
            # truncated bill list indistinguishable from a complete one.
            logging.error(
                f"Failed to fetch {bill_type.upper()} bills at offset {offset}; "
                f"returning {len(all_bills)} bills (list may be incomplete)"
            )
            break

        bills = data.get('bills', [])
        if not bills:
            break

        all_bills.extend(bills)

        # Check pagination
        pagination = data.get('pagination', {})
        total_count = pagination.get('count', 0)

        logging.info(f"Fetched {len(all_bills)} of {total_count} {bill_type.upper()} bills")

        # Stop once the next offset would start past the reported total.
        if offset + limit >= total_count:
            break

        offset += limit
        time.sleep(RATE_LIMIT_DELAY)

    return all_bills

def _as_text(value) -> str:
    """Return *value* unchanged when it is a string, otherwise ``''``."""
    return value if isinstance(value, str) else ''


def _party_counts(members: List) -> Dict[str, int]:
    """Count 'D', 'R' and all remaining entries in a sponsor/cosponsor list."""
    parties = [m.get('party', '') for m in members if isinstance(m, dict)]
    dem = parties.count('D')
    rep = parties.count('R')
    return {'dem': dem, 'rep': rep, 'ind': len(members) - dem - rep}


def _joined_names(items: List, limit: int) -> str:
    """Join the ``'name'`` of the first *limit* dict entries with ``'; '``."""
    names = [_as_text(item.get('name')) for item in items if isinstance(item, dict)]
    return '; '.join(names[:limit])


def _legislative_subject_items(subjects_data) -> List:
    """Return the legislative subject entries from a subjects payload.

    Accepts both ``{'legislativeSubjects': [...]}`` and the nested
    ``{'legislativeSubjects': {'item': [...]}}`` layout.
    """
    if not isinstance(subjects_data, dict):
        return []
    legislative = subjects_data.get('legislativeSubjects', [])
    if isinstance(legislative, dict):
        legislative = legislative.get('item', [])
    return legislative if isinstance(legislative, list) else []


def _classify_outcome(latest_action_text: str) -> int:
    """Map latest-action text to 1 (passed), 0 (failed) or -1 (pending)."""
    text = latest_action_text.lower()
    if any(term in text for term in ('became public law', 'signed by president', 'enacted')):
        return 1
    if any(term in text for term in ('failed', 'rejected', 'vetoed')):
        return 0
    return -1


def extract_bill_features_detailed(bill_info: Dict) -> Optional[Dict]:
    """Extract a flat feature dictionary from detailed bill data.

    Args:
        bill_info: Dictionary with a ``'bill'`` entry and, optionally,
            ``'cosponsors'``, ``'committees'``, ``'subjects'`` and
            ``'actions'`` entries.

    Returns:
        A dictionary of features (identifiers, sponsor and cosponsor party
        counts, bipartisan metrics, committees, latest action, subjects,
        title statistics and a ``'passed'`` label of 1, 0 or -1), or
        ``None`` when ``bill_info`` is empty, has no ``'bill'`` entry, or an
        unexpected error occurs (which is logged).

    Missing or null ``title`` and latest-action ``text`` values are treated
    as empty strings so that one absent field does not discard the bill.
    Key insertion order is kept stable because it determines column order
    when the results are turned into a DataFrame.
    """
    try:
        if not bill_info or not bill_info.get('bill'):
            return None

        bill = bill_info['bill']

        features = {
            'bill_id': f"{bill.get('congress')}-{bill.get('type')}-{bill.get('number')}",
            'congress': bill.get('congress'),
            'bill_type': bill.get('type'),
            'bill_number': bill.get('number'),
            'title': _as_text(bill.get('title')),
            'introduced_date': bill.get('introducedDate'),
            'url': bill.get('url', ''),
        }

        # Policy area
        policy_area = bill.get('policyArea')
        features['policy_area'] = policy_area.get('name', 'Unknown') if isinstance(policy_area, dict) else 'Unknown'

        # Sponsor information (the first listed sponsor is the main one)
        sponsors = bill.get('sponsors', [])
        if not isinstance(sponsors, list):
            sponsors = []
        main_sponsor = sponsors[0] if sponsors and isinstance(sponsors[0], dict) else {}
        features['sponsor_name'] = main_sponsor.get('fullName', '')
        features['sponsor_party'] = main_sponsor.get('party', 'Unknown')
        features['sponsor_state'] = main_sponsor.get('state', '')
        features['sponsor_bioguide_id'] = main_sponsor.get('bioguideId', '')
        features['sponsor_count'] = len(sponsors)

        # Party breakdown of sponsors
        sponsor_parties = _party_counts(sponsors)
        features['dem_sponsors'] = sponsor_parties['dem']
        features['rep_sponsors'] = sponsor_parties['rep']
        features['ind_sponsors'] = sponsor_parties['ind']

        # Cosponsors
        cosponsors = bill_info.get('cosponsors', [])
        if not isinstance(cosponsors, list):
            cosponsors = []
        features['cosponsor_count'] = len(cosponsors)
        cosponsor_parties = _party_counts(cosponsors)
        features['dem_cosponsors'] = cosponsor_parties['dem']
        features['rep_cosponsors'] = cosponsor_parties['rep']
        features['ind_cosponsors'] = cosponsor_parties['ind']
        features['original_cosponsor_count'] = sum(
            1 for c in cosponsors
            if isinstance(c, dict) and c.get('isOriginalCosponsor', False)
        )

        # Bipartisan metrics
        total_sponsors = features['sponsor_count'] + features['cosponsor_count']
        total_dem = features['dem_sponsors'] + features['dem_cosponsors']
        total_rep = features['rep_sponsors'] + features['rep_cosponsors']

        features['is_bipartisan'] = int(total_dem > 0 and total_rep > 0)
        # Twice the minority-party share: 0 for one-party, 1 for an even split.
        features['bipartisan_ratio'] = min(total_dem, total_rep) / total_sponsors * 2 if total_sponsors > 0 else 0

        # Committees (the payload nests the list under its own 'committees' key)
        committees_data = bill_info.get('committees', {})
        committee_items = committees_data.get('committees', []) if isinstance(committees_data, dict) else []
        if not isinstance(committee_items, list):
            committee_items = []
        features['committee_count'] = len(committee_items)
        features['committees'] = _joined_names(committee_items, 5)

        # Latest action
        latest_action = bill.get('latestAction', {})
        if isinstance(latest_action, dict):
            features['latest_action'] = _as_text(latest_action.get('text'))
            features['latest_action_date'] = latest_action.get('actionDate', '')
        else:
            features['latest_action'] = ''
            features['latest_action_date'] = ''

        # Actions
        actions = bill_info.get('actions', [])
        features['action_count'] = len(actions) if isinstance(actions, list) else 0

        # Subjects
        subject_items = _legislative_subject_items(bill_info.get('subjects', {}))
        features['subject_count'] = len(subject_items)
        features['subjects'] = _joined_names(subject_items, 10)

        # Title features
        features['title_length'] = len(features['title'])
        features['title_word_count'] = len(features['title'].split())

        # Determine outcome from the latest action text
        features['passed'] = _classify_outcome(features['latest_action'])

        return features

    except Exception as e:
        logging.error(f"Error extracting features: {str(e)}")
        return None

def fetch_bill_details_comprehensive(congress: int, bill_type: str, bill_number: str) -> Optional[Dict]:
    """Collect the main record and related sub-resources for one bill.

    The main bill document is requested first; if that fails, ``None`` is
    returned.  Cosponsors, committees, subjects and actions are then
    requested in that order, and each one is added to the result only when
    its request returned data.

    Returns:
        A dictionary with a ``'bill'`` entry plus whichever of
        ``'cosponsors'``, ``'committees'``, ``'subjects'`` and ``'actions'``
        could be fetched, or ``None`` if the main request failed.
    """
    root = f"https://api.congress.gov/v3/bill/{congress}/{bill_type}/{bill_number}"
    auth = f"api_key={CONGRESS_API_KEY}"

    main_payload = fetch_with_retry(f"{root}?{auth}")
    if not main_payload:
        return None

    details = {'bill': main_payload.get('bill', {})}

    # (result key, URL suffix, how to pull the value out of the payload)
    sub_resources = (
        # cosponsors: at most 250 entries are requested
        ('cosponsors', f"/cosponsors?{auth}&limit=250", lambda payload: payload.get('cosponsors', [])),
        # committees: the whole payload is stored as-is
        ('committees', f"/committees?{auth}", lambda payload: payload),
        ('subjects', f"/subjects?{auth}", lambda payload: payload.get('subjects', {})),
        # actions: at most 50 entries are requested
        ('actions', f"/actions?{auth}&limit=50", lambda payload: payload.get('actions', [])),
    )

    for key, suffix, pick in sub_resources:
        payload = fetch_with_retry(f"{root}{suffix}")
        if payload:
            details[key] = pick(payload)

    return details

def extract_118th_congress() -> pd.DataFrame:
    """Extract all bills from the 118th Congress into a DataFrame and a CSV.

    Resumes from ``CHECKPOINT_FILE`` when one can be loaded.  For every bill
    type the bill list is fetched, details are fetched for each bill not yet
    processed, and features are extracted.  Progress is checkpointed each
    time the number of processed bills reaches a multiple of 50.  At the end
    the features are written to ``bills_118th_congress_full.csv``, a summary
    is printed and the checkpoint file is removed.

    Returns:
        The DataFrame of extracted features, or an empty DataFrame when no
        features were collected.
    """
    # Check for checkpoint
    print("Loading checkpoint")
    checkpoint = load_checkpoint()
    if checkpoint:
        print("Checkpoint exists")
        all_features = checkpoint['features']
        processed_bills = set(checkpoint['processed_bills'])
        current_type_idx = checkpoint['current_type_idx']
    else:
        print("Checkpoint does NOT exist")
        all_features = []
        processed_bills = set()
        current_type_idx = 0

    bill_types = ['hr', 's', 'hjres', 'sjres']
    congress = 118

    # Process each bill type
    for type_idx, bill_type in enumerate(bill_types):
        # Types before the checkpointed one were completed in an earlier run.
        if type_idx < current_type_idx:
            continue

        # Get all bills of this type
        bills = get_all_118th_congress_bills(bill_type)
        logging.info(f"Processing {len(bills)} {bill_type.upper()} bills")

        # Process each bill
        for bill in tqdm(bills, desc=f"118th Congress {bill_type.upper()}"):
            bill_number = bill.get('number', '')
            bill_key = f"{congress}-{bill_type}-{bill_number}"

            if bill_key in processed_bills:
                continue

            try:
                # Fetch detailed info
                detailed_info = fetch_bill_details_comprehensive(congress, bill_type, bill_number)
                if detailed_info:
                    features = extract_bill_features_detailed(detailed_info)
                    if features:
                        all_features.append(features)
                        processed_bills.add(bill_key)

                        # Save checkpoint every 50 processed bills.  This
                        # lives in the success branch so that failed bills
                        # do not re-save an unchanged checkpoint (the count
                        # would otherwise stay on a multiple of 50, or 0).
                        if len(processed_bills) % 50 == 0:
                            save_checkpoint({
                                'features': all_features,
                                'processed_bills': list(processed_bills),
                                'total_processed': len(processed_bills),
                                'current_type_idx': type_idx
                            })

                time.sleep(RATE_LIMIT_DELAY)

            except Exception as e:
                logging.error(f"Error processing {bill_key}: {str(e)}")
                continue

    # Create final dataset
    if all_features:
        df = pd.DataFrame(all_features)

        # Add time-based features
        if 'introduced_date' in df.columns:
            df['introduced_date'] = pd.to_datetime(df['introduced_date'], errors='coerce')
            df['days_since_introduction'] = (datetime.now() - df['introduced_date']).dt.days
            df['month_introduced'] = df['introduced_date'].dt.month
            df['quarter_introduced'] = df['introduced_date'].dt.quarter
            df['year_introduced'] = df['introduced_date'].dt.year
            df['is_election_year'] = (df['year_introduced'] % 4 == 0).astype(int)

        # Save full dataset
        df.to_csv('bills_118th_congress_full.csv', index=False)
        logging.info(f"Saved {len(df)} bills to bills_118th_congress_full.csv")

        # Bills with a known outcome: 'passed' is -1 while a bill is pending.
        # (This frame was referenced below but never defined, which raised a
        # NameError right after the CSV had been written.)
        training_df = df[df['passed'] != -1]

        # Summary statistics
        print("\n" + "="*60)
        print("118TH CONGRESS EXTRACTION COMPLETE")
        print("="*60)
        print(f"Total bills extracted: {len(df)}")
        print(f"Bills with known outcomes: {len(training_df)}")
        if len(training_df) > 0:
            print(f"Pass rate: {(training_df['passed'] == 1).mean()*100:.1f}%")
            print("\nBills by type:")
            print(df['bill_type'].value_counts())
            print("\nSponsor party distribution:")
            print(df['sponsor_party'].value_counts())
            print("\nOutcome distribution:")
            print(training_df['passed'].value_counts())

        # Clean up
        if os.path.exists(CHECKPOINT_FILE):
            os.remove(CHECKPOINT_FILE)

        return df

    return pd.DataFrame()

# Script entry point: run the full extraction when an API key is configured.
if __name__ == "__main__":
    if CONGRESS_API_KEY:
        # Start-up banner, one line per entry.
        print("\n".join((
            "Extracting ALL bills from 118th Congress (2023-2024)",
            "This includes HR, S, HJRES, and SJRES bills",
            "Estimated time: 2-4 hours",
            "-" * 60,
        )))

        start_time = time.time()
        df = extract_118th_congress()

        # Report wall-clock duration in hours.
        elapsed = time.time() - start_time
        print(f"\nCompleted in {elapsed / 3600:.1f} hours")

        if not df.empty:
            print("\nFiles created:")
            print("- bills_118th_congress_full.csv")
    else:
        print("ERROR: CONGRESS_API_KEY not found!")

2025-07-25 23:40:09,264 - INFO - Fetching all S bills from 118th Congress
2025-07-25 23:40:09,272 - INFO - Fetched 250 of 5649 S bills


Extracting ALL bills from 118th Congress (2023-2024)
This includes HR, S, HJRES, and SJRES bills
Estimated time: 2-4 hours
------------------------------------------------------------
Loading checkpoint
Checkpoint does NOT exist


2025-07-25 23:40:10,027 - INFO - Fetched 500 of 5649 S bills
2025-07-25 23:40:10,784 - INFO - Fetched 750 of 5649 S bills
2025-07-25 23:40:11,541 - INFO - Fetched 1000 of 5649 S bills
2025-07-25 23:40:12,300 - INFO - Fetched 1250 of 5649 S bills
2025-07-25 23:40:13,056 - INFO - Fetched 1500 of 5649 S bills
2025-07-25 23:40:13,813 - INFO - Fetched 1750 of 5649 S bills
2025-07-25 23:40:14,566 - INFO - Fetched 2000 of 5649 S bills
2025-07-25 23:40:15,323 - INFO - Fetched 2250 of 5649 S bills
2025-07-25 23:40:16,090 - INFO - Fetched 2500 of 5649 S bills
2025-07-25 23:40:16,849 - INFO - Fetched 2750 of 5649 S bills
2025-07-25 23:40:17,608 - INFO - Fetched 3000 of 5649 S bills
2025-07-25 23:40:18,374 - INFO - Fetched 3250 of 5649 S bills
2025-07-25 23:40:19,145 - INFO - Fetched 3500 of 5649 S bills
2025-07-25 23:40:19,917 - INFO - Fetched 3750 of 5649 S bills
2025-07-25 23:40:20,687 - INFO - Fetched 4000 of 5649 S bills
2025-07-25 23:40:21,458 - INFO - Fetched 4250 of 5649 S bills
2025-07-25


118TH CONGRESS EXTRACTION COMPLETE
Total bills extracted: 6001
Bills with known outcomes: 112
Pass rate: 85.7%

Bills by type:
bill_type
S        5649
HJRES     230
SJRES     122
Name: count, dtype: int64

Sponsor party distribution:
sponsor_party
D    3175
R    2699
I     127
Name: count, dtype: int64

Outcome distribution:
passed
1    96
0    16
Name: count, dtype: int64

Completed in 8.0 hours

Files created:
- ../data/bills_118th_congress_full.csv
- ../data/bills_118th_congress_training.csv
