In [1]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime, timedelta
from tqdm import tqdm
import os
import json
import pickle
from dotenv import load_dotenv
import logging
from typing import Dict, List, Optional, Tuple

# Load environment variables (the Congress.gov API key is read from .env / the environment)
load_dotenv()
CONGRESS_API_KEY = os.getenv('CONGRESS_API_KEY')

# Configuration
CACHE_DIR = '../data/api_cache'
CHECKPOINT_FILE = '../data/extraction_checkpoint.pkl'
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds
RATE_LIMIT_DELAY = 0.75  # seconds between API calls

# Create cache directory if it doesn't exist.
# This must happen BEFORE logging is configured: logging.FileHandler opens its
# file as soon as it is constructed, and makedirs also creates the parent
# '../data' directory that the log file lives in.
os.makedirs(CACHE_DIR, exist_ok=True)

# Set up logging (to a file under ../data and to the console)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../data/extraction_log.log'),
        logging.StreamHandler()
    ]
)

def save_checkpoint(state: Dict):
    """Persist the current extraction state so a later run can resume.

    The state is pickled to a temporary file next to CHECKPOINT_FILE and then
    moved over it with os.replace, so an interruption while writing cannot
    leave a truncated checkpoint behind: the previous checkpoint stays intact
    until the new one is complete.

    Args:
        state: Extraction state; must contain a 'bills_processed' entry, which
            is reported in the log message.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(state, f)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, CHECKPOINT_FILE)
    logging.info(f"Checkpoint saved: {state['bills_processed']} bills processed")

def load_checkpoint() -> Optional[Dict]:
    """Return the saved extraction state, or None when no checkpoint file exists."""
    if not os.path.exists(CHECKPOINT_FILE):
        return None

    # NOTE: unpickling can execute arbitrary code; only load checkpoint files
    # that were produced by this script.
    with open(CHECKPOINT_FILE, 'rb') as checkpoint_fh:
        saved_state = pickle.load(checkpoint_fh)

    already_done = saved_state['bills_processed']
    logging.info(f"Checkpoint loaded: {already_done} bills already processed")
    return saved_state

def _redact_api_key(text: str) -> str:
    """Mask the value of any ``api_key=`` query parameter found in *text*."""
    import re  # local import: `re` is not imported at the top of this file
    return re.sub(r'(api_key=)[^&#\s]*', r'\1REDACTED', text)


def _cache_path_for(url: str) -> str:
    """Map a URL to its cache file path inside CACHE_DIR."""
    cache_key = url.replace('/', '_').replace(':', '').replace('?', '_')
    return os.path.join(CACHE_DIR, f"{cache_key}.json")


def _read_cached_json(path: str):
    """Return the JSON stored at *path*, or None if it is missing or unreadable."""
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except (OSError, ValueError):
        # Missing file, unreadable file, or corrupted JSON: treat as a cache miss.
        return None


def _retry_after_seconds(header_value, default: int = 60) -> int:
    """Parse a Retry-After header given in seconds; fall back to *default*."""
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        # Header absent, or not a plain integer (e.g. an HTTP date).
        return default


def fetch_with_retry(url: str, max_retries: int = MAX_RETRIES) -> Optional[Dict]:
    """Fetch a JSON document with on-disk caching and retries.

    The cache is consulted first. Cache files are named after the URL with the
    API key masked, so the secret is not written into file names; cache files
    created under the older naming scheme (full URL) are still honoured. The
    API key is likewise masked in every log message.

    Args:
        url: Full request URL (may contain an ``api_key`` query parameter).
        max_retries: Maximum number of HTTP attempts.

    Returns:
        The decoded JSON payload, or None if every attempt failed or the
        server answered with a client error that retrying cannot fix.
    """
    safe_url = _redact_api_key(url)
    cache_file = _cache_path_for(safe_url)

    # Check cache first: new-style name, then the legacy name built from the raw URL.
    for candidate in dict.fromkeys((cache_file, _cache_path_for(url))):
        cached = _read_cached_json(candidate)
        if cached is not None:
            return cached

    # Fetch from API with retries
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)
            status = response.status_code
            if status == 200:
                data = response.json()
                # A cache write problem must not discard a successful response.
                try:
                    with open(cache_file, 'w') as f:
                        json.dump(data, f)
                except OSError as cache_err:
                    logging.warning(f"Could not cache response for {safe_url}: {cache_err}")
                return data
            elif status == 429:  # Rate limited
                wait_time = _retry_after_seconds(response.headers.get('Retry-After'))
                logging.warning(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                logging.warning(f"HTTP {status} for {safe_url}")
                if 400 <= status < 500 and status != 408:
                    # Client errors (e.g. 404 for a missing resource) will not
                    # change on retry, so skip the remaining attempts and sleeps.
                    return None

        except requests.exceptions.Timeout:
            logging.warning(f"Timeout on attempt {attempt + 1} for {safe_url}")
        except Exception as e:
            # Exception text from the HTTP layer can embed the request URL; mask the key there too.
            logging.error(f"Error on attempt {attempt + 1} for {safe_url}: {_redact_api_key(str(e))}")

        # Linear back-off between attempts (none after the final one).
        if attempt < max_retries - 1:
            time.sleep(RETRY_DELAY * (attempt + 1))

    return None

def get_all_bills_for_congress(congress: int, bill_type: str = 'hr', limit: int = 250) -> List[Dict]:
    """Collect the bill listing for one congress and bill type, page by page.

    Args:
        congress: Congress number used in the request path.
        bill_type: Bill type path segment (e.g. 'hr', 's').
        limit: Page size sent as the ``limit`` query parameter; must be positive.

    Returns:
        The concatenated 'bills' entries of every page fetched. If a page
        cannot be fetched the bills gathered so far are returned and a warning
        is logged, so callers can tell the listing is incomplete from the log.

    Raises:
        ValueError: If *limit* is not positive (the offset would never advance).
    """
    if limit <= 0:
        raise ValueError(f"limit must be positive, got {limit}")

    all_bills = []
    offset = 0

    logging.info(f"Fetching all {bill_type.upper()} bills from Congress {congress}")

    while True:
        url = f"https://api.congress.gov/v3/bill/{congress}/{bill_type}?api_key={CONGRESS_API_KEY}&limit={limit}&offset={offset}"

        data = fetch_with_retry(url)
        if not data:
            if data is None:
                # The fetch itself failed; make the truncation visible.
                logging.warning(
                    f"Could not fetch {bill_type.upper()} bills for Congress {congress} "
                    f"at offset {offset}; the listing may be incomplete"
                )
            break

        bills = data.get('bills', [])
        if not bills:
            break

        all_bills.extend(bills)
        offset += limit

        # Check if we've reached the end. Prefer the reported total; when the
        # response carries no usable count, a short page marks the last page
        # instead of stopping unconditionally after the first one.
        pagination = data.get('pagination') or {}
        total = pagination.get('count') if isinstance(pagination, dict) else None
        if isinstance(total, int):
            if offset >= total:
                break
        elif len(bills) < limit:
            break

        time.sleep(RATE_LIMIT_DELAY)

    logging.info(f"Found {len(all_bills)} {bill_type.upper()} bills in Congress {congress}")
    return all_bills

def _fetch_paginated_items(base_url: str, resource: str, page_size: int = 250) -> List[Dict]:
    """Collect every entry of a paginated bill sub-resource.

    Requests ``{base_url}/{resource}`` page by page and concatenates the list
    stored under the *resource* key of each response. Stops on a failed fetch,
    an empty page, or a page shorter than *page_size*.
    """
    collected = []
    offset = 0
    while True:
        page_url = f"{base_url}/{resource}?api_key={CONGRESS_API_KEY}&limit={page_size}&offset={offset}"
        page = fetch_with_retry(page_url)
        if not page:
            break
        batch = page.get(resource, [])
        if not batch:
            break
        collected.extend(batch)
        if len(batch) < page_size:
            break  # short page: nothing further to request
        offset += page_size
        time.sleep(RATE_LIMIT_DELAY)
    return collected


def fetch_detailed_bill_info(congress: int, bill_type: str, bill_number: str) -> Optional[Dict]:
    """Fetch a bill together with its cosponsors, subjects, actions and committees.

    Args:
        congress: Congress number used in the request path.
        bill_type: Bill type path segment (e.g. 'hr').
        bill_number: Bill number path segment.

    Returns:
        None if the main bill request fails. Otherwise a dict with keys
        'bill', 'cosponsors', 'subjects', 'actions' and 'committees'. A
        sub-resource that cannot be fetched is returned empty rather than
        failing the whole bill.
    """
    base_url = f"https://api.congress.gov/v3/bill/{congress}/{bill_type}/{bill_number}"

    # Main bill info
    bill_data = fetch_with_retry(f"{base_url}?api_key={CONGRESS_API_KEY}")
    if not bill_data:
        return None

    result = {'bill': bill_data.get('bill', {})}

    # Cosponsors (with pagination)
    result['cosponsors'] = _fetch_paginated_items(base_url, 'cosponsors')

    # Subjects. An explicit page size is requested so long subject lists are
    # not cut off at the API's (smaller) default page size.
    subjects_data = fetch_with_retry(f"{base_url}/subjects?api_key={CONGRESS_API_KEY}&limit=250")
    result['subjects'] = subjects_data.get('subjects', {}) if subjects_data else {}

    # Actions (with pagination)
    result['actions'] = _fetch_paginated_items(base_url, 'actions')

    # Committees (explicit page size for the same reason as subjects)
    committees_data = fetch_with_retry(f"{base_url}/committees?api_key={CONGRESS_API_KEY}&limit=250")
    result['committees'] = committees_data.get('committees', {}) if committees_data else {}

    return result

def _text_or_empty(value) -> str:
    """Return *value* if it is a string, otherwise '' (covers None / missing text)."""
    return value if isinstance(value, str) else ''


def _as_item_list(container) -> List:
    """Normalise a collection payload to a plain list.

    Accepts a plain list, or a dict wrapping its entries under an 'item' key
    (either a list or a single dict). Anything else yields an empty list.
    """
    if isinstance(container, dict):
        container = container.get('item', [])
        if isinstance(container, dict):
            return [container]
    return list(container) if isinstance(container, list) else []


def _count_major_parties(members: List) -> Tuple[int, int]:
    """Return (count of party 'D', count of party 'R') among the dict entries of *members*."""
    parties = [m.get('party', 'Unknown') for m in members if isinstance(m, dict)]
    return parties.count('D'), parties.count('R')


def _passage_label(latest_text: str, passed_house: int, passed_senate: int, action_count: int) -> int:
    """Derive the outcome label: 1 = passed, 0 = did not pass, -1 = unknown/pending.

    NOTE(review): a bill recorded as passing both chambers is labelled 1 before
    the 'vetoed' check is reached - confirm that this is the intended meaning
    of "passed".
    """
    if passed_house and passed_senate:
        return 1
    if any(term in latest_text for term in ('became public law', 'signed by president', 'enacted')):
        return 1
    if any(term in latest_text for term in ('failed', 'rejected', 'vetoed', 'motion to proceed not agreed')):
        return 0
    if action_count < 3 and 'introduced' in latest_text:
        return 0  # Bills with minimal action typically don't pass
    return -1  # Unknown/pending


def extract_features_safe(bill_info: Dict) -> Optional[Dict]:
    """Flatten one bill's detailed info into a feature dict.

    Args:
        bill_info: Dict with a 'bill' entry and optional 'cosponsors',
            'actions', 'subjects' and 'committees' entries, as produced by
            fetch_detailed_bill_info.

    Returns:
        A flat dict of features, or None when *bill_info* has no 'bill' data
        or an unexpected error occurs (the error is logged).

    Text fields that are present but None (title, action text, subject name)
    are treated as empty strings so that a single null value does not discard
    the whole bill. Committee data is accepted both as a plain list and as an
    {'item': ...} wrapper.
    """
    try:
        if not bill_info or not bill_info.get('bill'):
            return None

        bill = bill_info['bill']

        # Basic info
        features = {
            'bill_id': f"{bill.get('congress')}-{bill.get('type')}-{bill.get('number')}",
            'congress': bill.get('congress'),
            'bill_type': bill.get('type'),
            'bill_number': bill.get('number'),
            'title': _text_or_empty(bill.get('title')),
            'introduced_date': bill.get('introducedDate'),
            'url': bill.get('url', ''),
        }

        # Policy area
        policy_area = bill.get('policyArea')
        features['policy_area'] = policy_area.get('name', 'Unknown') if isinstance(policy_area, dict) else 'Unknown'

        # Sponsor information (the first listed sponsor is treated as the main one)
        sponsors = bill.get('sponsors', [])
        if not isinstance(sponsors, list):
            sponsors = []
        main_sponsor = sponsors[0] if sponsors and isinstance(sponsors[0], dict) else {}
        features['sponsor_name'] = main_sponsor.get('fullName', '')
        features['sponsor_party'] = main_sponsor.get('party', 'Unknown')
        features['sponsor_state'] = main_sponsor.get('state', '')
        features['sponsor_bioguide_id'] = main_sponsor.get('bioguideId', '')

        features['sponsor_count'] = len(sponsors)

        # Count sponsors by party; everything that is neither 'D' nor 'R' lands in ind_*
        dem_sponsors, rep_sponsors = _count_major_parties(sponsors)
        features['dem_sponsors'] = dem_sponsors
        features['rep_sponsors'] = rep_sponsors
        features['ind_sponsors'] = features['sponsor_count'] - dem_sponsors - rep_sponsors

        # Cosponsor information
        cosponsors = bill_info.get('cosponsors', [])
        if not isinstance(cosponsors, list):
            cosponsors = []
        features['cosponsor_count'] = len(cosponsors)

        # Count cosponsors by party
        dem_cosponsors, rep_cosponsors = _count_major_parties(cosponsors)
        features['dem_cosponsors'] = dem_cosponsors
        features['rep_cosponsors'] = rep_cosponsors
        features['ind_cosponsors'] = features['cosponsor_count'] - dem_cosponsors - rep_cosponsors

        # Original cosponsors
        features['original_cosponsor_count'] = sum(
            1 for c in cosponsors if isinstance(c, dict) and c.get('isOriginalCosponsor', False)
        )

        # Bipartisan features
        total_sponsors = features['sponsor_count'] + features['cosponsor_count']
        total_dem = dem_sponsors + dem_cosponsors
        total_rep = rep_sponsors + rep_cosponsors

        features['is_bipartisan'] = int(total_dem > 0 and total_rep > 0)
        # Twice the minority-side share: 0 for one-sided support, 1 for an even split.
        features['bipartisan_ratio'] = min(total_dem, total_rep) / total_sponsors * 2 if total_sponsors > 0 else 0

        # Committee information: combine what the bill record carries with the
        # committees endpoint data, then count distinct committee names.
        committee_items = _as_item_list(bill.get('committees', {})) + _as_item_list(bill_info.get('committees', {}))
        features['committee_count'] = len({c.get('name', '') for c in committee_items if isinstance(c, dict)})

        # Latest action
        latest_action = bill.get('latestAction', {})
        if isinstance(latest_action, dict):
            features['latest_action'] = _text_or_empty(latest_action.get('text'))
            features['latest_action_date'] = latest_action.get('actionDate', '')
        else:
            features['latest_action'] = ''
            features['latest_action_date'] = ''

        # Actions analysis
        actions = bill_info.get('actions', [])
        if not isinstance(actions, list):
            actions = []
        features['action_count'] = len(actions)

        # Analyze action types via case-insensitive substring matches on the action text
        action_texts = [_text_or_empty(a.get('text')).lower() for a in actions if isinstance(a, dict)]
        features['referred_to_committee'] = sum(1 for a in action_texts if 'referred to' in a)
        features['reported_by_committee'] = sum(1 for a in action_texts if 'reported' in a)
        features['passed_house'] = int(any('passed house' in a or 'passed the house' in a for a in action_texts))
        features['passed_senate'] = int(any('passed senate' in a or 'passed the senate' in a for a in action_texts))
        features['has_amendments'] = int(any('amendment' in a for a in action_texts))
        features['has_vote'] = int(any('vote' in a or 'yea-and-nay' in a or 'roll no' in a for a in action_texts))

        # Subjects
        subjects_data = bill_info.get('subjects', {})
        subject_names = []
        if isinstance(subjects_data, dict):
            # Try different structures (plain list or {'item': ...} wrapper)
            for key in ('legislativeSubjects', 'policyArea'):
                subject_names.extend(
                    _text_or_empty(s.get('name'))
                    for s in _as_item_list(subjects_data.get(key, {}))
                    if isinstance(s, dict)
                )

        features['subject_count'] = len(subject_names)
        features['subjects'] = '; '.join(subject_names[:10])  # First 10 subjects

        # Title analysis
        features['title_length'] = len(features['title'])
        features['title_word_count'] = len(features['title'].split())

        # Determine passage status
        features['passed'] = _passage_label(
            features['latest_action'].lower(),
            features['passed_house'],
            features['passed_senate'],
            features['action_count'],
        )

        return features

    except Exception as e:
        logging.error(f"Error extracting features: {str(e)}")
        return None

def _checkpoint_state(features: List[Dict], processed: set, total: int, congress: int, type_idx: int) -> Dict:
    """Build the dict that save_checkpoint persists and load_checkpoint returns."""
    return {
        'features': features,
        'processed_bills': list(processed),
        'bills_processed': total,
        'current_congress': congress,
        'current_type_idx': type_idx
    }


def extract_comprehensive_dataset():
    """Extract the bill dataset (113th-118th Congress) with resume capability.

    Walks every (congress, bill type) group, fetches and flattens each bill,
    checkpoints progress, then derives date/activity columns and writes the
    full and training CSV files under ../data.

    Resume behaviour:
        * A checkpoint is written each time the number of recorded bills
          reaches a multiple of 100, and whenever a group raises.
        * If any group raised, checkpoints point at the earliest failed group
          and the checkpoint file is kept at the end, so the next run retries
          that group. Bills already recorded are skipped via their key, so
          nothing is duplicated.
        * The checkpoint is removed only when every group completed.

    Returns:
        The full feature DataFrame, or an empty DataFrame if nothing was extracted.
    """
    # Load checkpoint if exists
    checkpoint = load_checkpoint()
    if checkpoint:
        all_features = checkpoint['features']
        processed_bills = set(checkpoint['processed_bills'])
        start_congress = checkpoint['current_congress']
        start_type_idx = checkpoint['current_type_idx']
    else:
        all_features = []
        processed_bills = set()
        start_congress = 113  # Start from 113th Congress (2013-2014)
        start_type_idx = 0

    # Define what to extract
    congresses = list(range(start_congress, 119))  # 113th to 118th Congress
    bill_types = ['hr', 's', 'hjres', 'sjres']  # House bills, Senate bills, Joint resolutions

    total_bills_processed = len(processed_bills)
    failed_groups = []  # (congress, type_idx) of groups that raised during this run

    def resume_point(congress, type_idx):
        # A failed group always precedes the current position, so resuming
        # from the earliest failure guarantees it is retried.
        return min(failed_groups) if failed_groups else (congress, type_idx)

    for congress in congresses:
        for type_idx, bill_type in enumerate(bill_types):
            # Skip groups completed before the checkpoint was written
            if congress == start_congress and type_idx < start_type_idx:
                continue

            try:
                # Get all bills for this congress and type
                bills = get_all_bills_for_congress(congress, bill_type)

                # Process each bill
                for bill in tqdm(bills, desc=f"Congress {congress} {bill_type.upper()}"):
                    bill_number = bill.get('number', '')
                    bill_key = f"{congress}-{bill_type}-{bill_number}"

                    # Skip if already processed
                    if bill_key in processed_bills:
                        continue

                    # Fetch detailed info
                    detailed_info = fetch_detailed_bill_info(congress, bill_type, bill_number)
                    features = extract_features_safe(detailed_info) if detailed_info else None
                    if features:
                        all_features.append(features)
                        processed_bills.add(bill_key)
                        total_bills_processed += 1

                        # Save checkpoint every 100 bills. Checked only right
                        # after the counter changes, so a run of failing bills
                        # does not re-pickle the whole state on each iteration.
                        if total_bills_processed % 100 == 0:
                            save_checkpoint(_checkpoint_state(
                                all_features, processed_bills, total_bills_processed,
                                *resume_point(congress, type_idx)
                            ))

                    # Rate limiting
                    time.sleep(RATE_LIMIT_DELAY)

            except Exception as e:
                logging.error(f"Error processing {congress} {bill_type}: {str(e)}")
                failed_groups.append((congress, type_idx))
                # Save checkpoint on error
                save_checkpoint(_checkpoint_state(
                    all_features, processed_bills, total_bills_processed,
                    *resume_point(congress, type_idx)
                ))
                continue

    # Create final dataset
    if not all_features:
        logging.error("No data extracted!")
        return pd.DataFrame()

    df = pd.DataFrame(all_features)

    # Add calculated features
    if 'introduced_date' in df.columns:
        df['introduced_date'] = pd.to_datetime(df['introduced_date'], errors='coerce')
        df['days_since_introduction'] = (datetime.now() - df['introduced_date']).dt.days
        df['month_introduced'] = df['introduced_date'].dt.month
        df['quarter_introduced'] = df['introduced_date'].dt.quarter
        df['year_introduced'] = df['introduced_date'].dt.year
        # NOTE(review): flags years divisible by 4 only - confirm this is the intended definition.
        df['is_election_year'] = (df['year_introduced'] % 4 == 0).astype(int)

    # Activity metrics. Unknown or missing ages count as 1 day; the +1 avoids
    # dividing by zero for bills introduced today.
    if 'days_since_introduction' in df.columns:
        elapsed_days = df['days_since_introduction'].fillna(1)
    else:
        elapsed_days = 1
    df['actions_per_day'] = df['action_count'] / (elapsed_days + 1)
    df['has_multiple_actions'] = (df['action_count'] > 3).astype(int)

    # Save datasets
    df.to_csv('../data/bills_with_features_full.csv', index=False)
    logging.info(f"Saved {len(df)} bills to bills_with_features_full.csv")

    # Create training dataset (rows whose outcome label is not -1 / unknown)
    training_df = df[df['passed'] != -1].copy()
    training_df.to_csv('../data/bills_training_data_full.csv', index=False)
    logging.info(f"Saved {len(training_df)} bills with known outcomes for training")

    # Print summary
    print("\n" + "="*60)
    print("EXTRACTION COMPLETE")
    print("="*60)
    print(f"Total bills extracted: {len(df)}")
    print(f"Bills with known outcomes: {len(training_df)}")
    print(f"Pass rate: {(training_df['passed'] == 1).mean()*100:.1f}%")
    print("\nBills by Congress:")
    print(df['congress'].value_counts().sort_index())
    print("\nBills by type:")
    print(df['bill_type'].value_counts())
    print("\nSponsor party distribution:")
    print(df['sponsor_party'].value_counts())

    if failed_groups:
        # Keep a checkpoint aimed at the earliest failed group so a re-run retries it.
        save_checkpoint(_checkpoint_state(
            all_features, processed_bills, total_bills_processed, *min(failed_groups)
        ))
        logging.warning(
            f"{len(failed_groups)} congress/bill-type group(s) failed; "
            "checkpoint kept so the next run retries them"
        )
    elif os.path.exists(CHECKPOINT_FILE):
        # Clean up checkpoint
        os.remove(CHECKPOINT_FILE)

    return df

if __name__ == "__main__":
    # Script entry point: run the extraction only when an API key is configured.
    if CONGRESS_API_KEY:
        for banner_line in (
            "Starting comprehensive bill extraction...",
            "This will fetch bills from the 113th-118th Congress",
            "The process is resumable - you can stop and restart anytime",
            "-" * 60,
        ):
            print(banner_line)

        df = extract_comprehensive_dataset()

        # An empty frame is how the extraction reports that nothing was collected.
        if df.empty:
            print("\nExtraction failed - check the log file for details")
        else:
            print("\nExtraction completed successfully!")
            print("Full dataset: ../data/bills_with_features_full.csv")
            print("Training dataset: ../data/bills_training_data_full.csv")
    else:
        print("ERROR: CONGRESS_API_KEY not found!")
        print("Please add your API key to the .env file")

2025-07-27 16:33:21,658 - INFO - Checkpoint loaded: 12100 bills already processed
2025-07-27 16:33:21,661 - INFO - Fetching all HR bills from Congress 114


Starting comprehensive bill extraction...
This will fetch bills from the 113th-118th Congress
The process is resumable - you can stop and restart anytime
------------------------------------------------------------


2025-07-27 16:33:41,270 - INFO - Found 6536 HR bills in Congress 114
Congress 114 HR:  48%|█████████████████████████████▉                                 | 3108/6536 [25:44<3:35:19,  3.77s/it]2025-07-27 16:59:29,552 - INFO - Checkpoint saved: 12200 bills processed
Congress 114 HR:  49%|██████████████████████████████▉                                | 3208/6536 [32:46<3:28:03,  3.75s/it]2025-07-27 17:06:30,697 - INFO - Checkpoint saved: 12300 bills processed
Congress 114 HR:  51%|███████████████████████████████▉                               | 3308/6536 [39:04<3:24:22,  3.80s/it]2025-07-27 17:12:51,849 - INFO - Checkpoint saved: 12400 bills processed
Congress 114 HR:  52%|████████████████████████████████▊                              | 3408/6536 [45:37<3:32:28,  4.08s/it]2025-07-27 17:19:21,522 - INFO - Checkpoint saved: 12500 bills processed
Congress 114 HR:  54%|█████████████████████████████████▊                             | 3508/6536 [51:45<2:58:43,  3.54s/it]2025-07-27 17:25:29,521 


EXTRACTION COMPLETE
Total bills extracted: 76897
Bills with known outcomes: 2536
Pass rate: 81.1%

Bills by Congress:
congress
113     9091
114    10233
115    11421
116    14345
117    15242
118    16565
Name: count, dtype: int64

Bills by type:
bill_type
HR       49170
S        26465
HJRES      831
SJRES      431
Name: count, dtype: int64

Sponsor party distribution:
sponsor_party
D          41343
R          35087
I            443
L             13
Unknown       11
Name: count, dtype: int64

Extraction completed successfully!
Full dataset: ../data/bills_with_features_full.csv
Training dataset: ../data/bills_training_data_full.csv
