In [1]:
# TIME-AWARE DUAL MODEL APPROACH

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.calibration import CalibratedClassifierCV
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

print("=" * 60)
print("TIME-AWARE MODEL TRAINING")
print("=" * 60)

# Load data
# Candidate datasets, in priority order (name -> path relative to the notebook's working directory).
candidate_datasets = {
    '118th_enhanced': '../data/bills_118th_congress_training_enhanced.csv',
}

# Keep only the candidates that actually exist on disk.
available_datasets = {
    name: path for name, path in candidate_datasets.items() if os.path.exists(path)
}
if not available_datasets:
    print("ERROR: No dataset found.")
    # Name the paths that were probed so the failure is actionable.
    raise FileNotFoundError(
        "No training dataset found; looked for: " + ", ".join(candidate_datasets.values())
    )

# First available candidate wins.
dataset_name, dataset_path = next(iter(available_datasets.items()))

print(f"Using {dataset_name} dataset")

df = pd.read_csv(dataset_path)
if 'passed' not in df.columns:
    # Everything below (targets, reporting) depends on this column.
    raise KeyError(f"Dataset {dataset_path} has no 'passed' column")

print(f"Loaded {len(df)} bills")
print(f"Passed bills: {df['passed'].sum()} ({df['passed'].mean()*100:.1f}%)")

# Viability definition
target_rule = "=" * 60
print("\n" + target_rule)
print("CREATING VIABILITY TARGET")
print(target_rule)

# Balanced viability criteria - aim for ~15-20% viable.
# DataFrame.get returns the scalar default when a column is absent, which makes
# the corresponding criterion uniformly False instead of raising.
action_counts = df.get('action_count', 0)
committee_counts = df.get('committee_count', 0)
cosponsor_counts = df.get('cosponsor_count', 0)

passed_mask = df['passed'] == 1                                         # Passed
active_with_committee = (action_counts >= 6) & (committee_counts >= 1)  # Good activity + committee
supported_and_active = (cosponsor_counts >= 30) & (action_counts >= 4)  # Strong support + some activity
very_active = action_counts >= 10                                       # Very high activity alone
almost_passed = df.get('failure_reason', '') == 'failed_to_complete'    # Almost passed

df['viable'] = (
    passed_mask
    | active_with_committee
    | supported_and_active
    | very_active
    | almost_passed
).astype(int)

# Add milestone-based viability: any strong milestone keyword in the latest action text.
if 'latest_action' in df.columns:
    strong_milestones = ['passed', 'reported', 'ordered reported', 'markup', 'hearing held']
    pattern = '|'.join(strong_milestones)
    latest_action_text = df['latest_action'].fillna('').str.lower()
    df['has_strong_milestone'] = latest_action_text.str.contains(pattern)
    df['viable'] = (df['viable'] | df['has_strong_milestone']).astype(int)

viable_total = df['viable'].sum()
viable_share = df['viable'].mean() * 100
print(f"Viable bills: {viable_total} ({viable_share:.1f}%)")

# Feature engineering
print("\nCreating enhanced features...")


def _column_or_default(frame, column, default):
    """Return ``frame[column]`` with NaNs replaced by ``default``.

    When the column is absent, a constant Series aligned to ``frame.index`` is
    returned instead, so callers can always chain Series methods such as
    ``.clip``. (``frame.get(column, scalar).fillna(...)`` raises AttributeError
    in that case because the scalar default has no ``.fillna``.)
    """
    if column in frame.columns:
        return frame[column].fillna(default)
    return pd.Series(default, index=frame.index)


# Fill missing values with sensible defaults
df['sponsor_party'] = df['sponsor_party'].fillna('Unknown')
df['policy_area'] = df['policy_area'].fillna('Unknown')
df['days_since_introduction'] = _column_or_default(df, 'days_since_introduction', 30).clip(1, 730)
df['action_count'] = _column_or_default(df, 'action_count', 1).clip(1, 100)
df['committee_count'] = _column_or_default(df, 'committee_count', 0)
df['cosponsor_count'] = _column_or_default(df, 'cosponsor_count', 0)
df['original_cosponsor_count'] = _column_or_default(df, 'original_cosponsor_count', 0)

# Label encoding (the fitted encoders are persisted later alongside the models)
le_party = LabelEncoder()
df['sponsor_party_encoded'] = le_party.fit_transform(df['sponsor_party'])

le_policy = LabelEncoder()
df['policy_area_encoded'] = le_policy.fit_transform(df['policy_area'])

# Enhanced feature engineering
# Basic sponsor features
df['dem_sponsors'] = _column_or_default(df, 'dem_sponsors', 0)
df['rep_sponsors'] = _column_or_default(df, 'rep_sponsors', 0)
df['ind_sponsors'] = _column_or_default(df, 'ind_sponsors', 0)
df['dem_cosponsors'] = _column_or_default(df, 'dem_cosponsors', 0)
df['rep_cosponsors'] = _column_or_default(df, 'rep_cosponsors', 0)

df['sponsor_count'] = (df['dem_sponsors'] + df['rep_sponsors'] + df['ind_sponsors']).clip(1, 10)
df['total_sponsors'] = df['sponsor_count'] + df['cosponsor_count']

# Party balance features (+1 in the denominator avoids division by zero)
df['dem_total'] = df['dem_sponsors'] + df['dem_cosponsors']
df['rep_total'] = df['rep_sponsors'] + df['rep_cosponsors']
df['party_balance'] = (df['dem_total'] - df['rep_total']) / (df['total_sponsors'] + 1)
df['party_dominance'] = df['party_balance'].abs()

# Bipartisan features
df['bipartisan_score'] = 1 - df['party_dominance']
df['has_bipartisan_support'] = ((df['dem_total'] > 0) & (df['rep_total'] > 0)).astype(int)

# Temporal features - more robust
# NOTE(review): the fallback month is "whenever the notebook runs", so rows with a
# missing month get a different value on different run dates. Consider a fixed default.
current_month = datetime.now().month
df['month_introduced'] = _column_or_default(df, 'month_introduced', current_month)
df['quarter_introduced'] = ((df['month_introduced'] - 1) // 3 + 1).astype(int)
df['is_election_year'] = _column_or_default(df, 'is_election_year', 0)

# Text features
df['title_length'] = _column_or_default(df, 'title_length', 100).clip(10, 500)
df['title_word_count'] = _column_or_default(df, 'title_word_count', 20).clip(2, 100)
df['title_complexity'] = df['title_length'] / (df['title_word_count'] + 1)

# Subject features
df['subject_count'] = _column_or_default(df, 'subject_count', 1).clip(1, 20)

# Time-aware features with better scaling
df['days_active'] = df['days_since_introduction'].clip(1, 730)
df['log_days_active'] = np.log1p(df['days_active'])
df['sqrt_days_active'] = np.sqrt(df['days_active'])

# Activity features - normalized (days_active >= 1, so the divisors are never zero)
df['activity_rate'] = df['action_count'] / df['days_active']
df['normalized_activity'] = df['action_count'] / np.log1p(df['days_active'])
df['early_activity'] = df['action_count'] / (df['days_active'].clip(upper=30) + 1)

# Momentum features
df['is_fresh'] = (df['days_active'] <= 30).astype(int)
df['is_active'] = (df['days_active'] <= 90).astype(int)
df['is_stale'] = (df['days_active'] > 180).astype(int)

# Committee features (per 30-day period, with at least one period counted)
df['committee_density'] = df['committee_count'] / (df['days_active'] / 30).clip(lower=1)
df['has_committee'] = (df['committee_count'] > 0).astype(int)
df['multi_committee'] = (df['committee_count'] >= 2).astype(int)

# Support growth features
df['cosponsor_growth'] = (df['cosponsor_count'] - df['original_cosponsor_count']) / (df['days_active'] / 30).clip(lower=1)
df['support_velocity'] = df['total_sponsors'] / np.sqrt(df['days_active'])

# Interaction features
df['bipartisan_momentum'] = df['bipartisan_score'] * df['normalized_activity']
df['committee_activity'] = df['committee_count'] * df['activity_rate']

# Feature sets are nested (base < extended < progressive) so that models for
# older bills see everything the younger-bill models see, plus more.

# Known at introduction time.
base_features = [
    # sponsor make-up
    'sponsor_party_encoded', 'sponsor_count', 'original_cosponsor_count',
    # timing of introduction
    'month_introduced', 'quarter_introduced', 'is_election_year',
    # title text
    'title_length', 'title_word_count', 'title_complexity',
    # subject matter
    'subject_count', 'policy_area_encoded',
    # partisanship
    'party_balance', 'party_dominance', 'bipartisan_score', 'has_bipartisan_support',
]

# Adds support-related signals.
extended_features = [
    *base_features,
    'cosponsor_count', 'total_sponsors',
    'is_fresh', 'support_velocity', 'cosponsor_growth',
    'dem_total', 'rep_total',
]

# Adds age, activity and committee signals.
progressive_features = [
    *extended_features,
    'days_active', 'log_days_active',
    'activity_rate', 'normalized_activity',
    'is_active', 'is_stale',
    'committee_count', 'has_committee', 'multi_committee', 'committee_density',
    'action_count', 'early_activity',
    'bipartisan_momentum', 'committee_activity',
]

# Model training function with improved methodology
def train_robust_model(df, target_col, model_name, feature_sets, use_calibration=True):
    """Train one soft-voting ensemble per feature set and report hold-out metrics.

    For every ``stage_name -> feature list`` entry in ``feature_sets`` this:

    1. keeps the listed features that exist in ``df`` (NaN and +/-inf become 0),
    2. makes a stratified 80/20 train/test split (``random_state=42``),
    3. standardises the features and keeps the top-k by mutual information,
    4. fits a RandomForest / GradientBoosting / LogisticRegression soft-voting
       ensemble (weights 0.4 / 0.4 / 0.2),
    5. wraps it in 3-fold isotonic calibration when ``use_calibration`` is set and
       the positive rate is below 30%,
    6. scores the result on the hold-out split and runs 5-fold CV ROC-AUC for the
       uncalibrated ensemble on the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that also contains ``target_col``.
    target_col : str
        Name of the binary target column.
    model_name : str
        Label used only in the printed banner.
    feature_sets : dict[str, list[str]]
        Stage name -> candidate feature names; names missing from ``df`` are skipped.
    use_calibration : bool, default True
        Allow isotonic calibration (still subject to the positive-rate condition).

    Returns
    -------
    dict
        ``stage_name ->`` dict with keys ``model`` (final, possibly calibrated),
        ``ensemble``, ``rf_model``, ``gb_model``, ``lr_model``, ``scaler``,
        ``selector``, ``features``, ``selected_features``, ``threshold`` and
        ``performance`` (accuracy, roc_auc, precision, recall, f1_score,
        cv_roc_auc, cv_std).

    Raises
    ------
    ValueError
        If a stage has no usable features or the target has fewer than two classes.
    """
    # Function-local import: these metrics are not among the names imported at the top of the file.
    from sklearn.metrics import precision_score, recall_score, f1_score

    print(f"\n{'='*60}")
    print(f"TRAINING {model_name.upper()} MODELS")
    print(f"{'='*60}")

    models = {}

    for stage_name, features in feature_sets.items():
        print(f"\n--- Training {stage_name} model ---")

        # Filter available features
        available_features = [f for f in features if f in df.columns]
        print(f"Using {len(available_features)} features")
        if not available_features:
            raise ValueError(
                f"None of the features requested for stage '{stage_name}' exist in the data"
            )

        # Prepare data
        X = df[available_features].fillna(0)
        X = X.replace([np.inf, -np.inf], 0)
        y = df[target_col]
        if y.nunique() < 2:
            # Stratified splitting and ROC-AUC are undefined for a single-class target.
            raise ValueError(
                f"Target '{target_col}' needs at least two classes to train the '{stage_name}' model"
            )

        # Check class distribution
        pos_rate = y.mean()
        print(f"Positive class rate: {pos_rate:.1%}")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features (fit on the training split only, then apply to the test split)
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test),
            columns=X_test.columns,
            index=X_test.index
        )

        # Feature selection - use mutual information for better feature selection.
        # Drop at most one feature, cap at 20, and never go below 1 (a single
        # available feature would otherwise give k=0 and select nothing).
        k_features = max(1, min(20, len(available_features) - 1))
        selector = SelectKBest(mutual_info_classif, k=k_features)
        selector.fit(X_train_scaled, y_train)

        selected_indices = selector.get_support(indices=True)
        selected_features = [available_features[i] for i in selected_indices]

        X_train_selected = X_train_scaled[selected_features]
        X_test_selected = X_test_scaled[selected_features]

        print(f"Selected top {k_features} features")

        # Create diverse ensemble (templates only; VotingClassifier fits clones of these)
        # 1. Random Forest with balanced weights
        rf_template = RandomForestClassifier(
            n_estimators=300,
            max_depth=15,
            min_samples_split=20,
            min_samples_leaf=10,
            class_weight='balanced_subsample',
            random_state=42,
            n_jobs=-1
        )

        # 2. Gradient Boosting with careful tuning
        gb_template = GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            min_samples_split=20,
            min_samples_leaf=10,
            random_state=42
        )

        # 3. Logistic Regression for stable predictions
        lr_template = LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )

        # Create voting ensemble
        ensemble = VotingClassifier(
            estimators=[
                ('rf', rf_template),
                ('gb', gb_template),
                ('lr', lr_template)
            ],
            voting='soft',
            weights=[0.4, 0.4, 0.2]  # Weight trees more than linear
        )

        # Train ensemble. Fitting it trains a clone of every member, so the members
        # are NOT fitted separately beforehand; that would train each one twice on
        # the same data with the same seed.
        print("Training ensemble model...")
        ensemble.fit(X_train_selected, y_train)

        # Fitted members, exposed individually for importances and spread estimates.
        rf_model = ensemble.named_estimators_['rf']
        gb_model = ensemble.named_estimators_['gb']
        lr_model = ensemble.named_estimators_['lr']

        # Calibrate probabilities if requested
        if use_calibration and pos_rate < 0.3:
            print("Calibrating probabilities...")
            calibrated_ensemble = CalibratedClassifierCV(
                ensemble,
                method='isotonic',
                cv=3
            )
            calibrated_ensemble.fit(X_train_selected, y_train)
            final_model = calibrated_ensemble
        else:
            final_model = ensemble

        # Make predictions
        y_pred_proba = final_model.predict_proba(X_test_selected)[:, 1]

        # Use adaptive threshold based on class balance
        # For imbalanced data, use a lower threshold
        if pos_rate < 0.15:  # Very imbalanced
            threshold = pos_rate * 2  # More lenient threshold
        else:
            threshold = 0.5

        y_pred = (y_pred_proba >= threshold).astype(int)

        # Calculate metrics with adaptive threshold
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Also calculate metrics at default threshold for comparison
        y_pred_default = (y_pred_proba >= 0.5).astype(int)
        f1_default = f1_score(y_test, y_pred_default, zero_division=0)

        print(f"\nPerformance (threshold={threshold:.3f}):")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  ROC-AUC: {roc_auc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        if threshold != 0.5:
            print(f"  F1 at 0.5 threshold: {f1_default:.4f}")

        # Cross-validation for robustness check.
        # NOTE: the scaler and selector were fitted on the whole training split, so
        # the CV folds share that preprocessing; the score is slightly optimistic.
        cv_scores = cross_val_score(
            ensemble, X_train_selected, y_train,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc'
        )
        print(f"  CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

        # Store model components
        models[stage_name] = {
            'model': final_model,
            'ensemble': ensemble,
            'rf_model': rf_model,
            'gb_model': gb_model,
            'lr_model': lr_model,
            'scaler': scaler,
            'selector': selector,
            'features': available_features,
            'selected_features': selected_features,
            'threshold': threshold,
            'performance': {
                'accuracy': accuracy,
                'roc_auc': roc_auc,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'cv_roc_auc': cv_scores.mean(),
                'cv_std': cv_scores.std()
            }
        }

        # Feature importance analysis
        if hasattr(rf_model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': selected_features,
                'importance': rf_model.feature_importances_
            }).sort_values('importance', ascending=False)

            print(f"\nTop 5 features:")
            for _, row in importance_df.head(5).iterrows():
                print(f"  - {row['feature']}: {row['importance']:.4f}")

    return models

# Map each bill stage to the feature list its models are allowed to use
feature_sets = dict(
    new_bill=base_features,
    early_stage=extended_features,
    progressive=progressive_features,
)

phase_rule = "=" * 70

# Phase 1: viability models, trained on every bill
print("\n" + phase_rule)
print("PHASE 1: VIABILITY PREDICTION")
print(phase_rule)
viability_models = train_robust_model(df, 'viable', 'Viability', feature_sets)

# Phase 2: passage models, trained only on the bills labelled viable
viable_bills = df.loc[df['viable'] == 1].copy()
viable_pass_rate = viable_bills['passed'].mean()
print("\n" + phase_rule)
print("PHASE 2: PASSAGE PREDICTION")
print(f"Training on {len(viable_bills)} viable bills ({viable_pass_rate:.1%} passed)")
print(phase_rule)

passage_models = train_robust_model(viable_bills, 'passed', 'Passage', feature_sets)

# Model evaluation on different bill stages
print("\n" + "=" * 60)
print("MODEL EVALUATION BY BILL AGE")
print("=" * 60)

# (display label, key into viability_models, row mask on bill age in days)
bill_age = df['days_active']
stage_definitions = [
    ('Brand New', 'new_bill', bill_age <= 1),
    ('Early Stage', 'early_stage', (bill_age > 1) & (bill_age <= 30)),
    ('Progressive', 'progressive', bill_age > 30),
]

for stage_name, model_key, stage_mask in stage_definitions:
    stage_bills = df[stage_mask]
    if len(stage_bills) == 0:
        continue

    stage_performance = viability_models[model_key]['performance']
    print(f"\n{stage_name} Bills (n={len(stage_bills)}):")
    print(f"  Viability rate: {stage_bills['viable'].mean():.1%}")
    print(f"  Passage rate: {stage_bills['passed'].mean():.1%}")
    # This is the ROC-AUC stored at training time (hold-out split over all bills),
    # not a score recomputed on the bills of this age bucket.
    print(f"  Model ROC-AUC: {stage_performance['roc_auc']:.4f}")

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save models in logical groups
print("\n" + "="*60)
print("SAVING MODELS IN ORGANIZED FILES")
print("="*60)

# Save metadata and encoders
# NOTE(review): the 'improvements' entry about fixed 0.5 thresholds does not match
# the adaptive threshold chosen in train_robust_model; confirm before relying on it.
metadata = {
    'training_date': datetime.now().isoformat(),
    'dataset_size': len(df),
    'viable_rate': df['viable'].mean(),
    'passage_rate': df['passed'].mean(),
    'model_version': '3.0',
    'improvements': [
        'Conservative thresholds (0.5)',
        'Ensemble with calibration',
        'Mutual information feature selection',
        'Better handling of class imbalance',
        'More robust features for early bills'
    ],
    'feature_sets': feature_sets  # Stage -> feature list, needed to rebuild inputs at inference
}

joblib.dump({
    'metadata': metadata,
    'label_encoders': {
        'party': le_party,
        'policy': le_policy
    }
}, 'models/metadata.pkl')
print("✅ Saved metadata.pkl")

# Save viability and passage models - one file per (family, stage).
# Both families share the same package layout, so one loop handles both.
for model_family, stage_models in (('viability', viability_models), ('passage', passage_models)):
    for stage_name, model_data in stage_models.items():
        # Everything needed to score a bill at this stage, bundled in one file
        stage_package = {
            'rf_model': model_data['rf_model'],
            'gb_model': model_data['gb_model'],
            'lr_model': model_data['lr_model'],
            'ensemble': model_data['ensemble'],
            'final_model': model_data['model'],
            'scaler': model_data['scaler'],
            'selector': model_data['selector'],
            'features': model_data['features'],
            'selected_features': model_data['selected_features'],
            'threshold': model_data['threshold'],
            'performance': model_data['performance']
        }

        filename = f'models/{model_family}_{stage_name}.pkl'
        joblib.dump(stage_package, filename)

        # Check file size
        size_mb = os.path.getsize(filename) / (1024 * 1024)
        print(f"✅ Saved {filename} ({size_mb:.1f} MB)")

# List all saved files and their sizes
print("\n" + "=" * 60)
print("MODEL FILE SUMMARY")
print("=" * 60)

BYTES_PER_MB = 1024 * 1024
size_limit_bytes = 100 * BYTES_PER_MB

pkl_names = sorted(name for name in os.listdir('models') if name.endswith('.pkl'))
pkl_byte_sizes = []
total_size = 0
file_count = 0

for pkl_name in pkl_names:
    n_bytes = os.path.getsize(os.path.join('models', pkl_name))
    pkl_byte_sizes.append(n_bytes)
    size_mb = n_bytes / BYTES_PER_MB
    total_size += size_mb
    file_count += 1

    # Describe what each file contains, based on its name
    if pkl_name == 'metadata.pkl':
        desc = "Training metadata and label encoders"
    else:
        for family in ('viability', 'passage'):
            if family in pkl_name:
                stage = pkl_name.replace(f'{family}_', '').replace('.pkl', '')
                desc = f"All {family} models for {stage.replace('_', ' ')} bills"
                break
        else:
            desc = "Model file"

    print(f"{pkl_name:<30} {size_mb:>6.1f} MB  - {desc}")

print(f"\nTotal: {file_count} files, {total_size:.1f} MB")

# Flag any pickle at or above the 100 MB mark
if all(n_bytes < size_limit_bytes for n_bytes in pkl_byte_sizes):
    print("All files under 100 MB ✓")
else:
    print("WARNING: Some files exceed 100 MB!")

# Sample predictions with the spread between ensemble members.
# The printed "range" is the min/max of the three member probabilities,
# i.e. a disagreement measure, not a statistical confidence interval.
print("\n" + "="*60)
print("SAMPLE PREDICTIONS WITH CONFIDENCE")
print("="*60)

# Get a few bills (drawn from the full frame, so they may include training rows).
# Capped at the frame size so tiny datasets do not raise.
test_bills = df.sample(min(5, len(df)), random_state=42)

for row_pos, (_, bill) in enumerate(test_bills.iterrows()):
    # Determine model stage from the bill's age
    if bill['days_active'] <= 1:
        model_key = 'new_bill'
    elif bill['days_active'] <= 30:
        model_key = 'early_stage'
    else:
        model_key = 'progressive'

    # Get features
    viability_model = viability_models[model_key]
    features = viability_model['features']

    # Prepare data exactly like training: a float frame with named columns and
    # NaN / +/-inf mapped to 0. (Slicing the frame instead of the row Series keeps
    # numeric dtypes; np.nan_to_num(values, 0) would pass 0 as `copy` and leave an
    # object-dtype row untouched.)
    bill_frame = (
        test_bills.iloc[[row_pos]][features]
        .astype(float)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    # Scale and select, keeping feature names so they match what the models were fitted on
    bill_scaled = pd.DataFrame(
        viability_model['scaler'].transform(bill_frame),
        columns=features,
        index=bill_frame.index
    )
    bill_selected = bill_scaled[viability_model['selected_features']]

    # Final (possibly calibrated) ensemble prediction
    viability_proba = viability_model['model'].predict_proba(bill_selected)[0, 1]

    # Individual member predictions, used for the spread
    probas = [
        viability_model[member].predict_proba(bill_selected)[0, 1]
        for member in ('rf_model', 'gb_model', 'lr_model')
    ]
    confidence_low = min(probas)
    confidence_high = max(probas)

    print(f"\nBill: {bill.get('bill_id', 'Unknown')} (Day {int(bill['days_active'])})")
    print(f"  Actual: Viable={bill['viable']}, Passed={bill['passed']}")
    print(f"  Model: {model_key}")
    print(f"  Viability: {viability_proba:.1%} (range: {confidence_low:.1%}-{confidence_high:.1%})")
    print(f"  Confidence spread: {(confidence_high - confidence_low):.1%}")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("Models saved in 'models' directory as separate files")
print("="*60)

TIME-AWARE MODEL TRAINING
Using 118th_enhanced dataset
Loaded 16565 bills
Passed bills: 274 (1.7%)

CREATING VIABILITY TARGET
Viable bills: 3364 (20.3%)

Creating enhanced features...

PHASE 1: VIABILITY PREDICTION

TRAINING VIABILITY MODELS

--- Training new_bill model ---
Using 15 features
Positive class rate: 20.3%
Selected top 14 features
Training individual models...
Training ensemble model...
Calibrating probabilities...

Performance (threshold=0.500):
  Accuracy: 0.8071
  ROC-AUC: 0.7679
  Precision: 0.5746
  Recall: 0.1947
  F1 Score: 0.2908
  CV ROC-AUC: 0.7626 (+/- 0.0264)

Top 5 features:
  - policy_area_encoded: 0.1612
  - title_complexity: 0.1035
  - has_bipartisan_support: 0.1007
  - party_dominance: 0.0994
  - bipartisan_score: 0.0968

--- Training early_stage model ---
Using 22 features
Positive class rate: 20.3%
Selected top 20 features
Training individual models...
Training ensemble model...
Calibrating probabilities...

Performance (threshold=0.500):
  Accuracy: 0.81

KeyboardInterrupt: 