In [1]:
# TIME-AWARE DUAL MODEL APPROACH FOR 6-CONGRESS DATASET - OPTIMIZED SPLIT COMPONENTS

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.calibration import CalibratedClassifierCV
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

# Banner, dataset load, and construction of the binary 'viable' target.
_rule = "=" * 60
print(_rule)
print("TIME-AWARE MODEL TRAINING - 6 CONGRESS DATASET")
print(_rule)

# Training dataset: bills whose outcome is already known
dataset_path = '../data/bills_6congress_training.csv'
print(f"Loading dataset from: {dataset_path}")

df = pd.read_csv(dataset_path)
_passed = df['passed']
print(f"Loaded {len(df)} bills with known outcomes")
print(f"Congresses: {sorted(df['congress'].unique())}")
print(f"Passed bills: {_passed.sum()} ({_passed.mean()*100:.1f}%)")

# Viability definition
print("\n" + _rule)
print("CREATING VIABILITY TARGET")
print(_rule)

# DataFrame.get falls back to the scalar default when a column is absent,
# which makes the corresponding criterion False for every row.
_actions = df.get('action_count', 0)
_committees = df.get('committee_count', 0)
_cosponsors = df.get('cosponsor_count', 0)

# Balanced viability criteria - aim for ~15-20% viable
_is_passed = _passed == 1                                   # Passed
_committee_track = (_actions >= 6) & (_committees >= 1)     # Good activity + committee
_strong_support = (_cosponsors >= 30) & (_actions >= 4)     # Strong support + some activity
_very_active = _actions >= 10                               # Very high activity alone
_near_miss = df.get('failure_reason', '') == 'failed_to_complete'  # Almost passed

df['viable'] = (
    _is_passed | _committee_track | _strong_support | _very_active | _near_miss
).astype(int)

# Milestone-based viability: a bill whose latest action text mentions any of
# these milestones is also counted as viable.
if 'latest_action' in df.columns:
    strong_milestones = ['passed', 'reported', 'ordered reported', 'markup', 'hearing held']
    pattern = '|'.join(strong_milestones)
    _latest_action_text = df['latest_action'].fillna('').str.lower()
    df['has_strong_milestone'] = _latest_action_text.str.contains(pattern)
    df['viable'] = (df['viable'] | df['has_strong_milestone']).astype(int)

_viable = df['viable']
print(f"Viable bills: {_viable.sum()} ({_viable.mean()*100:.1f}%)")

# Feature engineering adapted for 6-congress dataset
print("\nCreating enhanced features...")

# Handle column existence more carefully
def safe_get_column(df, col_name, default_value):
    """Return column ``col_name`` of ``df`` as a Series with gaps filled.

    Args:
        df: Source DataFrame.
        col_name: Name of the column to read.
        default_value: Scalar used to fill missing values, or to populate the
            whole Series when the column does not exist.

    Returns:
        A pandas Series aligned with ``df.index``. When the column exists its
        missing values are replaced by ``default_value``; otherwise every
        element equals ``default_value``.
    """
    if col_name in df.columns:
        return df[col_name].fillna(default_value)
    # Always hand back a Series (never a bare scalar) so callers can chain
    # Series methods such as .clip() even when the column is absent.
    return pd.Series(default_value, index=df.index, name=col_name)

# Categorical columns: an absent column and a missing value both become 'Unknown'
for _categorical in ('sponsor_party', 'policy_area'):
    df[_categorical] = safe_get_column(df, _categorical, 'Unknown')

# days_active (rather than days_since_introduction) normally comes from preprocessing;
# it is rebuilt here only when the column is absent.
if 'days_active' not in df.columns:
    if {'latest_action_date', 'introduced_date'}.issubset(df.columns):
        for _date_col in ('introduced_date', 'latest_action_date'):
            df[_date_col] = pd.to_datetime(df[_date_col])
        _elapsed = df['latest_action_date'] - df['introduced_date']
        # Unknown spans default to 30 days; the result is bounded to [1, 730]
        df['days_active'] = _elapsed.dt.days.fillna(30).clip(1, 730)
    else:
        df['days_active'] = 30

# Count columns: action_count defaults to 1 and is bounded to [1, 100];
# the remaining counts default to 0.
df['action_count'] = safe_get_column(df, 'action_count', 1).clip(1, 100)
for _count_col in ('committee_count', 'cosponsor_count', 'original_cosponsor_count'):
    df[_count_col] = safe_get_column(df, _count_col, 0)

# Label encoding. 'Unknown' is always appended to the fitted vocabulary so the
# encoders know that category even when it does not occur in the data.
all_parties = [*df['sponsor_party'].unique(), 'Unknown']
le_party = LabelEncoder().fit(all_parties)
df['sponsor_party_encoded'] = le_party.transform(df['sponsor_party'])

all_policies = [*df['policy_area'].unique(), 'Unknown']
le_policy = LabelEncoder().fit(all_policies)
df['policy_area_encoded'] = le_policy.transform(df['policy_area'])

# Enhanced feature engineering
# Basic sponsor features: per-party counts, 0 when the column is absent or a value is missing
df['dem_sponsors'] = safe_get_column(df, 'dem_sponsors', 0)
df['rep_sponsors'] = safe_get_column(df, 'rep_sponsors', 0)
df['ind_sponsors'] = safe_get_column(df, 'ind_sponsors', 0)
df['dem_cosponsors'] = safe_get_column(df, 'dem_cosponsors', 0)
df['rep_cosponsors'] = safe_get_column(df, 'rep_cosponsors', 0)

# Sponsor count is bounded to [1, 10]
df['sponsor_count'] = (df['dem_sponsors'] + df['rep_sponsors'] + df['ind_sponsors']).clip(1, 10)
df['total_sponsors'] = df['sponsor_count'] + df['cosponsor_count']

# Party balance features
df['dem_total'] = df['dem_sponsors'] + df['dem_cosponsors']
df['rep_total'] = df['rep_sponsors'] + df['rep_cosponsors']
# The +1 in the denominator keeps the ratio finite
df['party_balance'] = (df['dem_total'] - df['rep_total']) / (df['total_sponsors'] + 1)
df['party_dominance'] = df['party_balance'].abs()

# Bipartisan features
df['bipartisan_score'] = 1 - df['party_dominance']
# Use the precomputed 'is_bipartisan' flag when it carries data; otherwise derive
# the flag from the party totals. The availability check has to look at the
# SOURCE column: checking 'has_bipartisan_support' after assigning it (already
# NaN-filled with 0) can never trigger the fallback.
if 'is_bipartisan' in df.columns and not df['is_bipartisan'].isna().all():
    df['has_bipartisan_support'] = df['is_bipartisan'].fillna(0)
else:
    df['has_bipartisan_support'] = ((df['dem_total'] > 0) & (df['rep_total'] > 0)).astype(int)

# Temporal features (an absent column falls back to the current month / quarter)
_current_month = datetime.now().month
df['month_introduced'] = safe_get_column(df, 'month_introduced', _current_month)
df['quarter_introduced'] = safe_get_column(df, 'quarter_introduced', (_current_month - 1) // 3 + 1)
df['is_election_year'] = safe_get_column(df, 'is_election_year', 0)

# Congress-specific features
df['congress_numeric'] = df['congress'].astype(int)
df['is_recent_congress'] = df['congress_numeric'].ge(117).astype(int)

# Text features (defaults, then bounded ranges)
df['title_length'] = safe_get_column(df, 'title_length', 100).clip(10, 500)
df['title_word_count'] = safe_get_column(df, 'title_word_count', 20).clip(2, 100)
df['title_complexity'] = df['title_length'] / (df['title_word_count'] + 1)

# Subject features
df['subject_count'] = safe_get_column(df, 'subject_count', 1).clip(1, 20)

# Time-aware features with better scaling
_days = df['days_active']
df['log_days_active'] = np.log1p(_days)
df['sqrt_days_active'] = np.sqrt(_days)

# Activity features - a precomputed velocity column wins when present
if 'legislative_velocity' in df.columns:
    df['activity_rate'] = df['legislative_velocity']
else:
    df['activity_rate'] = df['action_count'] / _days

df['normalized_activity'] = df['action_count'] / df['log_days_active']

# Windowed activity: computed only when preprocessing did not supply the column.
# The day count is capped at the window length before dividing.
for _activity_col, _window_days in (('early_activity', 30), ('sustained_activity', 180)):
    if _activity_col not in df.columns:
        df[_activity_col] = df['action_count'] / (_days.clip(upper=_window_days) + 1)

# Momentum features
df['is_fresh'] = _days.le(30).astype(int)
df['is_active'] = _days.le(90).astype(int)
df['is_stale'] = _days.gt(180).astype(int)

# Committee features - 30-day periods elapsed, never less than one
_months_active = (_days / 30).clip(lower=1)
if 'committee_engagement_speed' in df.columns:
    df['committee_density'] = df['committee_engagement_speed']
else:
    df['committee_density'] = df['committee_count'] / _months_active

df['has_committee'] = df['committee_count'].gt(0).astype(int)
df['multi_committee'] = df['committee_count'].ge(2).astype(int)

# Support growth features
_added_cosponsors = df['cosponsor_count'] - df['original_cosponsor_count']
df['cosponsor_growth'] = _added_cosponsors / _months_active
df['support_velocity'] = df['total_sponsors'] / df['sqrt_days_active']

# Interaction features
df['bipartisan_momentum'] = df['bipartisan_score'] * df['normalized_activity']
df['committee_activity'] = df['committee_count'] * df['activity_rate']

# Feature sets. Each later set extends the previous one; list order is kept
# stable because downstream code maps selector indices back to these names.
base_features = [
    # sponsor / timing
    'sponsor_party_encoded', 'sponsor_count', 'original_cosponsor_count',
    'month_introduced', 'quarter_introduced', 'is_election_year',
    # title / subject
    'title_length', 'title_word_count', 'title_complexity',
    'subject_count', 'policy_area_encoded',
    # partisanship
    'party_balance', 'party_dominance', 'bipartisan_score', 'has_bipartisan_support',
    # congress info (the second flag distinguishes newer congresses)
    'congress_numeric', 'is_recent_congress',
]

extended_features = [
    *base_features,
    'cosponsor_count', 'total_sponsors', 'is_fresh',
    'support_velocity', 'cosponsor_growth',
    'dem_total', 'rep_total',
    'early_activity',  # from preprocessing
]

progressive_features = [
    *extended_features,
    'days_active', 'log_days_active',
    'activity_rate', 'normalized_activity',
    'sustained_activity',  # from preprocessing
    'is_active', 'is_stale',
    'committee_count', 'has_committee', 'multi_committee', 'committee_density',
    'action_count',
    'bipartisan_momentum', 'committee_activity',
]

# Report any feature that did not make it into the DataFrame
print("\nVerifying feature availability...")
_known_columns = set(df.columns)
for _set_label, _candidates in (
    ('base', base_features),
    ('extended', extended_features),
    ('progressive', progressive_features),
):
    _absent = [name for name in _candidates if name not in _known_columns]
    if _absent:
        print(f"Warning: Missing features in {_set_label}: {_absent}")

# Keep only the features that actually exist
base_features = [name for name in base_features if name in _known_columns]
extended_features = [name for name in extended_features if name in _known_columns]
progressive_features = [name for name in progressive_features if name in _known_columns]

print("\nFeature counts:")
print(f"- Base: {len(base_features)}")
print(f"- Extended: {len(extended_features)}")
print(f"- Progressive: {len(progressive_features)}")

# Model training function remains the same
def train_robust_model(df, target_col, model_name, feature_sets, use_calibration=True):
    """Train and evaluate one soft-voting ensemble per feature set.

    For each ``stage_name -> feature list`` entry in ``feature_sets`` this
    function keeps the features present in ``df``, replaces NaN and +/-inf
    with 0, makes a stratified 80/20 train/test split, standardizes the
    features, keeps the top features ranked by mutual information, fits a
    weighted soft-voting ensemble (random forest, gradient boosting, logistic
    regression), optionally wraps it in isotonic calibration, and prints
    hold-out and 5-fold cross-validated metrics.

    Args:
        df: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the binary (0/1) target column.
        model_name: Label used only in the printed section header.
        feature_sets: Mapping of stage name to a list of candidate feature names.
        use_calibration: When True, the ensemble is calibrated whenever the
            positive-class rate of the target is below 30%.

    Returns:
        dict keyed by stage name. Each value is a dict with the keys
        'model', 'ensemble', 'rf_model', 'gb_model', 'lr_model', 'scaler',
        'selector', 'features', 'selected_features', 'threshold',
        'performance' and 'is_calibrated'.
    """
    # Imported once per call instead of once per stage; these names are not
    # part of the module-level sklearn.metrics import.
    from sklearn.metrics import precision_score, recall_score, f1_score

    print(f"\n{'='*60}")
    print(f"TRAINING {model_name.upper()} MODELS")
    print(f"{'='*60}")

    models = {}

    for stage_name, features in feature_sets.items():
        print(f"\n--- Training {stage_name} model ---")

        # Filter available features
        available_features = [f for f in features if f in df.columns]
        print(f"Using {len(available_features)} features")

        # Prepare data: missing and infinite values are both mapped to 0
        X = df[available_features].fillna(0)
        X = X.replace([np.inf, -np.inf], 0)
        y = df[target_col]

        # Class distribution (measured on the full target, before the split)
        pos_rate = y.mean()
        print(f"Positive class rate: {pos_rate:.1%}")

        # Decide calibration once so the training path and the stored
        # 'is_calibrated' flag cannot drift apart.
        should_calibrate = bool(use_calibration and pos_rate < 0.3)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features (fit on the training part only)
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test),
            columns=X_test.columns,
            index=X_test.index
        )

        # Feature selection: keep at most 20 features and drop at least one,
        # but never fewer than one (a single-feature set would otherwise ask
        # SelectKBest for k=0 and leave nothing to train on).
        # NOTE(review): mutual_info_classif is called without a random_state,
        # so the selected subset may vary between runs - confirm whether a
        # fixed seed is wanted here.
        k_features = max(1, min(20, len(available_features) - 1))
        selector = SelectKBest(mutual_info_classif, k=k_features)
        selector.fit(X_train_scaled, y_train)

        selected_indices = selector.get_support(indices=True)
        selected_features = [available_features[i] for i in selected_indices]

        X_train_selected = X_train_scaled[selected_features]
        X_test_selected = X_test_scaled[selected_features]

        print(f"Selected top {k_features} features")

        # Create diverse ensemble (unfitted templates; VotingClassifier clones them)
        rf_template = RandomForestClassifier(
            n_estimators=300,
            max_depth=15,
            min_samples_split=20,
            min_samples_leaf=10,
            class_weight='balanced_subsample',
            random_state=42,
            n_jobs=-1
        )

        gb_template = GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            min_samples_split=20,
            min_samples_leaf=10,
            random_state=42
        )

        lr_template = LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )

        ensemble = VotingClassifier(
            estimators=[
                ('rf', rf_template),
                ('gb', gb_template),
                ('lr', lr_template)
            ],
            voting='soft',
            weights=[0.4, 0.4, 0.2]
        )

        # VotingClassifier.fit clones and fits every base estimator, so the
        # base models are trained exactly once here. Fitting them separately
        # beforehand would only duplicate the work and yield objects that are
        # not the ones the ensemble actually votes with.
        print("Training individual models...")
        print("Training ensemble model...")
        ensemble.fit(X_train_selected, y_train)

        # Fitted members of the ensemble, exposed for persistence and inspection
        rf_model = ensemble.named_estimators_['rf']
        gb_model = ensemble.named_estimators_['gb']
        lr_model = ensemble.named_estimators_['lr']

        # Calibrate if needed
        if should_calibrate:
            print("Calibrating probabilities...")
            calibrated_ensemble = CalibratedClassifierCV(
                ensemble,
                method='isotonic',
                cv=3
            )
            calibrated_ensemble.fit(X_train_selected, y_train)
            final_model = calibrated_ensemble
        else:
            final_model = ensemble

        # Make predictions
        y_pred_proba = final_model.predict_proba(X_test_selected)[:, 1]

        # Adaptive threshold: for rare positives use twice the base rate
        if pos_rate < 0.15:
            threshold = pos_rate * 2
        else:
            threshold = 0.5

        y_pred = (y_pred_proba >= threshold).astype(int)

        # Calculate metrics on the hold-out split
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        print(f"\nPerformance (threshold={threshold:.3f}):")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  ROC-AUC: {roc_auc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")

        # Cross-validation of the (uncalibrated) ensemble on the training split
        cv_scores = cross_val_score(
            ensemble, X_train_selected, y_train,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc'
        )
        print(f"  CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

        # Store model components
        models[stage_name] = {
            'model': final_model,
            'ensemble': ensemble,
            'rf_model': rf_model,
            'gb_model': gb_model,
            'lr_model': lr_model,
            'scaler': scaler,
            'selector': selector,
            'features': available_features,
            'selected_features': selected_features,
            'threshold': threshold,
            'performance': {
                'accuracy': accuracy,
                'roc_auc': roc_auc,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'cv_roc_auc': cv_scores.mean(),
                'cv_std': cv_scores.std()
            },
            'is_calibrated': should_calibrate
        }

        # Feature importance (taken from the random forest member)
        if hasattr(rf_model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': selected_features,
                'importance': rf_model.feature_importances_
            }).sort_values('importance', ascending=False)

            print(f"\nTop 5 features:")
            for _, row in importance_df.head(5).iterrows():
                print(f"  - {row['feature']}: {row['importance']:.4f}")

    return models

# Stage name -> candidate feature list (insertion order is the training order)
feature_sets = dict(
    new_bill=base_features,
    early_stage=extended_features,
    progressive=progressive_features,
)

_phase_rule = "=" * 70

# Phase 1: viability models are trained on every bill
print("\n" + _phase_rule)
print("PHASE 1: VIABILITY PREDICTION")
print(_phase_rule)
viability_models = train_robust_model(df, 'viable', 'Viability', feature_sets)

# Phase 2: passage models are trained on the viable subset only
viable_bills = df.loc[df['viable'] == 1].copy()
print("\n" + _phase_rule)
print("PHASE 2: PASSAGE PREDICTION")
print(f"Training on {len(viable_bills)} viable bills ({viable_bills['passed'].mean():.1%} passed)")
print(_phase_rule)

passage_models = train_robust_model(viable_bills, 'passed', 'Passage', feature_sets)

# Per-congress label rates (descriptive statistics of the targets, no model is scored here)
_eval_rule = "=" * 60
print("\n" + _eval_rule)
print("MODEL EVALUATION BY CONGRESS")
print(_eval_rule)

for congress, congress_df in df.groupby('congress', sort=True):
    _size = len(congress_df)
    print(f"\n{congress}th Congress (n={_size}):")
    print(f"  Viability rate: {congress_df['viable'].mean():.1%}")
    print(f"  Passage rate: {congress_df['passed'].mean():.1%}")

# Create the models directory. Every artifact below is written under
# '../models', so that is the directory that must exist before the first
# joblib.dump (creating 'models' in the working directory does not help).
os.makedirs('../models', exist_ok=True)

# Save models IN OPTIMIZED SPLIT COMPONENTS
print("\n" + "="*60)
print("SAVING MODELS IN OPTIMIZED SPLIT COMPONENTS")
print("="*60)

# Run-level metadata: when and on what the models were trained, plus the
# feature lists used for each stage.
metadata = {
    'training_date': datetime.now().isoformat(),
    'dataset_size': len(df),
    'congresses': sorted(df['congress'].unique().tolist()),
    'viable_rate': df['viable'].mean(),
    'passage_rate': df['passed'].mean(),
    'model_version': '6.0-6congress-optimized',
    'improvements': [
        'Trained on 6 congresses (113-118)',
        'Uses days_active instead of days_since_introduction',
        'Includes congress-specific features',
        'Better temporal feature engineering',
        'Optimized split components to avoid duplication',
        'Combined small components into single file'
    ],
    'feature_sets': feature_sets
}

# The fitted label encoders are stored next to the metadata so the categorical
# columns can be encoded the same way outside this script.
joblib.dump({
    'metadata': metadata,
    'label_encoders': {
        'party': le_party,
        'policy': le_policy
    }
}, '../models/metadata.pkl')
print("✅ Saved metadata.pkl")

# Function to save model components optimally
def save_model_components_optimized(models_dict, model_type):
    """Persist every trained stage under ``../models/<model_type>_<stage_name>/``.

    Files written per stage:
      * ``rf_model.pkl``        - the random forest on its own
      * ``components.pkl``      - gb_model, lr_model, scaler, selector and a
                                  small metadata dict
      * ``ensemble_config.pkl`` - voting mode, weights and estimator names
                                  (configuration only, no fitted models)
      * ``calibration.pkl``     - only when the stage is flagged as calibrated

    Args:
        models_dict: Mapping of stage name to a dict with the keys 'rf_model',
            'gb_model', 'lr_model', 'scaler', 'selector', 'features',
            'selected_features', 'threshold', 'performance', 'is_calibrated',
            and 'model' when the stage is calibrated. An optional 'ensemble'
            entry is used to read the voting configuration.
        model_type: Prefix of the per-stage directory name.

    Returns:
        None. Progress and file sizes are printed.
    """
    for stage_name, model_data in models_dict.items():
        # Create directory for this model
        model_dir = f'../models/{model_type}_{stage_name}'
        os.makedirs(model_dir, exist_ok=True)

        print(f"\n--- Saving {model_type}_{stage_name} components ---")

        # Save RF model separately (expected to be the largest component)
        rf_file = f'{model_dir}/rf_model.pkl'
        joblib.dump(model_data['rf_model'], rf_file)
        rf_size = os.path.getsize(rf_file) / (1024 * 1024)
        print(f"  ✅ Saved rf_model.pkl ({rf_size:.1f} MB)")

        # Combine smaller components into one file
        small_components = {
            'gb_model': model_data['gb_model'],
            'lr_model': model_data['lr_model'],
            'scaler': model_data['scaler'],
            'selector': model_data['selector'],
            'metadata': {
                'features': model_data['features'],
                'selected_features': model_data['selected_features'],
                'threshold': model_data['threshold'],
                'performance': model_data['performance'],
                'is_calibrated': model_data['is_calibrated']
            }
        }

        components_file = f'{model_dir}/components.pkl'
        joblib.dump(small_components, components_file)
        components_size = os.path.getsize(components_file) / (1024 * 1024)
        print(f"  ✅ Saved components.pkl ({components_size:.1f} MB) - contains gb_model, lr_model, scaler, selector, metadata")

        # Save ensemble configuration (not the models themselves). Values are
        # read from the stored ensemble when one is available so the saved
        # configuration cannot drift from what was trained; the literals are
        # the fallback.
        ensemble = model_data.get('ensemble')
        voting = getattr(ensemble, 'voting', 'soft')
        weights = getattr(ensemble, 'weights', None)
        weights = [0.4, 0.4, 0.2] if weights is None else list(weights)
        estimator_names = [name for name, _ in (getattr(ensemble, 'estimators', None) or [])]
        ensemble_config = {
            'voting': voting,
            'weights': weights,
            'estimator_names': estimator_names or ['rf', 'gb', 'lr']
        }

        ensemble_config_file = f'{model_dir}/ensemble_config.pkl'
        joblib.dump(ensemble_config, ensemble_config_file)
        print(f"  ✅ Saved ensemble_config.pkl (<0.1 MB)")

        # If calibrated, save calibration data separately
        if model_data['is_calibrated']:
            calibrated_model = model_data['model']
            calibration_data = {
                'method': getattr(calibrated_model, 'method', 'isotonic'),
                'cv': getattr(calibrated_model, 'cv', 3),
                # One entry per fitted (estimator, calibrator) pair
                'calibrators': []
            }

            # Each element of calibrated_classifiers_ keeps its fitted
            # calibrators under 'calibrators' (older scikit-learn releases:
            # 'calibrators_'). Looking only for 'calibrator_' never matches
            # and silently stores an empty list.
            for cal_clf in getattr(calibrated_model, 'calibrated_classifiers_', []):
                fold_calibrators = None
                for attr_name in ('calibrators', 'calibrators_', 'calibrator_'):
                    fold_calibrators = getattr(cal_clf, attr_name, None)
                    if fold_calibrators is not None:
                        break
                if fold_calibrators is None:
                    continue
                if not isinstance(fold_calibrators, (list, tuple)):
                    fold_calibrators = [fold_calibrators]
                if not fold_calibrators:
                    continue
                calibration_data['calibrators'].append({
                    # 'calibrator' keeps the original single-object key
                    'calibrator': fold_calibrators[0],
                    'calibrators': list(fold_calibrators)
                })

            if not calibration_data['calibrators']:
                print("  ⚠️ No calibrators could be extracted; calibration.pkl holds configuration only")

            # NOTE(review): only the calibrators are stored, not the per-fold
            # estimators they were fitted against - confirm the loader
            # accounts for that when it rebuilds calibrated predictions.
            calibration_file = f'{model_dir}/calibration.pkl'
            joblib.dump(calibration_data, calibration_file)
            cal_size = os.path.getsize(calibration_file) / (1024 * 1024)
            print(f"  ✅ Saved calibration.pkl ({cal_size:.1f} MB)")

        # Calculate total size
        total_size = rf_size + components_size + 0.1  # 0.1 for ensemble config
        if model_data['is_calibrated']:
            total_size += cal_size

        print(f"  Total size for {model_type}_{stage_name}: {total_size:.1f} MB")

# Persist both model families (viability first, then passage)
for _trained_models, _family in (
    (viability_models, 'viability'),
    (passage_models, 'passage'),
):
    save_model_components_optimized(_trained_models, _family)

# Closing summary
_summary_rule = "=" * 60
_congress_list = sorted(df['congress'].unique())
print("\n" + _summary_rule)
print("MODEL TRAINING COMPLETE!")
print("Models saved in 'models' directory as optimized components")
print("Large RF models saved separately, small components combined")
print(f"Trained on {len(df)} bills from congresses: {_congress_list}")
print(_summary_rule)

TIME-AWARE MODEL TRAINING - 6 CONGRESS DATASET
Loading dataset from: ../data/bills_6congress_training.csv
Loaded 76897 bills with known outcomes
Congresses: [np.int64(113), np.int64(114), np.int64(115), np.int64(116), np.int64(117), np.int64(118)]
Passed bills: 2056 (2.7%)

CREATING VIABILITY TARGET
Viable bills: 12315 (16.0%)

Creating enhanced features...

Verifying feature availability...

Feature counts:
- Base: 17
- Extended: 25
- Progressive: 39

PHASE 1: VIABILITY PREDICTION

TRAINING VIABILITY MODELS

--- Training new_bill model ---
Using 17 features
Positive class rate: 16.0%
Selected top 16 features
Training individual models...
Training ensemble model...
Calibrating probabilities...

Performance (threshold=0.500):
  Accuracy: 0.8875
  ROC-AUC: 0.8856
  Precision: 0.7253
  Recall: 0.4791
  F1 Score: 0.5770
  CV ROC-AUC: 0.8883 (+/- 0.0083)

Top 5 features:
  - subject_count: 0.1601
  - has_bipartisan_support: 0.1328
  - bipartisan_score: 0.1018
  - party_dominance: 0.0946
  -