In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle
from datetime import datetime, timedelta

class ElectionMLModel:
    """Random-forest regressor that predicts an election victory margin.

    The model is trained on the eight indicators listed in ``self.features``
    and predicts ``victory_margin``. Predicted margins are converted to win
    percentages with a logistic curve (``100 * sigmoid(margin / 2)``).
    """

    def __init__(self):
        # Fitted RandomForestRegressor; stays None until train_model() succeeds.
        self.rf_model = None
        # Scaler fitted on the training split and reused for every prediction.
        self.scaler = StandardScaler()
        # Column names (and their order) expected in every input frame.
        self.features = [
            'unemployment_rate', 'gdp_growth', 'presidential_approval',
            'generic_ballot', 'fundraising_difference', 'incumbent_party',
            'days_to_election', 'previous_margin'
        ]

    @staticmethod
    def _margin_to_win_percent(margin):
        """Map a margin (scalar or array) to a win percentage in [0, 100]."""
        return 1 / (1 + np.exp(-margin / 2)) * 100

    def generate_training_data(self, n_samples=1000, random_state=None):
        """
        Generate synthetic historical election data for training.

        Args:
            n_samples: Number of synthetic elections to draw.
            random_state: Optional integer seed. When given, the draw is
                reproducible; when None, numpy's global random state is used
                (the original behaviour, so ``np.random.seed`` still applies).

        Returns:
            DataFrame with one column per feature plus ``victory_margin``.
        """
        # RandomState exposes the same method names as the module-level
        # np.random functions, so both paths issue the identical call sequence.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        data = {
            'unemployment_rate': rng.normal(5.5, 1.5, n_samples),
            'gdp_growth': rng.normal(2.0, 2.0, n_samples),
            'presidential_approval': rng.normal(45, 10, n_samples),
            'generic_ballot': rng.normal(0, 5, n_samples),  # Difference between parties
            'fundraising_difference': rng.normal(0, 20, n_samples),  # In millions
            'incumbent_party': rng.binomial(1, 0.5, n_samples),
            # randint's upper bound is exclusive, so values fall in 1..364.
            'days_to_election': rng.randint(1, 365, n_samples),
            'previous_margin': rng.normal(0, 5, n_samples)
        }

        # Ground truth: a fixed linear combination of the features plus
        # Gaussian noise with standard deviation 2.
        margin = (
            -0.5 * data['unemployment_rate'] +
            2.0 * data['gdp_growth'] +
            0.3 * data['presidential_approval'] +
            0.7 * data['generic_ballot'] +
            0.1 * data['fundraising_difference'] +
            2.0 * data['incumbent_party'] +
            -0.01 * data['days_to_election'] +
            0.3 * data['previous_margin'] +
            rng.normal(0, 2, n_samples)
        )

        data['victory_margin'] = margin
        return pd.DataFrame(data)

    def train_model(self, training_data=None):
        """
        Train the random forest model and report its validation R².

        Args:
            training_data: Optional DataFrame containing every column in
                ``self.features`` plus ``victory_margin``. When None, synthetic
                data from ``generate_training_data()`` is used.

        Returns:
            The R² score on the held-out 20% validation split.
        """
        df = self.generate_training_data() if training_data is None else training_data

        X = df[self.features]
        y = df['victory_margin']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only so validation rows do not
        # leak into the scaling statistics.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_val_scaled = self.scaler.transform(X_val)

        forest = RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        forest.fit(X_train_scaled, y_train)
        # Publish the model only after a successful fit; otherwise a failed fit
        # would leave a non-None, unfitted model that skips lazy training.
        self.rf_model = forest

        val_score = self.rf_model.score(X_val_scaled, y_val)
        print(f"Model R² score: {val_score:.3f}")

        return val_score

    def _tree_predictions(self, current_data):
        """Return every tree's margin prediction for one observation.

        Trains the model first if it has not been trained yet.

        Raises:
            ValueError: If ``current_data`` does not describe exactly one
                observation of ``self.features``.
        """
        if self.rf_model is None:
            self.train_model()

        row = current_data[self.features].values.reshape(1, -1)
        if row.shape[1] != len(self.features):
            raise ValueError(
                "current_data must contain exactly one observation with "
                f"{len(self.features)} feature values, got {row.size} values"
            )

        # The scaler is fitted on a DataFrame; passing the names back avoids
        # scikit-learn's "X does not have valid feature names" warning.
        frame = pd.DataFrame(row, columns=self.features)
        scaled = self.scaler.transform(frame)

        return np.array(
            [tree.predict(scaled)[0] for tree in self.rf_model.estimators_]
        )

    def predict_probability(self, current_data):
        """
        Predict win probability based on current conditions.

        Args:
            current_data: Single-row DataFrame (or Series) holding every
                column in ``self.features``.

        Returns:
            Dict with ``win_probability`` (percent), ``confidence_interval``
            (the 2.5th and 97.5th percentiles of the per-tree predictions,
            converted to percent) and ``predicted_margin`` (mean over trees).

        Raises:
            ValueError: If ``current_data`` is not exactly one observation.
        """
        predictions = self._tree_predictions(current_data)

        mean_prediction = np.mean(predictions)
        # Spread across the individual trees, not a calibrated interval.
        margin_bounds = np.percentile(predictions, [2.5, 97.5])

        return {
            'win_probability': self._margin_to_win_percent(mean_prediction),
            'confidence_interval': [
                self._margin_to_win_percent(bound) for bound in margin_bounds
            ],
            'predicted_margin': mean_prediction
        }

    def simulate_with_uncertainty(self, current_data, n_simulations=1000):
        """
        Run Monte Carlo simulation incorporating model uncertainty.

        Each simulation averages a random 80% sample (with replacement) of the
        trees' predictions and adds standard-normal noise to the margin.

        Args:
            current_data: Single-row DataFrame (or Series) holding every
                column in ``self.features``.
            n_simulations: Number of simulated outcomes to draw.

        Returns:
            DataFrame with ``predicted_margin`` and ``win_probability``
            (percent) columns, one row per simulation.

        Raises:
            ValueError: If ``current_data`` is not exactly one observation.
        """
        # The input row is the same in every simulation, so each tree is
        # evaluated once here instead of once per sampled index in the loop.
        tree_predictions = self._tree_predictions(current_data)
        n_trees = len(tree_predictions)
        subset_size = max(1, int(n_trees * 0.8))

        results = []
        for _ in range(n_simulations):
            # Randomly select trees and average their predictions
            tree_indices = np.random.choice(n_trees, size=subset_size, replace=True)
            mean_pred = np.mean(tree_predictions[tree_indices])

            final_pred = mean_pred + np.random.normal(0, 1)

            results.append({
                'predicted_margin': final_pred,
                'win_probability': self._margin_to_win_percent(final_pred)
            })

        # Explicit columns keep the schema stable when n_simulations is 0.
        return pd.DataFrame(results, columns=['predicted_margin', 'win_probability'])

def get_current_conditions(election_date=datetime(2024, 11, 5), now=None):
    """
    Generate current economic and political conditions.

    In practice, this would pull from real APIs and data sources; every value
    except ``days_to_election`` is a fixed placeholder.

    Args:
        election_date: Naive datetime of election day.
        now: Naive datetime treated as "today"; defaults to ``datetime.now()``
            at call time.

    Returns:
        Single-row DataFrame with one column per model feature.
    """
    if now is None:
        now = datetime.now()

    # Once the election date has passed the raw difference goes negative;
    # clamp at 0 so the feature never counts days "after" the election.
    days_to_election = max((election_date - now).days, 0)

    return pd.DataFrame({
        'unemployment_rate': [5.2],
        'gdp_growth': [2.1],
        'presidential_approval': [43],
        'generic_ballot': [1.5],
        'fundraising_difference': [5.2],
        'incumbent_party': [1],
        'days_to_election': [days_to_election],
        'previous_margin': [4.4]
    })


# Build a model, fit it on freshly generated synthetic data, and persist the
# fitted instance next to the notebook.
model = ElectionMLModel()
model.train_model()

# NOTE: pickle stores the class by reference (module path + name), so loading
# this file requires ElectionMLModel to be importable under the same module.
with open('election_model.pkl', 'wb') as f:
    pickle.Pickler(f).dump(model)

Model R² score: 0.832
