# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
from typing import List, Dict, Tuple, Union, Optional

# Silence every warning category for the rest of the run
warnings.simplefilter("ignore")

# Make sure the folder that receives the serialized models is present
os.makedirs("models", exist_ok=True)


def load_stacking_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Read base-model predictions and ground truth from CSV and join them per task.

    For each of the four datasets the first prediction file is the left side of
    the join; every further prediction file and finally the ground-truth columns
    are left-merged onto it using the (tic, datacqtr) key. Rows of the first
    file that have no match in a later file therefore keep NaN in the columns
    coming from that file. The shape of each resulting frame is printed.

    Returns:
        Tuple containing:
        - Revenue training data
        - CAR training data
        - Revenue test data
        - CAR test data
    """
    keys = ["tic", "datacqtr"]
    rev_target = "Total Current Operating Revenue"
    car_target = "car5"

    def join_on_keys(frames):
        """Left-merge every frame after the first onto the first, in order."""
        combined = frames[0]
        for other in frames[1:]:
            combined = combined.merge(other, on=keys, how="left")
        return combined

    # Revenue, training split: LightGBM + neural network + truth.
    # TODO: add data/stacking_data/ridge_regression_rev_predict.csv
    #       (before the truth frame) once that model's output is ready.
    df_rev_train = join_on_keys([
        pd.read_csv("data/stacking_data/lightgbm_rev_predict.csv"),
        pd.read_csv("data/stacking_data/neural_network_rev_predict.csv"),
        pd.read_csv("data/train_data_REV_with_text.csv")[keys + [rev_target]],
    ])

    # CAR, training split: only the neural network is available for now.
    # TODO: add data/stacking_data/random_forest_car_predict.csv and
    #       data/stacking_data/lasso_regression_car_predict.csv once ready.
    df_car_train = join_on_keys([
        pd.read_csv("data/stacking_data/neural_network_car_predict.csv"),
        pd.read_csv("data/train_data_CAR5_with_text.csv")[keys + [car_target]],
    ])

    # Revenue, test split.
    # TODO: add data/results/ridge_regression_rev_predict_test.csv once ready.
    df_rev_test = join_on_keys([
        pd.read_csv("data/results/lightgbm_rev_predict_test.csv"),
        pd.read_csv("data/results/neural_network_rev_predict_test.csv"),
        pd.read_csv("data/test_data_REV_with_text.csv")[keys + [rev_target]],
    ])

    # CAR, test split: only the neural network is available for now.
    # TODO: add data/results/random_forest_car_predict_test.csv and
    #       data/results/lasso_regression_car_predict_test.csv once ready.
    df_car_test = join_on_keys([
        pd.read_csv("data/results/neural_network_car_predict_test.csv"),
        pd.read_csv("data/test_data_CAR5_with_text.csv")[keys + [car_target]],
    ])

    # Quick sanity report on what was loaded
    print(f"Revenue training data shape: {df_rev_train.shape}")
    print(f"CAR training data shape: {df_car_train.shape}")
    print(f"Revenue test data shape: {df_rev_test.shape}")
    print(f"CAR test data shape: {df_car_test.shape}")

    return df_rev_train, df_car_train, df_rev_test, df_car_test


def prepare_features_for_stacking(
    df: pd.DataFrame, 
    target_column: str,
    id_columns: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split a merged prediction table into a feature matrix and a target vector.

    Every column that is neither an identifier nor the target is treated as a
    feature, so the caller must make sure the frame holds nothing else (for
    example helper columns added by other steps).

    Args:
        df: Input DataFrame with predictions from base models
        target_column: Name of the target column (actual values)
        id_columns: Identifier columns to exclude from the features;
            defaults to ['tic', 'datacqtr'] when omitted

    Returns:
        X and y for training; both are copies, so modifying them does not
        touch ``df``

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``
    """
    # A None sentinel avoids sharing one mutable default list between calls
    if id_columns is None:
        id_columns = ['tic', 'datacqtr']

    # Fail early with a message that names the problem
    if target_column not in df.columns:
        raise KeyError(f"Target column '{target_column}' not found in DataFrame")

    # Everything that is not an identifier or the target is a feature
    excluded = set(id_columns) | {target_column}
    feature_columns = [col for col in df.columns if col not in excluded]
    
    # Create feature matrix
    X = df[feature_columns].copy()
    
    # Create target vector
    y = df[target_column].copy()
    
    print(f"Features for stacking: {feature_columns}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    
    return X, y


def create_time_based_validation(
    df: pd.DataFrame, 
    time_column: str = 'datacqtr',
    n_splits: int = 3
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Create time-based cross-validation splits based on quarter/year

    The rows are ordered chronologically and handed to ``TimeSeriesSplit``;
    the positions it yields are then translated back to row positions of
    ``df`` *as passed in*. The returned arrays can therefore be used directly
    with ``.iloc`` (or NumPy indexing) on ``df`` and on anything row-aligned
    with it, and every training row is from the same or an earlier quarter
    than every validation row of the same fold.

    ``df`` itself is not modified.

    Note: folds are cut by row count, not by quarter, so the rows of a single
    quarter may end up on both sides of a fold boundary.
    
    Args:
        df: DataFrame to split
        time_column: Column containing time information (datacqtr, format YYYYQN)
        n_splits: Number of splits
        
    Returns:
        List of (train, validation) positional index arrays into ``df``

    Raises:
        ValueError: If a value in ``time_column`` has no four-digit year or no
            'Q<digit>' part
    """
    # Parse year and quarter into local Series so the caller's frame stays untouched
    year = df[time_column].str.extract(r'(\d{4})', expand=False)
    quarter = df[time_column].str.extract(r'Q(\d)', expand=False)

    if year.isna().any() or quarter.isna().any():
        raise ValueError(
            f"Column '{time_column}' contains values that do not match the YYYYQN format"
        )

    # Monotonic numeric key: one step per quarter
    time_id = year.astype(int) * 4 + quarter.astype(int)

    # chronological[k] is the original row position of the k-th oldest row;
    # a stable sort keeps same-quarter rows in their original relative order
    chronological = np.argsort(time_id.to_numpy(), kind="stable")

    # TimeSeriesSplit works on positions in chronological order; map each
    # position back so the indices refer to the caller's (unsorted) rows
    tscv = TimeSeriesSplit(n_splits=n_splits)
    return [
        (chronological[train_pos], chronological[val_pos])
        for train_pos, val_pos in tscv.split(chronological)
    ]


def _cross_validate_stacker(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    cv_splits: List[Tuple[np.ndarray, np.ndarray]],
    label: str,
    scale: bool = False,
) -> Dict[str, float]:
    """
    Fit and score a meta-model on every CV fold, printing fold and average metrics.

    Args:
        model: Estimator exposing fit/predict; it is re-fitted in place per fold
        X: Feature matrix (base-model predictions)
        y: Target values, row-aligned with X
        cv_splits: (train, validation) positional index arrays
        label: Model name used in the printed summary line
        scale: When True, standardize the features with a scaler fitted on the
            fold's training rows only, so validation rows do not influence it

    Returns:
        Average R², MSE and MAE over the folds, keyed 'r2', 'mse' and 'mae'
    """
    fold_scores: Dict[str, List[float]] = {"r2": [], "mse": [], "mae": []}
    n_folds = len(cv_splits)

    for fold, (train_idx, val_idx) in enumerate(cv_splits, start=1):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if scale:
            # Fit the scaler inside the fold to keep validation data unseen
            fold_scaler = StandardScaler().fit(X_train)
            X_train = fold_scaler.transform(X_train)
            X_val = fold_scaler.transform(X_val)

        model.fit(X_train, y_train)
        val_preds = model.predict(X_val)

        r2 = r2_score(y_val, val_preds)
        mse = mean_squared_error(y_val, val_preds)
        mae = np.mean(np.abs(y_val - val_preds))

        fold_scores["r2"].append(r2)
        fold_scores["mse"].append(mse)
        fold_scores["mae"].append(mae)

        print(f"Fold {fold}/{n_folds}: R² = {r2:.4f}, MSE = {mse:.4f}, MAE = {mae:.4f}")

    avg_r2 = np.mean(fold_scores["r2"])
    avg_mse = np.mean(fold_scores["mse"])
    avg_mae = np.mean(fold_scores["mae"])

    print(f"Average metrics for {label} model: R² = {avg_r2:.4f}, MSE = {avg_mse:.4f}, MAE = {avg_mae:.4f}")

    return {"r2": float(avg_r2), "mse": float(avg_mse), "mae": float(avg_mae)}


def stacking_pipeline() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Run the full stacking pipeline with models suggested by the team:
    - Simple linear model (Ridge on standardized features) for revenue
    - More complex model (Random Forest) for CAR
    - Reports R², MSE and MAE on time-based cross-validation folds

    Only columns whose name contains '_predict' are used as features. After
    cross-validation each meta-model is re-fitted on all training rows and
    saved with joblib:
    - 'models/stacking_linear_revenue.pkl' holds a (model, scaler) tuple
    - 'models/stacking_randomforest_car.pkl' holds the Random Forest
    
    Returns:
        Tuple of two DataFrames, each with 'tic' and 'datacqtr' plus:
        - 'revenue_prediction' for the revenue test rows
        - 'car_prediction' for the CAR test rows
    """
    # Load stacking data
    df_rev_train, df_car_train, df_rev_test, df_car_test = load_stacking_data()
    
    # Create time-based CV splits - do this once and reuse
    cv_splits_rev = create_time_based_validation(df_rev_train)
    cv_splits_car = create_time_based_validation(df_car_train)
    
    # ------------------------------------------------------------------
    # Revenue stacking - simple, regularized linear model
    # ------------------------------------------------------------------
    print("\n=== Revenue Stacking ===")
    
    # Only the base-model prediction columns are features
    rev_feature_cols = [col for col in df_rev_train.columns if '_predict' in col]
    print(f"Using only these features for revenue stacking: {rev_feature_cols}")
    
    X_rev = df_rev_train[rev_feature_cols].copy()
    y_rev = df_rev_train['Total Current Operating Revenue']
    X_rev_test = df_rev_test[rev_feature_cols].copy()
    
    print(f"X_rev shape: {X_rev.shape}, y_rev shape: {y_rev.shape}")
    print(f"X_rev_test shape: {X_rev_test.shape}")
    
    print("Training Linear Model (Ridge) for Revenue stacking...")
    linear_model_rev = Ridge(alpha=1.0, random_state=42)

    # Scaling happens per fold inside the helper so CV scores are leak-free
    _cross_validate_stacker(
        linear_model_rev, X_rev, y_rev, cv_splits_rev, label="Revenue", scale=True
    )
    
    # Final model: scaler and Ridge both fitted on all training data
    scaler_rev = StandardScaler()
    X_rev_scaled = scaler_rev.fit_transform(X_rev)
    X_rev_test_scaled = scaler_rev.transform(X_rev_test)
    linear_model_rev.fit(X_rev_scaled, y_rev)
    
    # Save the model together with the scaler it expects
    joblib.dump((linear_model_rev, scaler_rev), 'models/stacking_linear_revenue.pkl')
    
    # Generate revenue predictions for test data
    revenue_results = df_rev_test[['tic', 'datacqtr']].copy()
    revenue_results['revenue_prediction'] = linear_model_rev.predict(X_rev_test_scaled)
    
    # ------------------------------------------------------------------
    # CAR stacking - more flexible tree ensemble
    # ------------------------------------------------------------------
    print("\n=== CAR Stacking ===")
    
    car_feature_cols = [col for col in df_car_train.columns if '_predict' in col]
    print(f"Using only these features for CAR stacking: {car_feature_cols}")
    
    X_car = df_car_train[car_feature_cols].copy()
    y_car = df_car_train['car5']
    X_car_test = df_car_test[car_feature_cols].copy()
    
    print(f"X_car shape: {X_car.shape}, y_car shape: {y_car.shape}")
    print(f"X_car_test shape: {X_car_test.shape}")
    
    print("Training Random Forest for CAR stacking...")
    rf_model_car = RandomForestRegressor(
        n_estimators=100, 
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    )

    # No scaling here: the forest is trained on the raw prediction columns
    _cross_validate_stacker(rf_model_car, X_car, y_car, cv_splits_car, label="CAR")
    
    # Re-fit on all training data
    rf_model_car.fit(X_car, y_car)
    
    # Save the model
    joblib.dump(rf_model_car, 'models/stacking_randomforest_car.pkl')
    
    # Generate CAR predictions for test data
    car_results = df_car_test[['tic', 'datacqtr']].copy()
    car_results['car_prediction'] = rf_model_car.predict(X_car_test)
    
    # Return both DataFrames with predictions
    return revenue_results, car_results


def main():
    """
    Run the stacking pipeline, write its predictions to CSV and print a report.

    Revenue and CAR predictions are always written to their own files. They
    are additionally inner-joined on (tic, datacqtr); the joined table is
    written only when at least one pair matches.

    Returns:
        (revenue predictions, CAR predictions, merged predictions or None)
        on success. If any exception is raised, the error and traceback are
        printed and None is returned instead.
    """
    try:
        rev_df, car_df = stacking_pipeline()

        # Persist each task's predictions separately
        rev_df.to_csv('stacking_revenue_predictions.csv', index=False)
        car_df.to_csv('stacking_car_predictions.csv', index=False)

        print("\nStacking pipeline completed successfully!")
        print("Revenue predictions saved to 'stacking_revenue_predictions.csv'")
        print("CAR predictions saved to 'stacking_car_predictions.csv'")

        # Inner join keeps only the tic/datacqtr pairs present in both results
        print("\nAttempting to merge predictions for common tic/datacqtr pairs...")
        merged_df = rev_df.merge(
            car_df,
            how='inner',
            on=['tic', 'datacqtr'],
            suffixes=('', '_car')
        )
        match_count = len(merged_df)

        if match_count == 0:
            print("No matching tic/datacqtr pairs found between revenue and CAR predictions.")
        else:
            merged_df.to_csv('stacking_merged_predictions.csv', index=False)
            print(f"Successfully merged {match_count} matching predictions saved to 'stacking_merged_predictions.csv'")
            print("\nSample merged predictions:")
            print(merged_df.head(5))

        # Show the first rows of each individual result
        print("\nSample revenue predictions:")
        print(rev_df.head(5))
        print("\nSample CAR predictions:")
        print(car_df.head(5))

        # An empty join is reported to the caller as None
        merged_or_none = merged_df if match_count > 0 else None
        return rev_df, car_df, merged_or_none

    except Exception as err:
        import traceback
        print(f"An error occurred: {err}")
        traceback.print_exc()
        return None


if __name__ == "__main__":
    # Script entry point: run the pipeline only when this file is executed
    # directly, not when it is imported.
    # main() returns (revenue, CAR, merged-or-None) on success, or None if
    # the pipeline raised an exception.
    final_predictions = main()