# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, cohen_kappa_score, balanced_accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import time
import psutil
import warnings
import shap

# Suppress every warning for the rest of the session; this also hides
# deprecation and convergence notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Mount Google Drive
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm this file is never executed elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Define data paths
# load_country_data reads `<country>_macro.csv` from macro_folder and
# `<country>_stock.csv` from financial_folder.
macro_folder = '/content/drive/MyDrive/processed_data'
financial_folder = '/content/drive/MyDrive/financial_info'

# 1. DATA LOADING
def load_country_data(country_name):
    """Load the stock-price and macro CSV files for one country.

    Files are looked up as ``<financial_folder>/<country>_stock.csv`` and
    ``<macro_folder>/<country>_macro.csv`` using the lower-cased country name.

    Args:
        country_name: Country identifier used to build both file names.

    Returns:
        ``(stock_df, macro_df)`` with lower-cased column names, or
        ``(None, None)`` when reading or parsing either file fails (the error
        is printed, not raised).
    """
    country_lower = country_name.lower()
    stock_file = f'{financial_folder}/{country_lower}_stock.csv'
    macro_file = f'{macro_folder}/{country_lower}_macro.csv'

    try:
        # skiprows=[1] drops the line that directly follows the header row.
        stock_df = pd.read_csv(stock_file, parse_dates=['Date'], skiprows=[1])
        # Lower-casing already maps 'Date' to 'date', so no separate rename
        # step is needed afterwards.
        stock_df.columns = [col.lower() for col in stock_df.columns]
        # Non-numeric close values become NaN instead of raising.
        stock_df['close'] = pd.to_numeric(stock_df['close'], errors='coerce')

        macro_df = pd.read_csv(macro_file, parse_dates=['date'])
        macro_df.columns = [col.lower() for col in macro_df.columns]

        print(f"Loaded {country_name} data: {stock_df.shape[0]} stock rows, {macro_df.shape[0]} macro rows")
        return stock_df, macro_df
    except Exception as e:
        # Best-effort loader: the caller skips a country whose files are bad.
        print(f"Error loading data for {country_name}: {str(e)}")
        return None, None

# 2. PREPROCESSING
def preprocess_country_data(stock_df, macro_df, country_name, include_macro=True):
    """Build the per-country modelling frame from raw stock and macro data.

    Args:
        stock_df: Frame with at least ``date`` and ``close`` columns.
        macro_df: Frame with a ``date`` column plus macro indicator columns.
            Only read when ``include_macro`` is true.
        country_name: Value written to the ``country`` column of the result.
        include_macro: When true, the most recent macro observation at or
            before each stock date is attached and a ``<col>_change`` column
            is added for every macro indicator.

    Returns:
        A new DataFrame ordered by ``date`` with a fresh 0..n-1 index; the
        inputs are not modified.
    """
    # pct_change / rolling work on row offsets, so the rows must be in
    # chronological order before any feature is derived. sort_values returns
    # a new frame, which also keeps the caller's data untouched.
    stock_df = stock_df.sort_values('date').reset_index(drop=True)
    macro_df = macro_df.copy()

    # Calculate returns and volatility features (63 / 126 row windows)
    stock_df['return_1q'] = stock_df['close'].pct_change(63)
    stock_df['return_2q'] = stock_df['close'].pct_change(126)
    stock_df['volatility_2q'] = stock_df['return_1q'].rolling(126).std()

    # Add monthly metrics as fallback (21 row windows)
    stock_df['return_1m'] = stock_df['close'].pct_change(21)
    stock_df['return_3m'] = stock_df['close'].pct_change(63)
    stock_df['volatility'] = stock_df['return_1m'].rolling(21).std()

    # Technical indicators
    stock_df['ma_ratio'] = stock_df['close'] / stock_df['close'].rolling(20).mean()
    stock_df['momentum'] = stock_df['return_1m'].rolling(3).sum()

    processed_df = stock_df.copy()

    if include_macro:
        # Columns ending in _x / _y are leftovers of an earlier merge; skip them.
        macro_cols_to_keep = [col for col in macro_df.columns
                              if not col.endswith(('_x', '_y')) and col != 'date']
        print(f"Available macro columns for {country_name}: {macro_cols_to_keep}")

        # Explicit copy so the column assignments below write to an
        # independent frame rather than a slice.
        macro_df = macro_df[['date'] + macro_cols_to_keep].copy()
        for col in macro_cols_to_keep:
            macro_df[col] = pd.to_numeric(macro_df[col], errors='coerce')

        # Fill gaps in the macro data forwards, then backwards.
        # NOTE(review): the backward fill copies later observations into
        # earlier dates — confirm that look-ahead is acceptable.
        macro_df = macro_df.sort_values('date').set_index('date')
        macro_df = macro_df.ffill().bfill()
        macro_df = macro_df.reset_index()

        # Both frames are already date-ordered, as merge_asof requires.
        processed_df = pd.merge_asof(stock_df, macro_df, on='date')

        # Row-over-row percentage change of each macro column.
        # NOTE(review): computed on the merged (stock-frequency) rows, so it is
        # zero wherever consecutive rows carry the same macro observation —
        # confirm this is the intended feature.
        for col in macro_cols_to_keep:
            if col in processed_df.columns:
                change = processed_df[col].pct_change(1)
                # A previous value of 0 yields +/-inf, which fillna() would
                # leave in place; treat it like an undefined change.
                change = change.replace([np.inf, -np.inf], np.nan)
                processed_df[f'{col}_change'] = change.fillna(0)

    # Fill missing values: changes default to 0, levels are carried
    # forwards and then backwards.
    for col in processed_df.columns:
        if col not in ['date', 'country']:
            if col.endswith('_change'):
                processed_df[col] = processed_df[col].fillna(0)
            else:
                processed_df[col] = processed_df[col].ffill().bfill()

    processed_df['country'] = country_name
    print(f"Successfully processed {country_name} data: {processed_df.shape[0]} rows, {processed_df.shape[1]} columns")
    return processed_df

# 3. REGIME CLASSIFICATION
def define_market_regime(df, method='adaptive'):
    """Label every row with a market regime (0: Bear, 1: Neutral, 2: Bull).

    Quarterly return/volatility columns are used when more than half of their
    values are present; otherwise the monthly columns are used. Rows that
    match none of the rules (including rows with missing values) are Neutral.

    Args:
        df: Frame holding the return and volatility columns.
        method: ``'adaptive'`` uses the 30%/70% return quantiles, widened to at
            least -0.02 / 0.03; any other value uses the plain 35%/65%
            quantiles. Both use the 65% volatility quantile.

    Returns:
        A copy of ``df`` with an added ``regime`` column.
    """
    labelled = df.copy()
    n_rows = len(labelled)

    # Pick the metric pair to classify on.
    if (labelled['return_1q'].notna().sum() > n_rows * 0.5 and
            labelled['volatility_2q'].notna().sum() > n_rows * 0.5):
        print("Using quarterly metrics for regime definition")
        return_col, vol_col = 'return_1q', 'volatility_2q'
    else:
        print("Quarterly metrics not available, using monthly metrics")
        return_col, vol_col = 'return_1m', 'volatility'

    # Thresholds come only from rows where both metrics are known.
    complete = labelled[labelled[return_col].notna() & labelled[vol_col].notna()]
    if method == 'adaptive':
        upper = max(complete[return_col].quantile(0.7), 0.03)
        lower = min(complete[return_col].quantile(0.3), -0.02)
    else:
        upper = complete[return_col].quantile(0.65)
        lower = complete[return_col].quantile(0.35)
    vol_cut = complete[vol_col].quantile(0.65)

    returns = labelled[return_col]
    is_bear = (returns < lower) & (labelled[vol_col] > vol_cut)
    is_neutral = (returns >= lower) & (returns <= upper)
    is_bull = returns > upper
    # First matching rule wins; a weak return without high volatility matches
    # nothing and therefore takes the Neutral default.
    labelled['regime'] = np.select([is_bear, is_neutral, is_bull], [0, 1, 2], default=1)

    counts = labelled['regime'].value_counts()
    print(f"Regime distribution: {dict(counts)}")

    if len(counts) < 3:
        print("WARNING: Not all regimes are present in the data!")
        missing_regimes = {0, 1, 2} - set(counts.index)
        print(f"Missing regimes: {missing_regimes}")

        # Only a missing Bear class is repaired: the lowest returns
        # (10% of the rows, at least 5) are relabelled as Bear.
        if 0 not in counts.index:
            n_forced = max(5, int(n_rows * 0.1))
            known_returns = labelled[labelled[return_col].notna()]
            worst_returns = known_returns.nsmallest(n_forced, return_col).index
            labelled.loc[worst_returns, 'regime'] = 0
            print(f"Forced {len(worst_returns)} data points to be bear regime for balanced classification")

    counts = labelled['regime'].value_counts()
    print(f"Final regime distribution: {dict(counts)}")
    return labelled

# 4. FEATURE PREPARATION
def prepare_features(df, include_macro=True):
    """Select model features and return them with the regime target.

    Args:
        df: Frame with a ``date`` column, a ``regime`` column and the
            candidate feature columns.
        include_macro: When true, known macro columns that are present and
            more than 30% populated are added to the price-based features.

    Returns:
        ``(X, y, feature_names)`` where ``X`` and ``y`` follow ``df`` ordered
        by ``date`` and ``feature_names`` lists the columns of ``X``.
    """
    n_rows = len(df)

    # Quarterly price features when return_1q is more than half populated,
    # monthly ones otherwise.
    quarterly_ok = 'return_1q' in df.columns and df['return_1q'].notna().sum() > n_rows * 0.5
    if quarterly_ok:
        candidates = ['return_1q', 'return_2q', 'volatility_2q', 'ma_ratio', 'momentum']
    else:
        candidates = ['return_1m', 'return_3m', 'volatility', 'ma_ratio', 'momentum']

    if include_macro:
        known_macro = ['gdp', 'cpi', 'exchange_rate', 'imports', 'exports',
                       'gdp_change', 'cpi_change', 'imports_change', 'exports_change',
                       'interest_rate', 'interest_rate_change']
        candidates = candidates + [name for name in known_macro
                                   if name in df.columns and df[name].notna().sum() > n_rows * 0.3]

    available_features = [name for name in candidates if name in df.columns]

    # Drop features that hold no value at all (zero-variance columns are kept).
    empty_columns = [name for name in available_features if df[name].count() == 0]
    if empty_columns:
        print(f"Removing completely empty features: {empty_columns}")
        available_features = [name for name in available_features if name not in empty_columns]

    # Gaps are carried forward in date order; whatever is still missing
    # becomes 0 for change columns and the column median for the rest.
    ordered = df.copy().sort_values('date')
    for name in available_features:
        filled = ordered[name].ffill()
        if filled.isna().any():
            fallback = 0 if name.endswith('_change') else filled.median()
            filled = filled.fillna(fallback)
        ordered[name] = filled

    return ordered[available_features].copy(), ordered['regime'].copy(), available_features

# 5. MODEL EVALUATION METRICS
def calculate_model_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with ``accuracy``, ``balanced_accuracy``, support-weighted
        ``f1_score`` / ``precision`` / ``recall``, Cohen's ``kappa`` and the
        ``confusion_matrix``.
    """
    labels = (y_true, y_pred)
    report = {
        'accuracy': accuracy_score(*labels),
        'balanced_accuracy': balanced_accuracy_score(*labels),
    }
    # The three per-class scores are averaged weighted by class support.
    for key, scorer in (('f1_score', f1_score),
                        ('precision', precision_score),
                        ('recall', recall_score)):
        report[key] = scorer(*labels, average='weighted')
    report['kappa'] = cohen_kappa_score(*labels)
    report['confusion_matrix'] = confusion_matrix(*labels)
    return report

# 6. TRADITIONAL MODEL TRAINING
def train_traditional_models(X_train, X_test, y_train, y_test, feature_names):
    """Fit RandomForest, SVC and XGBoost classifiers and score each one.

    Features are standardised with a scaler fitted on the training split only.
    A model that raises while fitting or scoring is reported on stdout and left
    out of the result, so one failure does not abort the remaining models.

    Args:
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Labels matching the two splits.
        feature_names: Names paired positionally with ``feature_importances_``
            for the tree-based models.

    Returns:
        dict keyed by model name. Each entry holds the fitted ``model``,
        ``train_metrics`` / ``test_metrics`` (see calculate_model_metrics),
        the test ``y_pred`` and ``y_true``, ``training_time`` in seconds and
        ``memory_usage``; tree-based models also get ``feature_importance``.
    """
    # Standardize features (test split reuses the training statistics)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize models
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
        'SVC': SVC(probability=True, class_weight='balanced', random_state=42),
        'XGBoost': XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42,
                                scale_pos_weight=1, use_label_encoder=False, eval_metric='mlogloss')
    }

    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model_start_time = time.time()
        # NOTE(review): virtual_memory() reports machine-wide usage, so the
        # delta below also reflects other processes — confirm that is wanted.
        mem_before = psutil.virtual_memory().percent

        try:
            # Train model
            model.fit(X_train_scaled, y_train)

            # Make predictions
            y_train_pred = model.predict(X_train_scaled)
            y_test_pred = model.predict(X_test_scaled)

            # Calculate metrics
            train_metrics = calculate_model_metrics(y_train, y_train_pred)
            test_metrics = calculate_model_metrics(y_test, y_test_pred)

            # Elapsed time covers fitting, prediction and scoring.
            training_time = time.time() - model_start_time
            mem_after = psutil.virtual_memory().percent
            mem_usage = mem_after - mem_before

            # Store results
            results[name] = {
                'model': model,
                'train_metrics': train_metrics,
                'test_metrics': test_metrics,
                'y_pred': y_test_pred,
                'y_true': y_test,
                'training_time': training_time,
                'memory_usage': mem_usage
            }

            # Get feature importance for tree-based models
            if name in ['RandomForest', 'XGBoost'] and hasattr(model, 'feature_importances_'):
                results[name]['feature_importance'] = dict(zip(feature_names, model.feature_importances_))

            print(f"{name} - Train Accuracy: {train_metrics['accuracy']:.4f}, Test Accuracy: {test_metrics['accuracy']:.4f}")
            print(f"{name} - Train Balanced Acc: {train_metrics['balanced_accuracy']:.4f}, Test Balanced Acc: {test_metrics['balanced_accuracy']:.4f}")
            print(f"{name} - Training Time: {training_time:.2f}s, Memory Usage: {mem_usage:.2f}%")

        except Exception as e:
            # Deliberate best-effort: report and continue with the next model.
            print(f"Error training {name}: {str(e)}")

    return results

# 7. DEEP LEARNING MODEL TRAINING
def create_sequences(data, target, seq_length=10):
    """Slice ``data`` into overlapping windows for sequence models.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``data`` and is
    paired with ``target[i + seq_length]``, the entry right after the window.

    Args:
        data: Indexable, sliceable sequence of feature rows.
        target: Indexable sequence of labels aligned with ``data``.
        seq_length: Number of rows per window.

    Returns:
        ``(X, y)`` as NumPy arrays; both are empty when ``data`` has no more
        than ``seq_length`` rows.
    """
    n_windows = len(data) - seq_length
    starts = range(n_windows)
    windows = [data[start:start + seq_length] for start in starts]
    labels = [target[start + seq_length] for start in starts]
    return np.array(windows), np.array(labels)

def train_deep_learning_models(X_train, X_test, y_train, y_test, feature_names):
    """Fit a two-layer LSTM and a two-layer GRU classifier and score each one.

    Features are standardised with a scaler fitted on the training split, then
    cut into sliding windows with create_sequences. Because every window is
    labelled with the row that follows it, the scored test labels are shorter
    than ``y_test`` by the window length.

    Args:
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Integer class labels exposing ``.values``.
        feature_names: Accepted for signature parity with
            train_traditional_models; not used here.

    Returns:
        dict keyed by model name with the fitted ``model``, ``train_metrics`` /
        ``test_metrics``, the Keras ``history`` dict, test ``y_pred`` /
        ``y_true``, ``training_time`` in seconds and ``memory_usage``. Empty
        when there are fewer than 10 training or 5 test windows. A model that
        raises during training is reported on stdout and left out.
    """
    # Standardize features (test split reuses the training statistics)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Window length: 10 rows, shortened for small training sets.
    seq_length = min(10, X_train_scaled.shape[0] // 5)
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values, seq_length)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, seq_length)

    # Check if we have enough data for sequences
    if len(X_train_seq) < 10 or len(X_test_seq) < 5:
        print("WARNING: Not enough data for sequence models. Skipping deep learning models.")
        return {}

    # One-hot width must cover the largest label value. Counting distinct
    # labels is too small when a label value is absent (e.g. only 0 and 2
    # occur), which makes to_categorical fail.
    num_classes = int(np.max(np.concatenate([y_train, y_test]))) + 1
    y_train_onehot = tf.keras.utils.to_categorical(y_train_seq, num_classes=num_classes)
    y_test_onehot = tf.keras.utils.to_categorical(y_test_seq, num_classes=num_classes)

    # Define models
    input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])
    output_shape = y_train_onehot.shape[1]

    models = {
        'LSTM': Sequential([
            LSTM(64, input_shape=input_shape, return_sequences=True),
            Dropout(0.3),
            LSTM(32),
            Dropout(0.3),
            Dense(output_shape, activation='softmax')
        ]),
        'GRU': Sequential([
            GRU(64, input_shape=input_shape, return_sequences=True),
            Dropout(0.3),
            GRU(32),
            Dropout(0.3),
            Dense(output_shape, activation='softmax')
        ])
    }

    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model_start_time = time.time()
        # NOTE(review): virtual_memory() reports machine-wide usage, so the
        # delta below also reflects other processes — confirm that is wanted.
        mem_before = psutil.virtual_memory().percent

        try:
            # Compile model
            model.compile(optimizer=Adam(learning_rate=0.001),
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])

            # Stop after 10 epochs without val_loss improvement and roll back
            # to the best weights seen.
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            # Batch size: at most 32, at least 1, roughly a quarter of the
            # training windows for small sets.
            batch_size = max(1, min(32, X_train_seq.shape[0] // 4))

            history = model.fit(X_train_seq, y_train_onehot,
                              epochs=50,
                              batch_size=batch_size,
                              validation_split=0.2,
                              callbacks=[early_stopping],
                              verbose=1)

            # Make predictions
            y_train_pred_prob = model.predict(X_train_seq)
            y_test_pred_prob = model.predict(X_test_seq)

            y_train_pred = np.argmax(y_train_pred_prob, axis=1)
            y_test_pred = np.argmax(y_test_pred_prob, axis=1)

            y_train_true = np.argmax(y_train_onehot, axis=1)
            y_test_true = np.argmax(y_test_onehot, axis=1)

            # Calculate metrics
            train_metrics = calculate_model_metrics(y_train_true, y_train_pred)
            test_metrics = calculate_model_metrics(y_test_true, y_test_pred)

            # Elapsed time covers compiling, fitting, prediction and scoring.
            training_time = time.time() - model_start_time
            mem_after = psutil.virtual_memory().percent
            mem_usage = mem_after - mem_before

            # Store results
            results[name] = {
                'model': model,
                'train_metrics': train_metrics,
                'test_metrics': test_metrics,
                'history': history.history,
                'y_pred': y_test_pred,
                'y_true': y_test_true,
                'training_time': training_time,
                'memory_usage': mem_usage
            }

            print(f"{name} - Train Accuracy: {train_metrics['accuracy']:.4f}, Test Accuracy: {test_metrics['accuracy']:.4f}")
            print(f"{name} - Train Balanced Acc: {train_metrics['balanced_accuracy']:.4f}, Test Balanced Acc: {test_metrics['balanced_accuracy']:.4f}")
            print(f"{name} - Training Time: {training_time:.2f}s, Memory Usage: {mem_usage:.2f}%")

        except Exception as e:
            # Deliberate best-effort: report and continue with the next model.
            print(f"Error training {name}: {str(e)}")

    return results

# 8. PORTFOLIO PERFORMANCE SIMULATION
def simulate_portfolio(df, predictions, start_date, end_date):
    """Replay buy-and-hold and regime-driven strategies over a date window.

    Each model holds -1x (regime 0), 0x (regime 1) or +1x (any other regime)
    exposure to the daily close-to-close return; every change of exposure
    after the first position costs 0.1% of portfolio value. The prediction at
    position ``k`` is applied to the return of row ``k + 1`` in the window.

    Args:
        df: Frame with ``date`` and ``close`` columns.
        predictions: Mapping of model name to an indexable sequence of regimes.
        start_date, end_date: Inclusive bounds of the simulation window.

    Returns:
        dict mapping ``'Buy and Hold'`` and every model name to a DataFrame
        with ``date`` and ``cumulative_return`` columns (empty when the
        strategy produced no rows).
    """
    in_window = (df['date'] >= start_date) & (df['date'] <= end_date)
    sim_df = df[in_window].copy().sort_values('date')
    sim_df['daily_return'] = sim_df['close'].pct_change()

    fee = 0.001  # 0.1% per transaction

    tracks = {'Buy and Hold': []}
    tracks.update((name, []) for name in predictions)

    bh_value = 1.0
    model_value = dict.fromkeys(predictions, 1.0)
    held_exposure = dict.fromkeys(predictions)  # None until a first position is taken

    # The first row has no return, so the replay starts at the second one.
    dates = sim_df['date'].iloc[1:]
    day_returns = sim_df['daily_return'].iloc[1:]

    for step, (when, day_return) in enumerate(zip(dates, day_returns), start=1):
        has_return = not np.isnan(day_return)

        if has_return:
            bh_value *= (1 + day_return)
            tracks['Buy and Hold'].append({'date': when, 'cumulative_return': bh_value})

        for name, model_preds in predictions.items():
            # Models with fewer predictions than rows stop trading early.
            if step - 1 >= len(model_preds):
                continue

            regime = model_preds[step - 1]
            if regime == 0:
                exposure = -1.0
            elif regime == 1:
                exposure = 0.0
            else:
                exposure = 1.0

            previous = held_exposure[name]
            if previous is not None and previous != exposure:
                model_value[name] *= (1 - fee)
            held_exposure[name] = exposure

            if has_return:
                model_value[name] *= (1 + day_return * exposure)
                tracks[name].append({'date': when, 'cumulative_return': model_value[name]})

    # Convert to DataFrames
    return {
        strategy: (pd.DataFrame(rows) if rows
                   else pd.DataFrame(columns=['date', 'cumulative_return']))
        for strategy, rows in tracks.items()
    }

# 9. PERFORMANCE EVALUATION
def evaluate_portfolio_performance(portfolio_returns):
    """Summarise each strategy's cumulative-return series.

    Args:
        portfolio_returns: Mapping of strategy name to a DataFrame with a
            ``cumulative_return`` column. Empty frames are skipped. A
            ``daily_return`` column is written onto every non-empty frame.

    Returns:
        DataFrame indexed by strategy with ``Total Return``, ``Annual Return``,
        ``Volatility``, ``Sharpe Ratio``, ``Max Drawdown`` and ``Win Rate``.
    """
    performance = {}

    for strategy, returns_df in portfolio_returns.items():
        if returns_df.empty:
            continue

        cumulative = returns_df['cumulative_return']

        # Kept on the input frame so existing callers still find the column.
        returns_df['daily_return'] = cumulative.pct_change()
        # The first row has no previous value, hence no return.
        realised = returns_df['daily_return'].dropna()

        total_return = cumulative.iloc[-1] / cumulative.iloc[0] - 1

        # total_return spans the steps between the first and last row, i.e.
        # one fewer than the number of rows; annualise over that many periods.
        n_periods = len(returns_df) - 1
        if n_periods > 0:
            annual_return = (1 + total_return) ** (252 / n_periods) - 1
        else:
            annual_return = 0.0

        volatility = returns_df['daily_return'].std() * np.sqrt(252)
        # A NaN volatility (fewer than two returns) also yields 0 here.
        sharpe_ratio = annual_return / volatility if volatility > 0 else 0

        # Largest peak-to-trough decline of the cumulative series
        running_max = cumulative.cummax()
        max_drawdown = (cumulative / running_max - 1).min()

        # Share of realised returns that were positive; the undefined first
        # return is not counted as a loss.
        win_rate = (realised > 0).mean() if len(realised) else 0.0

        performance[strategy] = {
            'Total Return': total_return,
            'Annual Return': annual_return,
            'Volatility': volatility,
            'Sharpe Ratio': sharpe_ratio,
            'Max Drawdown': max_drawdown,
            'Win Rate': win_rate
        }

    return pd.DataFrame(performance).T

# 10. CROSS-COUNTRY COMPARISON
def analyze_macro_value_added(simulation_results):
    """Analyze the value added by macroeconomic factors across countries"""
    macro_value_analysis = {}

    for country, results in simulation_results.items():
        perf_metrics = results['performance_metrics']

        for model_type in ['RandomForest', 'SVC', 'XGBoost', 'LSTM', 'GRU']:
            with_macro = f"{model_type}_with_macro"
            without_macro = f"{model_type}_without_macro"

            if with_macro in perf_metrics.index and without_macro in perf_metrics.index:
                sharpe_diff = perf_metrics.loc[with_macro, 'Sharpe Ratio'] - perf_metrics.loc[without_macro, 'Sharpe Ratio']
                return_diff = perf_metrics.loc[with_macro, 'Annual Return'] - perf_metrics.loc[without_macro, 'Annual Return']
                drawdown_diff = perf_metrics.loc[with_macro, 'Max Drawdown'] - perf_metrics.loc[without_macro, 'Max Drawdown']

                key = f"{country}_{model_type}"
                macro_value_analysis[key] = {
                    'Country': country,
                    'Model': model_type,
                    'Sharpe Ratio Improvement': sharpe_diff,
                    'Return Improvement': return_diff,
                    'Drawdown Improvement': drawdown_diff,
                    'Value Added': 'Positive' if sharpe_diff > 0 else 'Negative'
                }

    return pd.DataFrame(macro_value_analysis).T

# 11. REGIME TRANSITION ANALYSIS
def analyze_regime_transitions(simulation_results):
    """Compare average returns just before and just after each regime change.

    Args:
        simulation_results: Mapping of country to a dict whose
            ``'data_with_macro'`` entry is a DataFrame with a ``regime`` column
            and ``return_1q`` and/or ``return_1m``. A boolean ``regime_shift``
            column is written onto that frame.

    Returns:
        dict mapping country to ``Number of Transitions``,
        ``Avg Pre-Transition Return``, ``Avg Post-Transition Return`` and
        ``Return Differential`` (post minus pre). The averages are NaN when no
        transition had usable 5-row windows on both sides.
    """
    window = 5
    transition_analysis = {}

    for country, results in simulation_results.items():
        data = results['data_with_macro']

        # A row is a transition when its regime differs from the previous row.
        data['regime_shift'] = data['regime'].diff().abs() > 0

        # Row positions, not index labels: the windows below are cut with
        # iloc, so labels would select the wrong rows on any frame whose index
        # is not 0..n-1.
        transition_positions = np.flatnonzero(data['regime_shift'].to_numpy())

        # Use quarterly returns if available, otherwise monthly. The choice
        # does not depend on the transition, so it is made once per country.
        if 'return_1q' in data.columns and data['return_1q'].notna().sum() > len(data) * 0.5:
            return_col = 'return_1q'
        else:
            return_col = 'return_1m'

        pre_transition_returns = []
        post_transition_returns = []

        for pos in transition_positions:
            # Skip transitions too close to either end of the data.
            if pos > window and pos < len(data) - window:
                pre_return = data.iloc[pos - window:pos][return_col].mean()
                post_return = data.iloc[pos:pos + window][return_col].mean()

                if not np.isnan(pre_return) and not np.isnan(post_return):
                    pre_transition_returns.append(pre_return)
                    post_transition_returns.append(post_return)

        # Calculate statistics
        if pre_transition_returns and post_transition_returns:
            avg_pre = np.mean(pre_transition_returns)
            avg_post = np.mean(post_transition_returns)
            transition_analysis[country] = {
                'Number of Transitions': len(transition_positions),
                'Avg Pre-Transition Return': avg_pre,
                'Avg Post-Transition Return': avg_post,
                'Return Differential': avg_post - avg_pre
            }
        else:
            transition_analysis[country] = {
                'Number of Transitions': len(transition_positions),
                'Avg Pre-Transition Return': np.nan,
                'Avg Post-Transition Return': np.nan,
                'Return Differential': np.nan
            }

    return transition_analysis

# 12. MAIN ANALYSIS FUNCTION
def run_market_regime_analysis(countries=('brazil', 'india', 'south_africa')):
    """Run the market regime analysis for several countries and compare them.

    For each country the pipeline loads the stock and macro data, preprocesses
    it with and without macroeconomic factors, labels market regimes, trains
    the traditional and deep learning models on both feature sets, simulates
    the portfolio and prints the resulting metrics. An exception raised while
    processing one country is reported and the loop continues with the next.
    After the loop, cross-country comparisons are printed for every country
    that completed.

    Args:
        countries: Iterable of country names handed to ``load_country_data``.
            The default is a tuple (not a list) so the shared default object
            cannot be mutated between calls.

    Returns:
        dict: Maps each successfully analysed country to a dict with the keys
        'model_results', 'portfolio_returns', 'performance_metrics',
        'features_with_macro', 'features_without_macro', 'data_with_macro'
        and 'data_without_macro'. Skipped or failed countries are absent.
    """
    simulation_results = {}

    for country in countries:
        print(f"\n{'='*50}")
        print(f"ANALYZING {country.upper()}")
        print(f"{'='*50}")

        try:
            # Load and process data
            stock_df, macro_df = load_country_data(country)
            if stock_df is None or macro_df is None:
                print(f"Skipping {country} due to data loading errors")
                continue

            # Process data with and without macro factors
            data_with_macro = preprocess_country_data(stock_df, macro_df, country, include_macro=True)
            data_without_macro = preprocess_country_data(stock_df, macro_df, country, include_macro=False)

            # Define market regimes
            data_with_macro = define_market_regime(data_with_macro, method='adaptive')
            data_without_macro = define_market_regime(data_without_macro, method='adaptive')

            # Prepare features
            X_with_macro, y_with_macro, features_with_macro = prepare_features(data_with_macro, include_macro=True)
            X_without_macro, y_without_macro, features_without_macro = prepare_features(data_without_macro, include_macro=False)

            print(f"Features with macro: {features_with_macro}")
            print(f"Features without macro: {features_without_macro}")

            # Split data (chronological where possible, stratified otherwise)
            X_train_with_macro, X_test_with_macro, y_train_with_macro, y_test_with_macro = _stratified_time_split(
                X_with_macro, y_with_macro)

            X_train_without_macro, X_test_without_macro, y_train_without_macro, y_test_without_macro = _stratified_time_split(
                X_without_macro, y_without_macro)

            # Check class distribution
            print(f"Training set with macro class distribution: {dict(y_train_with_macro.value_counts())}")
            print(f"Test set with macro class distribution: {dict(y_test_with_macro.value_counts())}")
            print(f"Training set without macro class distribution: {dict(y_train_without_macro.value_counts())}")
            print(f"Test set without macro class distribution: {dict(y_test_without_macro.value_counts())}")

            # Bundle each variant's split so both model families can be
            # trained by one loop instead of four copy-pasted stanzas.
            datasets = {
                'with_macro': (
                    X_train_with_macro, X_test_with_macro,
                    y_train_with_macro, y_test_with_macro,
                    features_with_macro),
                'without_macro': (
                    X_train_without_macro, X_test_without_macro,
                    y_train_without_macro, y_test_without_macro,
                    features_without_macro),
            }
            model_results = {'with_macro': {}, 'without_macro': {}}
            trainers = (
                ('traditional', train_traditional_models),
                ('deep learning', train_deep_learning_models),
            )

            # Same order as before: traditional (with, without), then deep
            # learning (with, without).
            for family, train_models in trainers:
                for macro_status, (X_tr, X_te, y_tr, y_te, feats) in datasets.items():
                    qualifier = 'WITH' if macro_status == 'with_macro' else 'WITHOUT'
                    print(f"\nTraining {family} models {qualifier} macroeconomic factors:")
                    model_results[macro_status].update(
                        train_models(X_tr, X_te, y_tr, y_te, feats))

            # Portfolio simulation
            print("\nSimulating portfolio performance...")

            # Collect predictions from every model that produced them
            predictions = {}
            for macro_status, results in model_results.items():
                for model_name, model_result in results.items():
                    if 'y_pred' in model_result:
                        predictions[f"{model_name}_{macro_status}"] = model_result['y_pred']

            # Simulation period
            # NOTE(review): this assumes the feature matrix is row-aligned with
            # data_with_macro and that the split was chronological; if
            # prepare_features drops rows or the stratified fallback was used,
            # the start date may not match the test set - confirm.
            start_date = data_with_macro.iloc[len(X_train_with_macro)]['date']
            end_date = data_with_macro.iloc[-1]['date']

            # Run portfolio simulation
            portfolio_returns = simulate_portfolio(
                data_with_macro, predictions, start_date, end_date)

            # Evaluate portfolio performance
            performance_metrics = evaluate_portfolio_performance(portfolio_returns)

            # Store country results before any reporting, so a display error
            # below cannot discard a finished analysis.
            simulation_results[country] = {
                'model_results': model_results,
                'portfolio_returns': portfolio_returns,
                'performance_metrics': performance_metrics,
                'features_with_macro': features_with_macro,
                'features_without_macro': features_without_macro,
                'data_with_macro': data_with_macro,
                'data_without_macro': data_without_macro
            }

            # Display performance metrics
            print("\nPortfolio Performance Metrics:")
            print(performance_metrics)

            print("\nModel Performance Comparison:")
            print(_build_performance_table(model_results))

        except Exception as e:
            # Per-country boundary: report the failure and keep going.
            print(f"Error analyzing {country}: {str(e)}")

    # Cross-country analysis
    if simulation_results:
        print("\n" + "="*50)
        print("CROSS-COUNTRY ANALYSIS")
        print("="*50)

        # Regroup the per-country metrics by strategy
        performance_comparison = {}
        for country, results in simulation_results.items():
            perf_metrics = results['performance_metrics']
            for strategy in perf_metrics.index:
                performance_comparison.setdefault(strategy, {})[country] = {
                    'Sharpe Ratio': perf_metrics.loc[strategy, 'Sharpe Ratio'],
                    'Annual Return': perf_metrics.loc[strategy, 'Annual Return'],
                    'Max Drawdown': perf_metrics.loc[strategy, 'Max Drawdown']
                }

        # Display cross-country comparison
        for strategy, country_perf in performance_comparison.items():
            print(f"\n{strategy} Performance Across Countries:")
            print(pd.DataFrame(country_perf).T)

        # Analyze value added by macroeconomic factors
        print("\n" + "="*50)
        print("VALUE ADDED BY MACROECONOMIC FACTORS")
        print("="*50)

        macro_value_analysis = analyze_macro_value_added(simulation_results)
        print("\nValue Added by Macroeconomic Factors:")
        print(macro_value_analysis)

        # Analyze regime transitions
        transition_analysis = analyze_regime_transitions(simulation_results)
        print("\nRegime Transition Analysis:")
        for country, analysis in transition_analysis.items():
            print(f"\n{country.upper()}:")
            for metric, value in analysis.items():
                print(f"  {metric}: {value:.4f}" if isinstance(value, float) else f"  {metric}: {value}")

    return simulation_results


def _stratified_time_split(X, y, test_size=0.3):
    """Split chronologically, falling back to a stratified random split.

    The first ``1 - test_size`` share of rows becomes the training set. When
    there are fewer than 30 rows, or the chronological training slice holds
    fewer than three distinct labels, a stratified ``train_test_split`` with
    ``random_state=42`` is used instead.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    if len(X) < 30:
        print("WARNING: Not enough data for proper time series split")
        return train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)

    train_size = int(len(X) * (1 - test_size))
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # Three is the expected number of regime labels (presumably 0, 1 and 2 -
    # verify against define_market_regime).
    if len(np.unique(y_train)) < 3:
        print("WARNING: Not all classes in time-based training split. Using stratified split instead.")
        return train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)

    return X_train, X_test, y_train, y_test


def _build_performance_table(model_results):
    """Flatten the per-model results into one comparison DataFrame.

    Rows follow the order of ``model_results`` (with-macro models first).
    The 'Macro Factors' column tells apart the two variants of a model, and
    any metric missing from a model's result is reported as NaN.
    """
    columns = ['Model', 'Macro Factors', 'Accuracy', 'F1 Score',
               'Training Time (s)', 'Memory Usage (%)']
    rows = []
    for macro_status, results in model_results.items():
        for model_name, model_result in results.items():
            test_metrics = model_result.get('test_metrics') or {}
            rows.append({
                'Model': model_name,
                'Macro Factors': macro_status,
                'Accuracy': test_metrics.get('accuracy', np.nan),
                'F1 Score': test_metrics.get('f1_score', np.nan),
                'Training Time (s)': model_result.get('training_time', np.nan),
                'Memory Usage (%)': model_result.get('memory_usage', np.nan),
            })
    return pd.DataFrame(rows, columns=columns)

# Entry point: when executed as the main module, run the analysis with the
# default country list. `results` keeps the dict returned by
# run_market_regime_analysis for later inspection.
if __name__ == "__main__":
    results = run_market_regime_analysis()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

ANALYZING BRAZIL
Loaded brazil data: 148 stock rows, 52 macro rows
Available macro columns for brazil: ['imports', 'exports', 'gdp', 'interest_rate', 'cpi', 'exchange_rate']
Successfully processed brazil data: 148 rows, 27 columns
Successfully processed brazil data: 148 rows, 15 columns
Quarterly metrics not available, using monthly metrics
Regime distribution: {1: np.int64(96), 2: np.int64(45), 0: np.int64(7)}
Final regime distribution: {1: np.int64(96), 2: np.int64(45), 0: np.int64(7)}
Quarterly metrics not available, using monthly metrics
Regime distribution: {1: np.int64(96), 2: np.int64(45), 0: np.int64(7)}
Final regime distribution: {1: np.int64(96), 2: np.int64(45), 0: np.int64(7)}
Removing completely empty features: ['volatility_2q']
Removing completely empty features: ['volatility_2q']
Features with macro: ['return_1q', 'return_2q', 'ma_ratio', 'mom