# In [None]:
import pandas as pd
import numpy as np
from fredapi import Fred
import yfinance as yf
from pathlib import Path
import json
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objs as go
import plotly.io as pio
import logging

# Select Plotly's 'notebook' renderer as the default used by fig.show().
pio.renderers.default = 'notebook'

class Config:
    """Locate the JSON configuration file and the on-disk cache folder.

    Instantiating the class creates a ``cache`` directory relative to the
    current working directory if it does not already exist.
    """

    def __init__(self, config_path='configs.json'):
        """Remember where the config file lives and ensure the cache exists.

        Args:
            config_path: Path (str or ``Path``) of the JSON config file.
        """
        self.config_path = Path(config_path)
        self.cache_folder = Path('cache')
        self.cache_folder.mkdir(parents=True, exist_ok=True)

    def load_api_key(self):
        """Return the value stored under ``"api_key"`` in the config file.

        Raises:
            FileNotFoundError: If the config file does not exist.
            KeyError: If the parsed JSON object has no ``"api_key"`` entry.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        try:
            # Explicit UTF-8 so the result does not depend on the platform's
            # default text encoding.
            with open(self.config_path, 'r', encoding='utf-8') as f:
                settings = json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Config file not found: {self.config_path}"
            ) from None

        try:
            return settings['api_key']
        except KeyError:
            # Same exception type as before, but say which file is at fault.
            raise KeyError(
                f"'api_key' entry missing from {self.config_path}"
            ) from None


class FredData:
    """Download FRED series and cache each as ``<cache_folder>/<series>.csv``."""

    def __init__(self, features=None, cache_folder='cache'):
        """Store the series codes to fetch and make sure the cache exists.

        Args:
            features: ``None``, a single series code, or a list/tuple of codes.
            cache_folder: Directory the CSV files are written to.

        Raises:
            TypeError: If ``features`` is of any other type.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.folder = Path(cache_folder)
        self.folder.mkdir(parents=True, exist_ok=True)

        if features is None:
            self.features = []
        elif isinstance(features, str):
            self.features = [features]
        elif isinstance(features, (list, tuple)):
            # Copy so later changes to the caller's sequence do not leak in.
            self.features = list(features)
        else:
            raise TypeError("features must be None, str, list, or tuple")

        # Set by connect(); get_data() refuses to run while this is None.
        self.fred = None

    def connect(self, api_key):
        """Create the ``Fred`` client from ``api_key``."""
        self.fred = Fred(api_key)

    def get_data(self):
        """Fetch every configured series that is not cached yet.

        Returns:
            ``True`` when every series is cached or was downloaded, ``False``
            when no features are configured or at least one download failed.

        Raises:
            RuntimeError: If ``connect()`` has not been called.
        """
        if self.fred is None:
            raise RuntimeError("Fred API not connected. Call connect() first.")

        if not self.features:
            self.logger.warning("No features specified")
            return False

        success = True
        for feature in self.features:
            path = self.folder / f'{feature}.csv'
            if path.exists():
                self.logger.info('Data for %s found in cache.', feature)
                continue

            # Write to a sibling temp file and rename it afterwards, so an
            # interrupted write never leaves a truncated file that a later
            # run would treat as a valid cache hit.
            partial = path.with_name(path.name + '.tmp')
            try:
                self.logger.info('Fetching data for %s...', feature)
                series = self.fred.get_series(feature)
                series.name = feature
                series.to_csv(partial)
                partial.replace(path)
                self.logger.info('Data for %s successfully downloaded.', feature)
            except Exception as e:
                # Best effort: one failing series must not stop the others.
                self.logger.warning('Could not download %s: %s', feature, e)
                success = False
                if partial.exists():
                    partial.unlink()

        return success


class TickerData:
    """Download price history for one ticker via yfinance and cache it as CSV."""

    # Column order written to the cache file.
    _OHLCV_COLUMNS = ('Open', 'High', 'Low', 'Close', 'Volume')

    def __init__(self, ticker, cache_folder='cache', interval='1d'):
        """Validate the ticker and remember where and how to fetch it.

        Args:
            ticker: Ticker symbol; must be a ``str``.
            cache_folder: Directory the CSV file is written to.
            interval: Bar interval passed to ``yf.download``; it is also part
                of the cache file name.

        Raises:
            ValueError: If ``ticker`` is not a string.
        """
        if not isinstance(ticker, str):
            raise ValueError("ticker must be a string")

        self.logger = logging.getLogger(self.__class__.__name__)
        self.folder = Path(cache_folder)
        self.ticker = ticker
        self.interval = interval

    @staticmethod
    def _normalize_columns(df):
        """Return ``df`` reduced to flat Open/High/Low/Close/Volume columns.

        Columns are selected by NAME. Assigning a fixed list of names by
        position silently mislabels prices whenever the downloaded frame uses
        a different column order or a (field, ticker) MultiIndex.

        Raises:
            ValueError: If any of the five columns cannot be found.
        """
        wanted = list(TickerData._OHLCV_COLUMNS)
        columns = df.columns
        if columns.nlevels > 1:
            # Use whichever level carries the price-field names.
            for level in range(columns.nlevels):
                labels = columns.get_level_values(level)
                if set(wanted).issubset(labels):
                    df = df.copy()
                    df.columns = labels
                    break

        missing = [name for name in wanted if name not in df.columns]
        if missing:
            raise ValueError(f"Downloaded data is missing columns: {missing}")
        return df.loc[:, wanted]

    def get_data(self):
        """Ensure the ticker's history is cached on disk.

        Returns:
            ``True`` if the CSV already exists or was downloaded and written,
            ``False`` if the download or the write failed (the error is logged).
        """
        path = self.folder / f'{self.ticker}_ohlc_{self.interval}.csv'

        if path.exists():
            self.logger.info(f'Data for {self.ticker} found in cache.')
            return True

        try:
            self.logger.info(f'Fetching data for {self.ticker}...')
            df = yf.download(
                tickers=self.ticker,
                interval=self.interval,
                period='max',
                progress=False,
                auto_adjust=True
            )

            if df is None or df.empty:
                raise ValueError(f"No data returned for {self.ticker}")

            df = self._normalize_columns(df)
            # The cache folder may not exist when this class is used alone.
            self.folder.mkdir(parents=True, exist_ok=True)
            df.to_csv(path)
            self.logger.info(f'Data for {self.ticker} successfully downloaded.')
            return True

        except Exception as e:
            # Best effort: report the failure through the return value.
            self.logger.error(f'Could not download {self.ticker}: {e}')
            return False


class DataAnalyzer:
    """Correlate a cached ticker's close price with one cached feature series."""

    def __init__(self, ticker, feature, cache_folder='cache', interval='1d'):
        """Remember which cached files to read.

        Args:
            ticker: Ticker symbol whose ``<ticker>_ohlc_<interval>.csv`` is read.
            feature: Feature code whose ``<feature>.csv`` is read.
            cache_folder: Directory containing both CSV files.
            interval: Interval suffix of the ticker file name. It defaults to
                ``'1d'``, the value that used to be hard-coded.

        Raises:
            TypeError: If ``ticker`` or ``feature`` is not a string.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.folder = Path(cache_folder)

        if not isinstance(ticker, str) or not isinstance(feature, str):
            raise TypeError("ticker and feature must be strings")

        self.ticker = ticker
        self.feature = feature
        self.interval = interval

    def _load_csv(self, path):
        """Read ``path`` with a parsed date index, dropping rows with NaN.

        Returns an empty ``DataFrame`` (and logs) when the file is missing or
        cannot be read, so callers only need to check ``.empty``.
        """
        if not path.exists():
            self.logger.warning(f'{path.name} does not exist.')
            return pd.DataFrame()

        try:
            df = pd.read_csv(path, index_col=0, parse_dates=True).dropna()
            if df.empty:
                self.logger.warning(f'{path.name} is empty.')
            return df
        except Exception as e:
            self.logger.error(f'Error loading {path.name}: {e}')
            return pd.DataFrame()

    def _get_ticker(self):
        """Load the cached ticker frame for the configured interval."""
        path = self.folder / f'{self.ticker}_ohlc_{self.interval}.csv'
        return self._load_csv(path)

    def _get_feature(self):
        """Load the cached feature frame."""
        path = self.folder / f'{self.feature}.csv'
        return self._load_csv(path)

    def analyze(self):
        """Standardize both series on their shared dates and correlate them.

        Returns:
            ``(normalized, corr)``. ``normalized`` is a ``DataFrame`` with one
            column named after the ticker and one named after the feature,
            each passed through ``StandardScaler``; ``corr`` is the result of
            ``Series.corr`` between them. Returns ``(None, None)`` when either
            file is unusable or the two share no dates.
        """
        tdf = self._get_ticker()
        fdf = self._get_feature()

        if tdf.empty or fdf.empty:
            self.logger.warning(f'Cannot analyze {self.ticker} vs {self.feature}: insufficient data')
            return None, None

        # A cache file without the expected column is unusable; report it the
        # same way as missing data instead of raising KeyError.
        if 'Close' not in tdf.columns or self.feature not in fdf.columns:
            self.logger.warning(
                f'Cannot analyze {self.ticker} vs {self.feature}: expected column missing'
            )
            return None, None

        close = tdf['Close'].to_frame(name=self.ticker)

        # Inner join keeps only the dates present in both series; any extra
        # columns in the feature file are ignored.
        df = pd.concat([close, fdf[[self.feature]]], join='inner', axis=1)

        if df.empty:
            self.logger.warning(f'No overlapping dates for {self.ticker} and {self.feature}')
            return None, None

        scaler = StandardScaler()
        normalized = pd.DataFrame(
            scaler.fit_transform(df),
            columns=df.columns,
            index=df.index
        )

        corr = normalized[self.ticker].corr(normalized[self.feature])

        return normalized, corr


class EconomicPredictor:
    """Ridge regression on polynomial features of lagged economic series."""

    def __init__(self, lag=7, poly_degree=2, alpha=0.1):
        """Build the estimator objects.

        Args:
            lag: Number of ROWS the target is shifted ahead of the features.
            poly_degree: Degree passed to ``PolynomialFeatures``.
            alpha: Regularization strength passed to ``Ridge``.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.lag = lag
        self.poly_degree = poly_degree
        self.alpha = alpha
        self.model = Ridge(alpha=self.alpha)
        # Created here but not used by the methods of this class.
        self.scaler = StandardScaler()
        # No bias column: Ridge fits its own intercept, and without the
        # leading constant the first len(feature_names) expanded columns (and
        # therefore the first coefficients) are the original features in order.
        self.poly = PolynomialFeatures(degree=self.poly_degree, include_bias=False)
        self.feature_names = []

    def prepare_data(self, feature_dfs, ticker_name, corr_threshold=0.3):
        """Assemble the design matrix and lagged target.

        Args:
            feature_dfs: Iterable of ``(frame, corr)`` pairs; ``frame`` may be
                ``None``. Each frame is expected to hold the ticker column and
                one feature column.
            ticker_name: Name of the target column.
            corr_threshold: Minimum ``abs(corr)`` for a frame to be used.

        Returns:
            ``(X_poly, y_values, X_aligned, y_future)``: the polynomial feature
            matrix, the target as an array, and the pre-expansion features and
            target as pandas objects sharing one index.

        Raises:
            ValueError: If no frame passes the filter, the ticker column is
                absent, or no rows remain after applying the lag.
        """
        # NaN compares False against everything, so an undefined correlation
        # would slip past a plain "abs(corr) < threshold" test; reject it.
        valid_features = [
            df for df, corr in feature_dfs
            if df is not None
            and corr is not None
            and not pd.isna(corr)
            and abs(corr) >= corr_threshold
        ]

        if not valid_features:
            raise ValueError("No features meet correlation threshold")

        combined = pd.concat(valid_features, axis=1, join='inner')

        # Every frame carries its own ticker column; keep the first of each name.
        combined = combined.loc[:, ~combined.columns.duplicated()]

        if ticker_name not in combined.columns:
            raise ValueError(f"Ticker {ticker_name} not found in combined data")

        y = combined[ticker_name].copy()
        X = combined.drop(columns=[ticker_name])

        self.feature_names = X.columns.tolist()

        # Row t of X is paired with the target value `lag` rows later; the
        # trailing rows that have no future target are dropped.
        y_future = y.shift(-self.lag).dropna()
        if y_future.empty:
            raise ValueError(
                f"No samples remain after applying lag={self.lag} to {len(y)} rows"
            )
        X_aligned = X.loc[y_future.index]

        self.logger.info(f"Prepared data shape: X={X_aligned.shape}, y={y_future.shape}")

        X_poly = self.poly.fit_transform(X_aligned)

        return X_poly, y_future.values, X_aligned, y_future

    def train(self, X, y, test_size=0.35):
        """Fit on the leading rows and evaluate on the trailing ``test_size``.

        The split is chronological (no shuffling): the first
        ``int(len(X) * (1 - test_size))`` rows train the model.

        Returns:
            Dict with keys ``'y_test'``, ``'y_pred'``, ``'mse'``, ``'r2'`` and
            ``'split_idx'``.

        Raises:
            ValueError: If ``test_size`` is outside ``(0, 1)``, ``X`` and ``y``
                differ in length, or either side of the split would be empty.
        """
        if not 0 < test_size < 1:
            raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

        # Accept lists, Series or column vectors, not only 1-D ndarrays.
        y = np.asarray(y).ravel()
        if len(X) != len(y):
            raise ValueError(f"X and y have different lengths: {len(X)} vs {len(y)}")

        split_idx = int(len(X) * (1 - test_size))
        if split_idx == 0 or split_idx == len(X):
            raise ValueError(
                f"Cannot split {len(X)} samples with test_size={test_size}"
            )

        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        self.logger.info(f"Training on {len(X_train)} samples, testing on {len(X_test)}")
        self.logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        self.logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        self.model.fit(X_train, y_train)

        y_pred = np.asarray(self.model.predict(X_test)).ravel()

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        self.logger.info(f"MSE: {mse:.4f}, R\u00b2: {r2:.4f}")

        return {
            'y_test': y_test,
            'y_pred': y_pred,
            'mse': mse,
            'r2': r2,
            'split_idx': split_idx
        }


def plot_feature_correlations(analyzed_data, features_dict, symbol):
    """Scatter each feature against the ticker on one shared figure.

    ``analyzed_data`` and ``features_dict`` are walked in lockstep, so the
    i-th ``(frame, corr)`` pair is labelled with the i-th ``name -> code``
    entry. Pairs whose frame is ``None`` or empty add no trace.

    Returns:
        The populated ``go.Figure``.
    """
    figure = go.Figure()
    figure.update_layout(template='presentation')

    paired = zip(analyzed_data, features_dict.items())
    for (frame, r_value), (label, series_code) in paired:
        # Nothing to draw for this feature.
        if frame is None or frame.empty:
            continue

        points = go.Scatter(
            x=frame[symbol],
            y=frame[series_code],
            mode='markers',
            name=f'{label} (r={r_value:.2f})',
            marker=dict(size=5),
            hovertemplate='<b>%{x:.2f}<br>%{y:.2f}</b>',
            opacity=0.8
        )
        figure.add_trace(points)

    heading = dict(
        text='Feature & Ticker Relationship',
        font=dict(size=17, style='italic')
    )
    figure.update_layout(
        xaxis_title=f'{symbol} (Normalized)',
        yaxis_title='Feature (Normalized)',
        font=dict(size=11),
        title=heading,
        hovermode='closest'
    )

    return figure


def plot_model_evaluation(results, dates=None):
    """Plot actual versus predicted target values for the test window.

    Args:
        results: Mapping with ``'y_test'``, ``'y_pred'``, ``'r2'`` and
            ``'mse'`` entries.
        dates: Optional x-axis values, one per test sample. When ``None`` a
            0-based integer index is used instead.

    Returns:
        The populated ``go.Figure``.
    """
    fig = go.Figure()

    x_axis = dates if dates is not None else np.arange(len(results['y_test']))

    # Actual values
    fig.add_trace(
        go.Scatter(
            x=x_axis,
            y=results['y_test'],
            mode='lines',
            name='Actual',
            line=dict(color='blue', width=2),
            hovertemplate='<b>Actual: %{y:.3f}</b><extra></extra>'
        )
    )

    # Predicted values
    fig.add_trace(
        go.Scatter(
            x=x_axis,
            y=results['y_pred'],
            mode='lines',
            name='Predicted',
            line=dict(color='red', width=2, dash='dash'),
            hovertemplate='<b>Predicted: %{y:.3f}</b><extra></extra>'
        )
    )

    # Metrics box in the top-left corner (paper coordinates). The superscript
    # two is written as an escape so it survives re-encoding of this file.
    fig.add_annotation(
        xref='paper', yref='paper',
        x=0.02, y=0.98,
        text=f"R\u00b2 = {results['r2']:.3f}<br>MSE = {results['mse']:.4f}",
        showarrow=False,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='black',
        borderwidth=1,
        font=dict(size=12)
    )

    fig.update_layout(
        template='presentation',
        title=dict(
            text='Machine Learning Model Evaluation',
            font=dict(style='italic', size=17)
        ),
        font=dict(size=11),
        xaxis_title='Time Index' if dates is None else 'Date',
        yaxis_title='Ticker (Normalized)',
        hovermode='x unified'
    )

    return fig


def plot_residuals(results):
    """Scatter the residuals (actual minus predicted) against the predictions.

    Args:
        results: Mapping with ``'y_test'`` and ``'y_pred'`` entries that
            support element-wise subtraction.

    Returns:
        The populated ``go.Figure``, including a dashed reference line at zero.
    """
    predicted = results['y_pred']
    errors = results['y_test'] - predicted

    marker_style = dict(size=6, color='purple', opacity=0.6)
    cloud = go.Scatter(
        x=predicted,
        y=errors,
        mode='markers',
        name='Residuals',
        marker=marker_style,
        hovertemplate='<b>Predicted: %{x:.3f}<br>Residual: %{y:.3f}</b><extra></extra>'
    )

    figure = go.Figure()
    figure.add_trace(cloud)

    # Reference line marking a perfect prediction.
    figure.add_hline(
        y=0,
        line_dash="dash",
        line_color="red",
        annotation_text="Zero Error"
    )

    heading = dict(
        text='Residual Plot',
        font=dict(style='italic', size=17)
    )
    figure.update_layout(
        template='presentation',
        title=heading,
        xaxis_title='Predicted Values',
        yaxis_title='Residuals',
        font=dict(size=11),
        showlegend=False
    )

    return figure


def plot_feature_importance(model, feature_names):
    """Bar-plot the magnitude of the leading model coefficients.

    Only the first ``len(feature_names)`` coefficients are shown (first row
    when ``coef_`` is 2-D). Bar height is ``|coefficient|``; bar colour and
    the hover text carry the signed value.

    NOTE(review): this assumes the first ``len(feature_names)`` columns of the
    matrix the model was fitted on are the original features in the same
    order, i.e. that no bias column precedes them. Confirm against how the
    design matrix is built.

    Args:
        model: Fitted estimator; anything without a ``coef_`` attribute is
            rejected.
        feature_names: Names of the original (pre-expansion) features.

    Returns:
        A ``go.Figure``, or ``None`` if ``model`` has no ``coef_``.
    """
    if not hasattr(model, 'coef_'):
        return None

    coef_rows = np.atleast_2d(model.coef_)
    # Never index past the coefficients actually present, and keep the names
    # and values the same length so the bars line up.
    n_features = min(len(feature_names), coef_rows.shape[1])
    names = list(feature_names)[:n_features]
    coeffs = coef_rows[0, :n_features]

    fig = go.Figure()

    fig.add_trace(
        go.Bar(
            x=names,
            y=np.abs(coeffs),
            # Signed values, so the hover shows the real coefficient rather
            # than the absolute bar height.
            customdata=coeffs,
            marker=dict(color=coeffs, colorscale='RdBu', showscale=True),
            hovertemplate='<b>%{x}<br>Coefficient: %{customdata:.3f}</b><extra></extra>'
        )
    )

    fig.update_layout(
        template='presentation',
        title=dict(
            text='Feature Importance (Absolute Coefficients)',
            font=dict(style='italic', size=17)
        ),
        xaxis_title='Features',
        yaxis_title='|Coefficient|',
        font=dict(size=11),
        showlegend=False
    )

    return fig


def main():
    """Run the pipeline: download, correlate, train, plot, and summarize.

    Raises:
        RuntimeError: If the ticker history is neither cached nor downloadable.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger('main')

    # Configuration: display label -> FRED series code.
    symbol = 'SPY'
    features = {
        'Consumer Price Index': 'CPIAUCSL',
        'Producer Price Index': 'PPIACO',
        'Unemployment Rate': 'UNRATE',
        # PAYEMS is total nonfarm payroll employment, not job openings.
        'Nonfarm Payrolls': 'PAYEMS',
        'Interest Rate': 'FEDFUNDS',
        'Consumer Sentiment': 'UMCSENT',
        'PCE Price Index': 'PCEPI',
        # PCEPILFE is the PCE price index excluding food and energy.
        'Core PCE Price Index': 'PCEPILFE'
    }

    # Load config and get data
    config = Config()
    api_key = config.load_api_key()

    # Fetch economic data; a partial failure only removes some features.
    fred_data = FredData(list(features.values()))
    fred_data.connect(api_key)
    if not fred_data.get_data():
        logger.warning('Some FRED series are unavailable; continuing without them')

    # Fetch ticker data; nothing downstream works without it.
    ticker_data = TickerData(symbol)
    if not ticker_data.get_data():
        raise RuntimeError(f'Price history for {symbol} is unavailable')

    # Analyze correlations
    print("\n" + "="*60)
    print("CORRELATION ANALYSIS")
    print("="*60)
    analyzed_data = []
    for name, code in features.items():
        analyzer = DataAnalyzer(symbol, code)
        normalized_df, corr = analyzer.analyze()
        if normalized_df is not None:
            print(f"{name:30s} ({code:10s}): r = {corr:6.3f}")
            analyzed_data.append((normalized_df, corr))
        else:
            # Placeholder keeps analyzed_data aligned with `features`.
            analyzed_data.append((None, 0))

    # Plot 1: Feature Correlations
    print("\n\U0001F4CA Plotting feature correlations...")
    fig1 = plot_feature_correlations(analyzed_data, features, symbol)
    fig1.show()

    # Train model
    print("\n" + "="*60)
    print("MODEL TRAINING")
    print("="*60)
    predictor = EconomicPredictor(lag=7, poly_degree=2, alpha=0.1)
    X, y, X_original, y_original = predictor.prepare_data(analyzed_data, symbol, corr_threshold=0.3)
    results = predictor.train(X, y, test_size=0.22)

    # Dates of the test window, for the x-axis of the evaluation plot.
    test_dates = y_original.index[results['split_idx']:]

    # Plot 2: Model Evaluation
    print("\n\U0001F4CA Plotting model evaluation...")
    fig2 = plot_model_evaluation(results, dates=test_dates)
    fig2.show()

    # Plot 3: Residuals
    print("\n\U0001F4CA Plotting residuals...")
    fig3 = plot_residuals(results)
    fig3.show()

    # Plot 4: Feature Importance
    if predictor.feature_names:
        print("\n\U0001F4CA Plotting feature importance...")
        fig4 = plot_feature_importance(predictor.model, predictor.feature_names)
        if fig4:
            fig4.show()

    # Summary
    print("\n" + "="*60)
    print("MODEL PERFORMANCE SUMMARY")
    print("="*60)
    print(f"R\u00b2 Score:     {results['r2']:.4f}")
    print(f"MSE:          {results['mse']:.4f}")
    print(f"RMSE:         {np.sqrt(results['mse']):.4f}")
    print(f"Train Size:   {results['split_idx']}")
    print(f"Test Size:    {len(results['y_test'])}")
    print("="*60)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
