# INTERSECTION

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
import warnings
warnings.filterwarnings('ignore')


def est_ndwcov_factor(Y, factors, ic, lambda_min=True):
    """
    Estimate a precision matrix by nodewise LASSO regression on factor-model residuals.

    The columns of ``Y`` are first regressed (with intercept) on ``factors``.
    Each residual series is then LASSO-regressed on all the other residual
    series to build a residual precision matrix, which is finally combined
    with the factor loadings and the factor covariance through a
    Woodbury-type update.

    Parameters:
    -----------
    Y : numpy.ndarray
        n x p matrix of observations
    factors : numpy.ndarray
        n x k matrix of factors
    ic : str
        Penalty selection rule: 'WIC', 'BIC', 'GIC', 'AIC', or 'cv'
    lambda_min : bool
        Only used when ic='cv'. If True, use the CV-error-minimising penalty;
        otherwise use the largest penalty whose mean CV error is within one
        standard error of the minimum.

    Returns:
    --------
    TAU : numpy.ndarray
        p x p precision matrix estimate

    Raises:
    -------
    ValueError
        If ``ic`` is not one of the supported values.
    """
    # Fail fast on a bad criterion instead of discovering it inside the alpha loop.
    if ic not in ('WIC', 'BIC', 'GIC', 'AIC', 'cv'):
        raise ValueError(f"Unknown IC: {ic}")

    n, p = Y.shape
    ns1 = np.ones((n, 1))

    # Factor model Y = intercept + factors @ beta' + u. The intercept is supplied
    # as an explicit column, so the estimator itself is fitted without one.
    factors_with_intercept = np.column_stack([np.ones(n), factors])
    factormodel = LinearRegression(fit_intercept=False)
    factormodel.fit(factors_with_intercept, Y)

    u = Y - factormodel.predict(factors_with_intercept)
    beta = factormodel.coef_[:, 1:]  # p x k loadings (intercept column dropped)

    def _criterion(sig, df):
        """Information-criterion value for a fit with residual variance sig and df non-zeros."""
        if ic == 'WIC':
            return np.log(sig) + df * np.log(n) / n * np.log(np.log(p))
        if ic == 'BIC':
            return np.log(sig) + df * np.log(n) / n
        if ic == 'GIC':
            return np.log(sig) + df * np.log(p) * np.log(np.log(n)) / n
        # AIC: the penalty must be divided by n to live on the same per-observation
        # scale as log(sig); an unscaled 2*df swamps the fit term.
        return np.log(sig) + 2 * df / n

    C = np.zeros((p, p))
    tau = []
    alphas = np.logspace(-4, 1, 100)  # fixed penalty grid for the IC-based search

    # Nodewise step: regress residual j on all other residuals.
    for j in range(p):
        X_j = np.delete(u, j, axis=1)
        y_j = u[:, j]

        if ic != 'cv':
            ic_values = []
            coef_path = []
            res_path = []

            for alpha in alphas:
                model = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
                model.fit(X_j, y_j)

                res = y_j - model.predict(X_j)
                df = np.sum(np.abs(model.coef_) > 1e-8)
                sig = np.sum(res**2) / n

                ic_values.append(_criterion(sig, df))
                coef_path.append(model.coef_.copy())
                res_path.append(res)

            jind = np.argmin(ic_values)
            jpar = coef_path[jind]
            jres = res_path[jind]

        else:  # Cross-validation
            lasso_cv = LassoCV(cv=5, fit_intercept=False, max_iter=10000, n_alphas=100)
            lasso_cv.fit(X_j, y_j)

            if lambda_min:
                jfit = lasso_cv.predict(X_j)
                jpar = lasso_cv.coef_
            else:
                # One-standard-error rule. mse_path_ is (n_alphas, n_folds) and the
                # spread must be the standard error of the fold mean, not the raw
                # standard deviation across folds.
                fold_mse = lasso_cv.mse_path_
                cv_scores = fold_mse.mean(axis=1)
                cv_se = fold_mse.std(axis=1, ddof=1) / np.sqrt(fold_mse.shape[1])
                min_idx = np.argmin(cv_scores)
                threshold = cv_scores[min_idx] + cv_se[min_idx]
                valid_indices = np.where(cv_scores <= threshold)[0]
                # alphas_ is ordered from largest to smallest, so the first
                # admissible index is the most regularised admissible model.
                se_idx = valid_indices[0] if len(valid_indices) > 0 else min_idx
                selected_alpha = lasso_cv.alphas_[se_idx]
                model_1se = Lasso(alpha=selected_alpha, fit_intercept=False, max_iter=10000)
                model_1se.fit(X_j, y_j)
                jfit = model_1se.predict(X_j)
                jpar = model_1se.coef_

            jres = y_j - jfit

        jtau = np.sum(y_j * jres) / n

        # Row j of the residual precision matrix: -gamma_j / tau_j off the diagonal;
        # the diagonal slot is filled in after the loop.
        C[j, :] = np.insert(-jpar / jtau, j, 0)
        tau.append(jtau)

    np.fill_diagonal(C, 1 / np.array(tau))
    omega = C.copy()
    omegasym = (C + C.T) / 2

    # Factor covariance with 1/n normalisation: F'F/n - (F'1)(1'F)/n^2.
    covft = (1/n) * (factors.T @ factors) - (1/(n**2)) * (factors.T @ ns1 @ ns1.T @ factors)
    covft = covft.astype(np.float64)
    beta = beta.astype(np.float64)
    omegasym = omegasym.astype(np.float64)

    # Woodbury-type correction folding the factor structure back into the precision matrix.
    covft_inv = np.linalg.inv(covft)
    p1 = np.linalg.inv(covft_inv + beta.T @ omegasym @ beta)
    TAU = omega - omega @ beta @ p1 @ beta.T @ omega

    return TAU


def gmv_weights(Theta_hat):
    """Return Global Minimum Variance weights, Theta 1 / (1' Theta 1).

    Falls back to equal weights when the normalising constant 1' Theta 1 is
    numerically zero.
    """
    n_assets = Theta_hat.shape[0]
    iota = np.ones(n_assets)

    scale = iota @ Theta_hat @ iota
    if np.abs(scale) < 1e-10:
        # Cannot normalise; hold every asset equally.
        return iota / n_assets

    return (Theta_hat @ iota) / scale


def mv_weights(Theta_hat, mu, target_return=0.01):
    """Return Mean-Variance weights that hit `target_return` with weights summing to one.

    With A = 1'Theta 1, B = 1'Theta mu, C = mu'Theta mu and D = AC - B^2 the
    weights are ((C - B r) Theta 1 + (A r - B) Theta mu) / D. When D is
    numerically zero the GMV solution is returned if A allows it, otherwise
    equal weights.
    """
    n_assets = Theta_hat.shape[0]
    iota = np.ones(n_assets)

    a_term = iota @ Theta_hat @ iota
    b_term = iota @ Theta_hat @ mu
    c_term = mu @ Theta_hat @ mu
    discriminant = a_term * c_term - b_term * b_term

    if np.abs(discriminant) < 1e-10:
        # Degenerate frontier: the return constraint cannot be imposed.
        if np.abs(a_term) > 1e-10:
            return (Theta_hat @ iota) / a_term
        return iota / n_assets

    coef_iota = (c_term - b_term * target_return) / discriminant
    coef_mu = (a_term * target_return - b_term) / discriminant
    return coef_iota * (Theta_hat @ iota) + coef_mu * (Theta_hat @ mu)


def msr_weights(Theta_hat, mu):
    """Return Maximum Sharpe Ratio weights, Theta mu rescaled to sum to one.

    Falls back to equal weights when the entries of Theta mu sum to
    (numerically) zero.
    """
    n_assets = Theta_hat.shape[0]
    raw = Theta_hat @ mu
    total = np.sum(raw)

    if np.abs(total) < 1e-10:
        # Cannot rescale to unit sum; hold every asset equally.
        return np.ones(n_assets) / n_assets

    return raw / total


def load_yearly_signals(year, buys_path_template='buys_{}.csv', sells_path_template='sells_{}.csv'):
    """Return the set of permnos listed in the buy or sell file of `year`.

    Both templates are formatted with `year`; the second CSV column is read as
    the permno index. If either file is missing a warning is printed and an
    empty set is returned.
    """
    try:
        frames = []
        for template in (buys_path_template, sells_path_template):
            frame = pd.read_csv(template.format(year), index_col=1)
            frame.index.name = 'permno'
            frames.append(frame)
    except FileNotFoundError as e:
        print(f"  ⚠ Warning: Could not load signals for year {year}: {e}")
        return set()

    buy_ids, sell_ids = (frame.index.astype(int) for frame in frames)
    return set(buy_ids.union(sell_ids))


def load_ff_factors(factors_path='factors_ff_monthly_raw.csv'):
    """Read the Fama-French factor CSV and return Mkt-RF, SMB and HML divided by 100.

    The first column is parsed as YYYYMM and moved to the month-end date, which
    becomes the index (named 'date') of the returned frame.
    """
    raw = pd.read_csv(factors_path)
    month_stamp = pd.to_datetime(raw.iloc[:, 0].astype(str), format='%Y%m')
    raw['date'] = month_stamp + pd.offsets.MonthEnd(0)

    three_factors = raw.set_index('date')[['Mkt-RF', 'SMB', 'HML']]
    return three_factors / 100


def load_finbert_signals(signals_path):
    """Read the FinBERT monthly signal CSV and add a month-end 'date' column.

    'date' is derived from the 'year_month' column. If the file is missing a
    warning is printed and an empty frame with the expected columns is
    returned.
    """
    try:
        signals_df = pd.read_csv(signals_path)
    except FileNotFoundError as e:
        print(f"  ⚠ Warning: Could not load FinBERT signals: {e}")
        return pd.DataFrame(columns=['symbol', 'company', 'year_month', 'signal', 'date'])

    parsed_month = pd.to_datetime(signals_df['year_month'])
    signals_df['date'] = parsed_month + pd.offsets.MonthEnd(0)
    return signals_df


def get_finbert_permnos_for_date(signals_df, ticker_to_permno, date):
    """Return the permnos whose ticker carries a 'buy' or 'sell' signal on `date`.

    Tickers absent from `ticker_to_permno` are skipped.
    """
    on_date = signals_df[signals_df['date'] == date]
    actionable = on_date[on_date['signal'].isin(['buy', 'sell'])]

    return {
        ticker_to_permno[symbol]
        for symbol in actionable['symbol'].values
        if symbol in ticker_to_permno
    }


def create_ticker_to_permno_mapping(df):
    """Build a {ticker: permno} dict from the rows of `df` that have a ticker.

    When a ticker appears on several rows, the last permno in row order is
    kept. Raises ValueError if `df` has no 'ticker' column.
    """
    if 'ticker' not in df.columns:
        raise ValueError("DataFrame must have 'ticker' column for mapping")

    with_ticker = df[df['ticker'].notna()].copy()
    last_permno = with_ticker.groupby('ticker')['permno'].last()
    return last_permno.to_dict()


def calculate_exit_transaction_cost(prev_weights_dict, prev_oos_returns_dict, 
                                    prev_gross_return, transaction_cost, verbose=False):
    """Cost of liquidating the previous holdings when the strategy leaves the market.

    Previous weights are first drifted by each asset's realised return
    relative to the portfolio's gross return; turnover is the sum of the
    absolute drifted weights. Returns (turnover, cost, net_return) with
    net_return = -cost, or all zeros if nothing was held.
    """
    if len(prev_weights_dict) == 0:
        return 0.0, 0.0, 0.0

    portfolio_growth = 1 + prev_gross_return
    # A growth factor of (almost) zero means the portfolio was wiped out;
    # nothing is left to sell.
    wiped_out = not (abs(portfolio_growth) > 1e-6)

    drifted = {}
    for asset, weight in prev_weights_dict.items():
        if wiped_out:
            drifted[asset] = 0.0
        elif asset in prev_oos_returns_dict:
            asset_return = prev_oos_returns_dict[asset]
            drifted[asset] = weight * (1 + asset_return) / portfolio_growth
        else:
            # No realised return on record: only the portfolio-level rescaling applies.
            drifted[asset] = weight / portfolio_growth

    turnover = sum(abs(w) for w in drifted.values())
    tc = transaction_cost * 1.0 * turnover

    if verbose:
        print(f"  Liquidating positions | Turnover: {turnover:>6.4f} | TC: {tc:>8.6f}")

    return turnover, tc, -tc


_PORTFOLIO_TYPES = ('GMV', 'MV', 'MSR')


def _new_portfolio_state():
    """Return an empty per-strategy record: result series plus carry-over holdings."""
    return {
        'returns': [], 'dates': [], 'weights': [], 'turnover': [], 'gross_returns': [],
        'prev_weights': {}, 'prev_oos_returns': {}, 'prev_gross_return': 0.0,
    }


def _record_exit(state, current_date, transaction_cost, verbose):
    """Liquidate whatever `state` holds, record a flat period and clear the carry-over fields."""
    turnover, _tc, net_return = calculate_exit_transaction_cost(
        state['prev_weights'], state['prev_oos_returns'], state['prev_gross_return'],
        transaction_cost, verbose=verbose
    )

    state['returns'].append(net_return)
    state['dates'].append(current_date)
    state['weights'].append({})
    state['turnover'].append(turnover)
    state['gross_returns'].append(0.0)
    state['prev_weights'] = {}
    state['prev_oos_returns'] = {}
    state['prev_gross_return'] = 0.0


def _drifted_weights(prev_weights, prev_oos_returns, prev_gross_return):
    """Return last period's weights after drifting with realised asset and portfolio returns."""
    growth = 1 + prev_gross_return
    drifted = {}
    for asset, prev_w in prev_weights.items():
        if not (abs(growth) > 1e-6):
            # Portfolio value (almost) wiped out: nothing left to carry over.
            drifted[asset] = 0.0
        elif asset in prev_oos_returns:
            drifted[asset] = prev_w * (1 + prev_oos_returns[asset]) / growth
        else:
            drifted[asset] = prev_w / growth
    return drifted


def _rebalance_portfolio(state, w_star, current_assets, oos_returns_dict,
                         current_date, transaction_cost, verbose):
    """Move one strategy into its target weights and record the period's net return.

    Falls back to a liquidation record whenever the target cannot be turned
    into a valid, fully-invested position with observable returns.
    """
    target = dict(zip(current_assets, w_star))

    # Normalise the raw weights to sum to one.
    weight_sum = sum(target.values())
    if weight_sum <= 1e-10:
        _record_exit(state, current_date, transaction_cost, verbose)
        return
    target = {asset: w / weight_sum for asset, w in target.items()}

    # Only assets with an observed out-of-sample return can actually be held.
    common_assets = set(target.keys()) & set(oos_returns_dict.keys())
    if len(common_assets) == 0:
        _record_exit(state, current_date, transaction_cost, verbose)
        return

    common_weights = {a: target[a] for a in common_assets}
    common_weight_sum = sum(common_weights.values())
    if common_weight_sum <= 1e-10:
        _record_exit(state, current_date, transaction_cost, verbose)
        return
    common_weights = {a: w / common_weight_sum for a, w in common_weights.items()}

    gross_return = sum(common_weights[a] * oos_returns_dict[a] for a in common_assets)
    if np.isnan(gross_return) or np.isinf(gross_return):
        _record_exit(state, current_date, transaction_cost, verbose)
        return

    # Turnover is measured against the drifted previous holdings, or against
    # an all-cash start when nothing was held.
    if len(state['prev_weights']) > 0:
        adjusted_prev = _drifted_weights(
            state['prev_weights'], state['prev_oos_returns'], state['prev_gross_return']
        )
        all_assets = set(adjusted_prev.keys()) | set(common_weights.keys())
        turnover = 0.0
        for asset in all_assets:
            turnover += abs(common_weights.get(asset, 0.0) - adjusted_prev.get(asset, 0.0))
    else:
        turnover = sum(abs(w) for w in common_weights.values())

    tc = transaction_cost * (1 + gross_return) * turnover
    net_return = gross_return - tc

    state['returns'].append(net_return)
    state['dates'].append(current_date)
    state['weights'].append(common_weights.copy())
    state['turnover'].append(turnover)
    state['gross_returns'].append(gross_return)
    state['prev_weights'] = common_weights.copy()
    state['prev_oos_returns'] = {a: oos_returns_dict[a] for a in common_assets}
    state['prev_gross_return'] = gross_return


def _summarise_portfolio(ptype, state):
    """Build the (results_df, metrics) pair for one strategy from its recorded series."""
    results_df = pd.DataFrame({
        'date': state['dates'],
        'portfolio_return': state['returns'],
        'portfolio_gross_return': state['gross_returns'],
        'portfolio_weights': state['weights'],
        'portfolio_turnover': state['turnover']
    })
    results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1

    if len(state['returns']) == 0:
        metrics = {
            'portfolio_type': ptype,
            'mean_return': 0,
            'variance': 0,
            'sharpe_ratio': 0,
            'annual_return': 0,
            'annual_volatility': 0,
            'annual_sharpe_ratio': 0,
            'total_return': 0,
            'avg_turnover': 0,
            'n_periods': 0,
            'n_zero_periods': 0
        }
        return results_df, metrics

    mean_return = np.mean(state['returns'])
    variance = np.var(state['returns'], ddof=1)
    sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0

    # Annualisation multiplies per-period mean and variance by 12.
    annual_return = mean_return * 12
    annual_volatility = np.sqrt(variance * 12)
    annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0

    metrics = {
        'portfolio_type': ptype,
        'mean_return': mean_return,
        'variance': variance,
        'sharpe_ratio': sharpe_ratio,
        'annual_return': annual_return,
        'annual_volatility': annual_volatility,
        'annual_sharpe_ratio': annual_sharpe,
        'total_return': results_df['cumulative_return'].iloc[-1],
        'avg_turnover': np.mean(state['turnover']),
        'n_periods': len(state['returns']),
        'n_zero_periods': sum(1 for r in state['returns'] if r == 0)
    }
    return results_df, metrics


def backtest_nodewise_all_portfolios(df, 
                                     factors_path='factors_ff_monthly_raw.csv',
                                     ic='GIC',
                                     target_return=0.01,
                                     test_start_date='2020-01-31', 
                                     test_end_date='2024-11-30',
                                     lookback_window=180,
                                     transaction_cost=0.001,
                                     buys_path_template='buys_{}.csv',
                                     sells_path_template='sells_{}.csv',
                                     finbert_signals_path=None,
                                     verbose=True):
    """
    Backtest all three portfolio strategies (GMV, MV, MSR) simultaneously.

    For every date in the test range the investable universe is the
    intersection of that year's buy/sell permnos and that date's FinBERT
    buy/sell permnos (falling back to their union when the intersection has at
    most one name). A nodewise precision matrix is estimated on the trailing
    `lookback_window` dates, the three weight vectors are derived from it, and
    each strategy's gross return, turnover and net-of-cost return are
    recorded. Periods in which no portfolio can be formed are recorded as a
    liquidation of the previous holdings; periods with missing factor values
    are skipped without a record.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: permno, datadate, ticker, ret_fwd_1
    factors_path : str
        Path to Fama-French factors CSV
    ic : str
        Information criterion: 'WIC', 'BIC', 'GIC', 'AIC', or 'cv'
    target_return : float
        Target return for MV portfolio (default: 0.01 = 1% monthly)
    test_start_date : str
        First date for out-of-sample returns
    test_end_date : str
        Last date for out-of-sample returns
    lookback_window : int
        Number of months in rolling training window
    transaction_cost : float
        Proportional transaction cost
    buys_path_template : str
        Template for buys file path
    sells_path_template : str
        Template for sells file path
    finbert_signals_path : str or None
        Path to FinBERT signals CSV file
    verbose : bool
        If True, prints detailed log (including liquidation lines); if False
        the backtest itself prints nothing.

    Returns:
    --------
    results_dict : dict
        Dictionary with keys 'GMV', 'MV', 'MSR', each containing (results_df, metrics)

    Raises:
    -------
    ValueError
        If required columns are missing, a test date is not present in `df`,
        there are fewer than `lookback_window` dates before the test start, or
        a required factor date is missing.
    """
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])

    factors_df = load_ff_factors(factors_path)

    if verbose:
        print("Creating ticker to permno mapping...")
    ticker_to_permno = create_ticker_to_permno_mapping(df)
    if verbose:
        print(f"Mapped {len(ticker_to_permno)} unique tickers to permnos")

    finbert_df = None
    if finbert_signals_path is not None:
        finbert_df = load_finbert_signals(finbert_signals_path)
        if verbose and len(finbert_df) > 0:
            print(f"Loaded FinBERT signals: {len(finbert_df)} monthly records")
            print("FinBERT signal distribution:")
            print(finbert_df['signal'].value_counts())

    all_dates = sorted(df['datadate'].unique())
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)

    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}") from e

    if test_start_idx < lookback_window:
        raise ValueError("Not enough data for lookback.")

    portfolios = {ptype: _new_portfolio_state() for ptype in _PORTFOLIO_TYPES}
    yearly_signals_cache = {}
    n_periods = test_end_idx - test_start_idx + 1

    if verbose:
        print("="*60)
        print("STARTING BACKTEST: ALL PORTFOLIOS (GMV, MV, MSR)")
        print("="*60)

    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        current_year = current_date.year
        step_header = (f"\n[{t - test_start_idx + 1}/{n_periods}] "
                       f"Date: {current_date.strftime('%Y-%m-%d')}")

        # Yearly buy/sell lists are read once per calendar year.
        if current_year not in yearly_signals_cache:
            yearly_signals_cache[current_year] = load_yearly_signals(
                current_year, buys_path_template, sells_path_template
            )
        yearly_permnos = yearly_signals_cache[current_year]

        finbert_permnos = set()
        if finbert_df is not None and len(finbert_df) > 0:
            finbert_permnos = get_finbert_permnos_for_date(finbert_df, ticker_to_permno, current_date)

        # Prefer names flagged by both sources; widen to the union when that
        # leaves at most one name.
        allowed_permnos = yearly_permnos.intersection(finbert_permnos)
        if len(allowed_permnos) <= 1:
            allowed_permnos = yearly_permnos.union(finbert_permnos)

        oos_data = df[(df['datadate'] == current_date) & (df['permno'].isin(allowed_permnos))]
        oos_returns_dict = oos_data.set_index('permno')['ret_fwd_1'].dropna().to_dict()

        if len(allowed_permnos) == 0:
            if verbose:
                print(step_header)
                print(f"  ⚠ No signals, recording zero return for all portfolios")
            for ptype in _PORTFOLIO_TYPES:
                _record_exit(portfolios[ptype], current_date, transaction_cost, verbose)
            continue

        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]

        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date) &
                        (df['permno'].isin(allowed_permnos))]

        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates)

        # ret_fwd_1 stored at date d is looked up against the factor row of the
        # following month-end.
        factor_dates = [(d + pd.DateOffset(months=1) + pd.offsets.MonthEnd(0)) for d in window_dates]

        try:
            factors_window = factors_df.loc[factor_dates]
        except KeyError as e:
            raise ValueError(f"Factor dates not found: {e}")

        if factors_window.isna().any().any():
            if verbose:
                print(step_header)
                print(f"  ⚠ Missing factor data, skipping")
            continue

        # Keep only assets with a complete return history over the window.
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)

        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        factors = factors_window.values
        n_train, p_current = Y.shape

        if verbose:
            print(f"{step_header} | Year: {current_year}")
            print(f"  Window: {window_start_date.strftime('%Y-%m-%d')} to "
                  f"{window_end_date.strftime('%Y-%m-%d')}")
            print(f"  Yearly: {len(yearly_permnos)} | FinBERT: {len(finbert_permnos)} | "
                  f"Union/Intersection: {len(allowed_permnos)} | Assets: {p_current}")

        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  ⚠ Insufficient data (n={n_train}, p={p_current})")
            for ptype in _PORTFOLIO_TYPES:
                _record_exit(portfolios[ptype], current_date, transaction_cost, verbose)
            continue

        try:
            if verbose:
                print(f"  Running Nodewise Regression...")
            Theta_hat = est_ndwcov_factor(Y, factors, ic=ic, lambda_min=True)

            mu = Y.mean(axis=0)

            if verbose:
                print(f"  Computing weights for all portfolios...")

            weights_dict = {
                'GMV': gmv_weights(Theta_hat),
                'MV': mv_weights(Theta_hat, mu, target_return=target_return),
                'MSR': msr_weights(Theta_hat, mu)
            }

        except Exception as e:
            # Deliberate best-effort: an estimation failure in one period is
            # treated as "no portfolio this period" rather than aborting the run.
            if verbose:
                print(f"  ✗ Error: {e}")
            for ptype in _PORTFOLIO_TYPES:
                _record_exit(portfolios[ptype], current_date, transaction_cost, verbose)
            continue

        for ptype in _PORTFOLIO_TYPES:
            _rebalance_portfolio(
                portfolios[ptype], weights_dict[ptype], current_assets,
                oos_returns_dict, current_date, transaction_cost, verbose
            )

        if verbose:
            print(f"  GMV Net: {portfolios['GMV']['returns'][-1]:>8.5f} | "
                  f"MV Net: {portfolios['MV']['returns'][-1]:>8.5f} | "
                  f"MSR Net: {portfolios['MSR']['returns'][-1]:>8.5f}")

    if verbose:
        print("\n" + "="*60)
        print("BACKTEST COMPLETE - ALL PORTFOLIOS")
        print("="*60)

    results_dict = {}
    for ptype in _PORTFOLIO_TYPES:
        results_df, metrics = _summarise_portfolio(ptype, portfolios[ptype])
        results_dict[ptype] = (results_df, metrics)

        if verbose:
            print(f"\n{ptype} Portfolio:")
            print(f"  Annual Return: {metrics['annual_return']*100:.2f}%")
            print(f"  Annual Volatility: {metrics['annual_volatility']*100:.2f}%")
            print(f"  Annual Sharpe Ratio: {metrics['annual_sharpe_ratio']:.3f}")
            print(f"  Total Return: {metrics['total_return']*100:.2f}%")
            print(f"  Avg Turnover: {metrics['avg_turnover']:.4f}")

    return results_dict

In [2]:
# Load the cleaned panel; 'ncusip' is read with pandas' string dtype.
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
# Build `ret_fwd_1`: for each permno, the `ret_excess` value of the next row.
# NOTE(review): shift(-1) follows row order within each permno, so this presumes
# the file is already sorted by date within permno — confirm.
df['ret_fwd_1'] = df.groupby('permno')['ret_excess'].shift(-1)

In [3]:
# Run the GMV / MV / MSR backtest on the panel loaded above.
results_dict = backtest_nodewise_all_portfolios(
    df,
    factors_path='../AI Portfolio Selection/factors_ff_monthly_raw.csv',
    ic='GIC',  # penalty selection rule; alternatives: 'BIC', 'WIC', 'AIC', 'cv'
    test_start_date='2020-01-31',
    test_end_date='2024-04-30',
    lookback_window=180,
    transaction_cost=0.001,
    buys_path_template='../AI Portfolio Selection/novy_marx_buys_{}.csv',
    sells_path_template='../AI Portfolio Selection/novy_marx_sells_{}.csv',
    finbert_signals_path='monthly_signals_decay.csv',  # FinBERT monthly signal file
    verbose=True
)

# Each entry of results_dict is a (results_df, metrics) pair.
gmv_results, gmv_metrics = results_dict['GMV']
mv_results, mv_metrics = results_dict['MV']
msr_results, msr_metrics = results_dict['MSR']

# Print the annualised Sharpe ratio of each strategy.
print("\nPerformance Summary:")
print(f"GMV Sharpe: {gmv_metrics['annual_sharpe_ratio']:.3f}")
print(f"MV Sharpe:  {mv_metrics['annual_sharpe_ratio']:.3f}")
print(f"MSR Sharpe: {msr_metrics['annual_sharpe_ratio']:.3f}")

Creating ticker to permno mapping...
Mapped 1664 unique tickers to permnos
Loaded FinBERT signals: 24780 monthly records
FinBERT signal distribution:
signal
hold    23840
sell      529
buy       411
Name: count, dtype: int64
STARTING BACKTEST: ALL PORTFOLIOS (GMV, MV, MSR)

[1/52] Date: 2020-01-31 | Year: 2020
  Window: 2005-01-31 to 2019-12-31
  Yearly: 300 | FinBERT: 8 | Union/Intersection: 6 | Assets: 5
  Running Nodewise Regression...
  Computing weights for all portfolios...
  GMV Net: -0.11976 | MV Net: -0.12099 | MSR Net: -0.10926

[2/52] Date: 2020-02-29 | Year: 2020
  Window: 2005-02-28 to 2020-01-31
  Yearly: 300 | FinBERT: 10 | Union/Intersection: 5 | Assets: 4
  Running Nodewise Regression...
  Computing weights for all portfolios...
  GMV Net: -0.05769 | MV Net: -0.09230 | MSR Net: -0.03927

[3/52] Date: 2020-03-31 | Year: 2020
  Window: 2005-03-31 to 2020-02-29
  Yearly: 300 | FinBERT: 14 | Union/Intersection: 10 | Assets: 4
  Running Nodewise Regression...
  Computing we

In [7]:
# Per-strategy summary. The "Mean Return" and "Variance" lines print the
# per-period 'mean_return' and 'variance' metrics multiplied by 12
# (presumably a monthly-to-annual scaling — confirm the data frequency).
print(f"\n GMV")
print(f"Annualized Sharpe Ratio: {gmv_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {gmv_metrics['mean_return']*12:.4f}")
print(f"Variance: {gmv_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {gmv_metrics['avg_turnover']:.4f}")

print(f"\n MV")
print(f"Annualized Sharpe Ratio: {mv_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {mv_metrics['mean_return']*12:.4f}")
print(f"Variance: {mv_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {mv_metrics['avg_turnover']:.4f}")

print(f"\n MSR")
print(f"Annualized Sharpe Ratio: {msr_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {msr_metrics['mean_return']*12:.4f}")
print(f"Variance: {msr_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {msr_metrics['avg_turnover']:.4f}")


 GMV
Annualized Sharpe Ratio: 0.2741
Mean Return: 0.0556
Variance: 0.0412
Avg Turnover: 1.7299

 MV
Annualized Sharpe Ratio: 0.6695
Mean Return: 0.1772
Variance: 0.0701
Avg Turnover: 2.0879

 MSR
Annualized Sharpe Ratio: 0.1955
Mean Return: 0.0427
Variance: 0.0478
Avg Turnover: 2.2173
