In [2]:
def mv_weights(Theta_hat, mu, target_return=0.01):
    """
    Compute Mean-Variance portfolio weights with target return.
    
    Solves the constrained optimization:
    min w' Sigma w  subject to  w' mu = target_return  and  w' 1 = 1
    
    Solution uses Lagrange multipliers with two constraints.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected returns
    target_return : float
        Target portfolio return (default: 0.01 = 1% monthly)
    long_only : bool
        If True, falls back to GMV if MV produces negative weights
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute key quantities
    A = ones_p @ Theta_hat @ ones_p  # 1' Theta 1
    B = ones_p @ Theta_hat @ mu       # 1' Theta mu  
    C = mu @ Theta_hat @ mu           # mu' Theta mu
    D = A * C - B * B                  # Determinant
    
    # Check for singularity
    if np.abs(D) < 1e-10:
        print('SINGULARITY')
        # System is singular, use GMV instead
        if np.abs(A) > 1e-10:
            w_star = (Theta_hat @ ones_p) / A
            return w_star
        else:
            return ones_p / p
    
    
    # Compute Lagrange multipliers
    lambda1 = (C - B * target_return) / D
    lambda2 = (A * target_return - B) / D
    
    # Compute weights: w = lambda1 * Theta^{-1} 1 + lambda2 * Theta^{-1} mu
    w_star = lambda1 * (Theta_hat @ ones_p) + lambda2 * (Theta_hat @ mu)
    
    return w_star

def msr_weights(Theta_hat, mu):
    """
    Compute Maximum Sharpe Ratio portfolio weights.
    
    The maximum Sharpe ratio portfolio solves:
    max (w' mu) / sqrt(w' Sigma w)
    
    Solution (when mu represents excess returns):
    w ∝ Sigma^{-1} mu = Theta mu
    
    Then normalize so that sum(w) = 1.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected excess returns
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights (sum to 1)
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute unnormalized weights: w ∝ Theta mu
    w_unnorm = Theta_hat @ mu
    
    # Normalize to sum to 1
    weight_sum = np.sum(w_unnorm)
    
    if np.abs(weight_sum) < 1e-10:
        print('WARNING: Weight sum near zero, returning equal weights')
        return ones_p / p
    
    w_star = w_unnorm / weight_sum
    
    return w_star

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')


def naive_nodewise_regression(Y_star, lambda_grid=None):
    """
    Implements Naive Nodewise Regression (Section 5.1.2).
    Uses GIC (Generalized Information Criterion) as in the paper.
    
    Parameters:
    -----------
    Y_star : np.ndarray, shape (n, p)
        Demeaned returns matrix (time x assets)
    lambda_grid : list or None
        Grid of lambda values to try. If None, creates default grid.
    
    Returns:
    --------
    Theta_hat : np.ndarray, shape (p, p)
        Estimated precision matrix
    """
    n, p = Y_star.shape

    # Initialize matrices
    Theta_hat = np.zeros((p, p))
    tau_squared = np.zeros(p)
    
    # Create lambda grid if not provided
    if lambda_grid is None:
        lambda_grid = np.logspace(-3, 1, 50)
    
    # For each asset j
    for j in range(p):
        # Step 1: Get y_j (target) and Y_{-j} (predictors)
        y_j = Y_star[:, j]
        Y_minus_j = np.delete(Y_star, j, axis=1)
        
        # Step 2-3: Estimate gamma_j using Lasso with GIC
        best_gic = np.inf
        best_lambda = lambda_grid[0]
        best_gamma = None
        best_ssr = None
        
        for lam in lambda_grid:
            lasso = Lasso(alpha=2*lam, fit_intercept=False, max_iter=10000)
            lasso.fit(Y_minus_j, y_j)
            gamma_j = lasso.coef_
            
            # Compute SSR and number of non-zero coefficients
            residuals = y_j - Y_minus_j @ gamma_j
            ssr = np.sum(residuals ** 2)
            sigma_sq_lambda = ssr / n
            q_lambda = np.sum(np.abs(gamma_j) > 1e-8)
            
            # Compute GIC
            if sigma_sq_lambda > 1e-10:
                gic = np.log(sigma_sq_lambda) + q_lambda * (np.log(p) / n) * np.log(np.log(n))
            else:
                gic = np.inf
            
            if gic < best_gic:
                best_gic = gic
                best_lambda = lam
                best_gamma = gamma_j.copy()
                best_ssr = ssr
        
        gamma_j_star = best_gamma
        tau_squared[j] = best_ssr / n + best_lambda * np.sum(np.abs(gamma_j_star))

        # Step 5: Form the j-th row of Theta_hat
        Theta_hat[j, j] = 1 / tau_squared[j]
        off_diag = -gamma_j_star / tau_squared[j]
        Theta_hat[j, :j] = off_diag[:j]
        Theta_hat[j, j+1:] = off_diag[j:]
    
    # Step 6: Symmetrize
    Theta_hat_sym = (Theta_hat + Theta_hat.T) / 2
    
    return Theta_hat_sym


def gmv_weights(Theta_hat):
    """
    Compute Global Minimum Variance (GMV) portfolio weights (Section 6.1).
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # w* = (Θ 1_p) / (1_p' Θ 1_p)
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    if np.abs(denominator) < 1e-10:
        # Fallback to equal weights if precision matrix is near-singular
        return ones_p / p
    
    w_star = numerator / denominator
    
    return w_star


def load_finbert_signals(signals_path):
    """
    Load FinBERT monthly signals from CSV file.
    
    Parameters:
    -----------
    signals_path : str
        Path to monthly_signals.csv file
    
    Returns:
    --------
    signals_df : pd.DataFrame
        DataFrame with columns: symbol, company, year_month, signal, avg_sentiment_score
    """
    try:
        signals_df = pd.read_csv(signals_path)
        # Convert year_month to datetime (end of month)
        signals_df['date'] = pd.to_datetime(signals_df['year_month']) + pd.offsets.MonthEnd(0)
        return signals_df
    except FileNotFoundError as e:
        print(f"  ⚠ Warning: Could not load FinBERT signals: {e}")
        return pd.DataFrame(columns=['symbol', 'company', 'year_month', 'signal', 'date'])


def get_buy_signal_permnos_for_date(signals_df, ticker_to_permno, date):
    """
    Get set of permnos with 'buy' or 'sell' signals for a specific date.
    
    Parameters:
    -----------
    signals_df : pd.DataFrame
        FinBERT signals dataframe
    ticker_to_permno : dict
        Mapping from ticker symbol to permno
    date : pd.Timestamp
        Date to get signals for
    
    Returns:
    --------
    permno_set : set
        Set of permnos with buy or sell signals on this date
    """
    # Get signals for this date
    date_signals = signals_df[signals_df['date'] == date]
    
    # Filter for buy and sell signals only (exclude hold)
    buy_signals = date_signals[date_signals['signal'] == 'buy']
    sell_signals = date_signals[date_signals['signal'] == 'sell']
    
    # Convert tickers to permnos
    permnos = set()
    for ticker in buy_signals['symbol'].values:
        if ticker in ticker_to_permno:
            permnos.add(ticker_to_permno[ticker])
    for ticker in sell_signals['symbol'].values:
        if ticker in ticker_to_permno:
            permnos.add(ticker_to_permno[ticker])
    
    return permnos


def create_ticker_to_permno_mapping(df):
    """
    Create a mapping from ticker to permno from the returns dataframe.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Returns dataframe with 'ticker' and 'permno' columns
    
    Returns:
    --------
    ticker_to_permno : dict
        Mapping from ticker to permno (uses most recent permno for each ticker)
    """
    if 'ticker' not in df.columns:
        raise ValueError("DataFrame must have 'ticker' column for mapping")
    
    # Drop NaN tickers
    valid_df = df[df['ticker'].notna()].copy()
    
    # Get the most recent permno for each ticker
    ticker_to_permno = valid_df.groupby('ticker')['permno'].last().to_dict()
    
    return ticker_to_permno


def calculate_exit_transaction_cost(prev_weights_dict, prev_oos_returns_dict, 
                                    prev_gross_return, transaction_cost, verbose=False):
    """
    Calculate transaction cost when exiting the market (liquidating all positions).
    Enforces Immediate Liquidation logic:
    - Next period return is 0.0 (Cash)
    - Cost is paid on current portfolio value
    """
    if len(prev_weights_dict) == 0:
        return 0.0, 0.0, 0.0
    
    # Step 1: Adjust previous weights to current period's BEGINNING (drift from t-1 to t)
    adjusted_prev = {}
    for asset, prev_w in prev_weights_dict.items():
        if asset in prev_oos_returns_dict:
            prev_r = prev_oos_returns_dict[asset]
            if abs(1 + prev_gross_return) > 1e-6:
                adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return)
            else:
                adjusted_prev[asset] = 0.0
        else:
            if abs(1 + prev_gross_return) > 1e-6:
                adjusted_prev[asset] = prev_w / (1 + prev_gross_return)
            else:
                adjusted_prev[asset] = 0.0
    
    # Step 2: Turnover (Selling everything to Cash)
    turnover = sum(abs(w) for w in adjusted_prev.values())
    
    # Step 3: Cost 
    # Paper Formula: c * (1 + R_next) * Turnover. 
    # Since R_next (Cash) is 0.0, this simplifies to c * 1.0 * Turnover.
    tc = transaction_cost * 1.0 * turnover
    
    # Step 4: Net Return is 0.0 (Cash return) - Cost
    net_return = -tc
    
    if verbose:
        print(f"  Liquidating positions | Turnover: {turnover:>6.4f} | TC: {tc:>8.6f}")
    
    return turnover, tc, net_return


def backtest_nodewise_gmv_finbert(df, 
                                   signals_path='monthly_signals.csv',
                                   test_start_date='2020-01-31', 
                                   test_end_date='2024-11-30',
                                   lookback_window=180,
                                   transaction_cost=0.001,
                                   mv_target_return=0.01,
                                   verbose=True):
    """
    Backtest Nodewise + GMV strategy using FinBERT buy/sell signals.
    Records zero returns and empty weights when skipping periods.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: permno, datadate, ticker, ret_fwd_1
    signals_path : str
        Path to monthly_signals.csv file
    test_start_date : str
        First date for out-of-sample returns (format: 'YYYY-MM-DD')
    test_end_date : str
        Last date for out-of-sample returns (format: 'YYYY-MM-DD')
    lookback_window : int
        Number of months in rolling training window (default: 180)
    transaction_cost : float
        Proportional transaction cost (default: 0.001 = 10 bps)
    verbose : bool
        If True, prints detailed log at each time step.
    
    Returns:
    --------
    results_df : pd.DataFrame
        DataFrame with columns: date, portfolio_return, cumulative_return
    metrics : dict
        Overall performance metrics
    """
    # --- 1. Setup ---
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])
    
    # Create ticker to permno mapping
    if verbose:
        print("Creating ticker to permno mapping...")
    ticker_to_permno = create_ticker_to_permno_mapping(df)
    if verbose:
        print(f"Mapped {len(ticker_to_permno)} unique tickers to permnos")
    
    # Load FinBERT signals
    if verbose:
        print(f"Loading FinBERT signals from {signals_path}...")
    signals_df = load_finbert_signals(signals_path)
    if len(signals_df) == 0:
        raise ValueError("No FinBERT signals loaded")
    
    if verbose:
        print(f"Loaded {len(signals_df)} monthly signals")
        print(f"Signal distribution:")
        print(signals_df['signal'].value_counts())
    
    # Get unique dates
    all_dates = sorted(df['datadate'].unique())
    
    # Convert test dates to datetime
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    # Find date indices
    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}")
    
    if test_start_idx < lookback_window:
        raise ValueError(f"Not enough data for lookback. Test start date {test_start_date} "
                         f"requires data back to {all_dates[test_start_idx - lookback_window]}, "
                         f"but only {test_start_idx} periods are available.")
    
    # Storage for results - separate for each portfolio type
    portfolio_results = {
        'gmv': {'returns': [], 'dates': [], 'weights': [], 'turnover': [], 'gross_returns': []},
        'mv': {'returns': [], 'dates': [], 'weights': [], 'turnover': [], 'gross_returns': []},
        'msr': {'returns': [], 'dates': [], 'weights': [], 'turnover': [], 'gross_returns': []}
    }
    
    # Track previous state for each portfolio
    prev_state = {
        'gmv': {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0},
        'mv': {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0},
        'msr': {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
    }
    
    # Cache for yearly signals
    yearly_signals_cache = {}
    
    # --- 2. Rolling Window Backtest ---
    if verbose:
        print("="*60)
        print("STARTING BACKTEST WITH FINBERT BUY/SELL SIGNALS")
        print("="*60)
        
    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        current_year = current_date.year
        
        # Get buy/sell signal permnos for current date
        allowed_permnos = get_buy_signal_permnos_for_date(
            signals_df, ticker_to_permno, current_date
        )
        
        # ========================================
        # CRITICAL: Get OOS returns FIRST before any early exits
        # ========================================
        oos_data = df[(df['datadate'] == current_date) & (df['permno'].isin(allowed_permnos))]
        oos_returns_series = oos_data.set_index('permno')['ret_fwd_1']
        oos_returns_series = oos_returns_series.dropna()
        oos_returns_dict = oos_returns_series.to_dict()
        
        # Now handle early exit cases
        if len(allowed_permnos) == 0:
            if verbose:
                print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                      f"Date: {current_date.strftime('%Y-%m-%d')}")
                print(f"  ⚠ No signals for {current_date.strftime('%Y-%m-%d')}, recording zero return")
            
            # Process exit for all three portfolios
            for ptype in ['gmv', 'mv', 'msr']:
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
            continue
        
        # Define the lookback window
        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]
        
        # Get training data for this window
        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date) &
                        (df['permno'].isin(allowed_permnos))]
        
        # Pivot to get returns matrix
        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates)
        
        # Filter assets with any NaNs
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)
        
        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        n_train, p_current = Y.shape
    
        if verbose:
            print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                  f"Date: {current_date.strftime('%Y-%m-%d')} | Year: {current_year}")
            print(f"  Window: {window_start_date.strftime('%Y-%m-%d')} to "
                  f"{window_end_date.strftime('%Y-%m-%d')}")
            print(f" | FinBERT: {len(allowed_permnos)} | Assets w/ data: {p_current}")
    
        # Check for valid data
        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  ⚠ Insufficient data (n={n_train}, p={p_current}), recording zero return")
            
            for ptype in ['gmv', 'mv', 'msr']:
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
            continue
        
        try:
            # Demean the returns
            Y_bar = Y.mean(axis=0)
            Y_star = Y - Y_bar
            
            if verbose:
                print(f"  Running Nodewise Regression...")
            Theta_hat = naive_nodewise_regression(Y_star)
            
            if verbose:
                print(f"  Computing portfolio weights...")

            mu = Y.mean(axis=0)
            
            # Compute weights for all three portfolios
            weights_dict = {
                'gmv': gmv_weights(Theta_hat),
                'mv': mv_weights(Theta_hat, mu, target_return=mv_target_return),
                'msr': msr_weights(Theta_hat, mu)
            }
            
            # Create weights dictionaries for each portfolio
            new_weights_dicts = {
                ptype: {asset: weights_dict[ptype][i] for i, asset in enumerate(current_assets)}
                for ptype in ['gmv', 'mv', 'msr']
            }
            
        except Exception as e:
            if verbose:
                print(f"  ✗ Error: {e}")
                print(f"  Recording zero return for all portfolios")
            
            for ptype in ['gmv', 'mv', 'msr']:
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
            continue

        # Process each portfolio type
        for ptype in ['gmv', 'mv', 'msr']:
            new_weights_dict = new_weights_dicts[ptype]
            
            # Normalize weights to sum to 1
            weight_sum = sum(new_weights_dict.values())
            if weight_sum > 1e-10:
                new_weights_dict = {k: v/weight_sum for k, v in new_weights_dict.items()}
            else:
                if verbose and ptype == 'gmv':  # Only print once
                    print(f"  ⚠ Zero weight sum for {ptype.upper()}, recording zero return")
                
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
                continue
            
            # Find common assets between weights and returns
            common_assets = set(new_weights_dict.keys()) & set(oos_returns_dict.keys())
            
            if len(common_assets) == 0:
                if verbose and ptype == 'gmv':
                    print(f"  ⚠ No common assets with valid returns")
                
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
                continue
            
            # Filter to common assets and renormalize
            common_weights = {a: new_weights_dict[a] for a in common_assets}
            common_weight_sum = sum(common_weights.values())
            if common_weight_sum > 1e-10:
                common_weights = {k: v/common_weight_sum for k, v in common_weights.items()}
            else:
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
                continue
            
            # Compute gross portfolio return
            gross_return = sum(common_weights[a] * oos_returns_dict[a] for a in common_assets)
            
            # Sanity check
            if np.isnan(gross_return) or np.isinf(gross_return):
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    prev_state[ptype]['weights_dict'], 
                    prev_state[ptype]['oos_returns_dict'], 
                    prev_state[ptype]['gross_return'], 
                    transaction_cost,
                    verbose=False
                )
                
                portfolio_results[ptype]['returns'].append(net_return)
                portfolio_results[ptype]['dates'].append(current_date)
                portfolio_results[ptype]['weights'].append({})
                portfolio_results[ptype]['turnover'].append(turnover)
                portfolio_results[ptype]['gross_returns'].append(0.0)
                
                prev_state[ptype] = {'weights_dict': {}, 'oos_returns_dict': {}, 'gross_return': 0.0}
                continue
            
            # Calculate transaction costs
            if len(prev_state[ptype]['weights_dict']) > 0:
                # Adjust previous weights for returns
                adjusted_prev = {}
                prev_weights_dict = prev_state[ptype]['weights_dict']
                prev_oos_returns_dict = prev_state[ptype]['oos_returns_dict']
                prev_gross_return = prev_state[ptype]['gross_return']
                
                for asset, prev_w in prev_weights_dict.items():
                    if asset in prev_oos_returns_dict:
                        prev_r = prev_oos_returns_dict[asset]
                        if abs(1 + prev_gross_return) > 1e-6:
                            adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return)
                        else:
                            adjusted_prev[asset] = 0.0
                    else:
                        if abs(1 + prev_gross_return) > 1e-6:
                            adjusted_prev[asset] = prev_w / (1 + prev_gross_return)
                        else:
                            adjusted_prev[asset] = 0.0
                
                # Calculate turnover
                all_assets = set(adjusted_prev.keys()) | set(common_weights.keys())
                turnover = 0.0
                for asset in all_assets:
                    old_w = adjusted_prev.get(asset, 0.0)
                    new_w = common_weights.get(asset, 0.0)
                    turnover += abs(new_w - old_w)
                
                tc = transaction_cost * (1 + gross_return) * turnover
            else:
                # First period
                turnover = sum(abs(w) for w in common_weights.values())
                tc = transaction_cost * (1 + gross_return) * turnover
            
            # Net return
            net_return = gross_return - tc
            
            # Store results
            portfolio_results[ptype]['returns'].append(net_return)
            portfolio_results[ptype]['dates'].append(current_date)
            portfolio_results[ptype]['weights'].append(common_weights.copy())
            portfolio_results[ptype]['turnover'].append(turnover)
            portfolio_results[ptype]['gross_returns'].append(gross_return)
            
            # Update previous state
            prev_state[ptype] = {
                'weights_dict': common_weights.copy(),
                'oos_returns_dict': {a: oos_returns_dict[a] for a in common_assets},
                'gross_return': gross_return
            }
        
        if verbose:
            print(f"  GMV  - Gross: {portfolio_results['gmv']['gross_returns'][-1]:>8.5f} | "
                  f"Turnover: {portfolio_results['gmv']['turnover'][-1]:>6.4f} | "
                  f"Net: {portfolio_results['gmv']['returns'][-1]:>8.5f}")
            print(f"  MV   - Gross: {portfolio_results['mv']['gross_returns'][-1]:>8.5f} | "
                  f"Turnover: {portfolio_results['mv']['turnover'][-1]:>6.4f} | "
                  f"Net: {portfolio_results['mv']['returns'][-1]:>8.5f}")
            print(f"  MSR  - Gross: {portfolio_results['msr']['gross_returns'][-1]:>8.5f} | "
                  f"Turnover: {portfolio_results['msr']['turnover'][-1]:>6.4f} | "
                  f"Net: {portfolio_results['msr']['returns'][-1]:>8.5f}")

    if verbose:
        print("\n" + "="*60)
        print("BACKTEST COMPLETE")
        print("="*60)
    
    # --- 4. Compile Results for Each Portfolio ---
    results_dict = {}
    
    for ptype in ['gmv', 'mv', 'msr']:
        results_df = pd.DataFrame({
            'date': portfolio_results[ptype]['dates'],
            'portfolio_return': portfolio_results[ptype]['returns'],
            'portfolio_gross_return': portfolio_results[ptype]['gross_returns'],
            'portfolio_weights': portfolio_results[ptype]['weights'],
            'portfolio_turnover': portfolio_results[ptype]['turnover']
        })
        results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
        
        # Compute overall metrics
        if len(portfolio_results[ptype]['returns']) > 0:
            returns = portfolio_results[ptype]['returns']
            mean_return = np.mean(returns)
            variance = np.var(returns, ddof=1)
            sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
            
            # Annualized metrics (monthly data)
            annual_return = mean_return * 12
            annual_volatility = np.sqrt(variance * 12)
            annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
            
            metrics = {
                'mean_return': mean_return,
                'variance': variance,
                'sharpe_ratio': sharpe_ratio,
                'annual_return': annual_return,
                'annual_volatility': annual_volatility,
                'annual_sharpe_ratio': annual_sharpe,
                'total_return': results_df['cumulative_return'].iloc[-1],
                'avg_turnover': np.mean(portfolio_results[ptype]['turnover']),
                'n_periods': len(returns),
                'n_zero_periods': sum(1 for r in returns if r == 0)
            }
        else:
            metrics = {
                'mean_return': 0,
                'variance': 0,
                'sharpe_ratio': 0,
                'annual_return': 0,
                'annual_volatility': 0,
                'annual_sharpe_ratio': 0,
                'total_return': 0,
                'avg_turnover': 0,
                'n_periods': 0,
                'n_zero_periods': 0
            }
        
        results_dict[ptype] = (results_df, metrics)
    
    return results_dict

In [6]:
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = df.groupby('permno')['ret_excess'].shift(-1)

In [9]:
results = backtest_nodewise_gmv_finbert(
    df=df,
    signals_path='monthly_signals_decay.csv',
    test_start_date='2020-01-31',
    test_end_date='2024-04-30',
    lookback_window=180,
    transaction_cost=0.001,
    verbose=True
)

# Access individual results
gmv_df, gmv_metrics = results['gmv']
mv_df, mv_metrics = results['mv']
msr_df, msr_metrics = results['msr']

# Compare Sharpe ratios
print(f"GMV Sharpe: {gmv_metrics['annual_sharpe_ratio']:.3f}")
print(f"MV Sharpe: {mv_metrics['annual_sharpe_ratio']:.3f}")
print(f"MSR Sharpe: {msr_metrics['annual_sharpe_ratio']:.3f}")

Creating ticker to permno mapping...
Mapped 1664 unique tickers to permnos
Loading FinBERT signals from monthly_signals_decay.csv...
Loaded 24780 monthly signals
Signal distribution:
signal
hold    23840
sell      529
buy       411
Name: count, dtype: int64
STARTING BACKTEST WITH FINBERT BUY/SELL SIGNALS

[1/52] Date: 2020-01-31 | Year: 2020
  Window: 2005-01-31 to 2019-12-31
 | FinBERT: 8 | Assets w/ data: 7
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV  - Gross: -0.10884 | Turnover: 1.0000 | Net: -0.10974
  MV   - Gross: -0.10972 | Turnover: 1.0000 | Net: -0.11061
  MSR  - Gross: -0.10316 | Turnover: 1.0517 | Net: -0.10410

[2/52] Date: 2020-02-29 | Year: 2020
  Window: 2005-02-28 to 2020-01-31
 | FinBERT: 10 | Assets w/ data: 5
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV  - Gross: -0.11698 | Turnover: 1.5135 | Net: -0.11831
  MV   - Gross: -0.12823 | Turnover: 1.8716 | Net: -0.12986
  MSR  - Gross: -0.11501 | Turnover: 1.5653 | N

In [10]:
results_df['portfolio_weights']

0     {54148: 0.06353613521591844, 23819: 0.07179758...
1     {52090: 0.5199768423883128, 10516: 0.185415790...
2     {16600: 0.6213324246202543, 56274: 0.378667575...
3     {82651: 0.21889998797634508, 86356: 0.12040648...
4     {46578: 0.35592928494166126, 52090: 0.32199116...
5     {46578: 0.5558684147396795, 77274: 0.226354802...
6     {77274: 0.3243599232801598, 84373: 0.396881631...
7     {46578: 0.5265825877318946, 49154: 0.205580092...
8     {87137: 0.015647930256273854, 88661: 0.0365193...
9     {21186: 0.10514897352831676, 23819: 0.03840542...
10    {19393: 0.13821660022700588, 17005: 0.14269574...
11    {60442: 0.5918406709320633, 52708: 0.197410768...
12    {35044: 0.07500460072647971, 15720: 0.31514302...
13    {15720: 0.26891582893081234, 57904: 0.09463949...
14    {21186: 0.08482901712043692, 49154: 0.14388740...
15    {41355: 0.049701389511887795, 11308: 0.3051606...
16    {16600: 0.5366823602003937, 87137: 0.041768132...
17    {52038: 0.17018974186765423, 11308: 0.3151