In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import warnings

warnings.filterwarnings('ignore')

from multiprocessing import get_context
import tensorflow as tf
import time
from decimal import Decimal, ROUND_HALF_UP
from scipy.optimize import brentq

import scipy.linalg as sl
def afm_est(Y, NF):

    n = Y.shape[0]
    p = Y.shape[1]
    
    # L'L normalization
    ev_ = sl.eigh(np.cov(Y))

    # Sort eigenvalues in descending order
    indx_ev = ev_[0].argsort()[::-1]
    # Get eigenvectors
    evec = ev_[1][:, indx_ev]

    # Determining Factors
    F = np.sqrt(n) * evec[:, 0:NF]

    # Factorloadings
    L = Y.T @ F/n

    """
    # F'F normalization
    ev_ = sl.eigh(np.cov(Y.T))

    # Sort eigenvalues in descending order
    indx_ev = ev_[0].argsort()[::-1]
    # Get eigenvectors
    evec = ev_[1][:, indx_ev]

    # Determining Factors
    L = np.sqrt(p) * evec[:, 0:NF]
    # Factorloadings
    F = Y @ L/p
    """
    resd = Y - F @ L.T

    ret_ = {'F': F, 'L': L, 'resd': resd}

    return ret_

def nf_bn(Y, nf_max):

    n = Y.shape[0]
    p = Y.shape[1]

    IC1 = np.empty((nf_max, 1))
    IC1[:] = np.nan
    IC2 = np.empty((nf_max, 1))
    IC2[:] = np.nan
    IC3 = np.empty((nf_max, 1))
    IC3[:] = np.nan

    for ii in range(1, nf_max+1):
        ret_afm = afm_est(Y, ii)

        V = np.mean(ret_afm['resd']**2)
        
        #Information criteria
        #IC1
        IC1[ii-1, 0] = np.log(V) + ii * ((p+n)/(p*n) * np.log((p*n)/(p+n)))
        #IC2
        IC2[ii-1, 0] = np.log(V) + ii * ((p+n)/(p*n) * np.log(min(n,p)))
        #IC3
        IC3[ii-1, 0] = np.log(V) + ii * (np.log(min(n,p)) / (min(n,p)))
    
    ICs = np.empty((3, 2))
    ICs[:] = np.nan

    ICs[0, 0] = IC1.argmin()
    ICs[1, 0] = IC2.argmin()
    ICs[2, 0] = IC3.argmin()

    ICs[0, 1] = IC1[int(ICs[0, 0])]
    ICs[1, 1] = IC2[int(ICs[1, 0])]
    ICs[2, 1] = IC3[int(ICs[2, 0])]

    ret_ = {'num_f': ICs[:,0], 'ICs': ICs[:,1]}

    return ret_

def soft_t(z, a):
  t1 = np.sign(z)
  b = np.abs(z) - a
  t2 = b * (b >= 0)
  z_t = t1 * t2
  return z_t

def cov_e_poet(resd, C, N, T):
    rate_thres = 1/np.sqrt(N) + np.sqrt((np.log(N))/T)
    # lam = rate_thres * C * np.ones(shape=(N,N))
    
    sig_e_samp = np.cov(resd.T)
    
    
    thet_par = np.empty((N, N))
    thet_par[:] = np.nan
    
    for ii in range(0, N):
        for jj in range(0, N):
            thet_par[ii, jj] = np.mean((resd[:, ii] * resd[:, jj] - sig_e_samp[ii, jj])**2)
    
    lam = rate_thres * C * np.sqrt(thet_par)
    
    """
    sig_e_diag=np.diag(np.sqrt(np.diag(sig_e_samp)))
    R = np.linalg.inv(sig_e_diag) @ sig_e_samp @ np.linalg.inv(sig_e_diag); 
    M = soft_t(R, lam)
    np.fill_diagonal(M, 1)
    sig_e_hat = sig_e_diag @ M @ sig_e_diag
    """

    sig_e_diag = np.diag(sig_e_samp)
    sig_e_hat = soft_t(sig_e_samp, lam)
    np.fill_diagonal(sig_e_hat, sig_e_diag)

    return sig_e_hat

def poet(Y_star,NF_max):
    n = Y_star.shape[0]
    p = Y_star.shape[1]
    
    num_f = nf_bn(Y_star, NF_max)['num_f']
    num_f = int(num_f[0])
    est_afm = afm_est(Y_star, num_f)

    F = est_afm['F']
    L = est_afm['L']
    resd = est_afm['resd']
    sigma_u_hat = cov_e_poet(resd, 2, p, n)
    sigma_u_hat_inv = np.linalg.inv(sigma_u_hat)
    
    A=L.T @ sigma_u_hat_inv @ L
    I=np.eye(A.shape[0])
    
    Theta_hat = sigma_u_hat_inv - sigma_u_hat_inv @ L @ np.linalg.inv(I+A) @ L.T @ sigma_u_hat_inv
    return Theta_hat


def gmv_weights(Theta_hat):
    """
    Compute Global Minimum Variance (GMV) portfolio weights (Section 6.1).
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # w* = (Θ 1_p) / (1_p' Θ 1_p)
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    if np.abs(denominator) < 1e-10:
        # Fallback to equal weights if precision matrix is near-singular
        return ones_p / p
    
    w_star = numerator / denominator
    
    return w_star

def mv_weights(Theta_hat, mu, target_return=0.01):
    """
    Compute Mean-Variance portfolio weights with target return.
    
    Solves the constrained optimization:
    min w' Sigma w  subject to  w' mu = target_return  and  w' 1 = 1
    
    Solution uses Lagrange multipliers with two constraints.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected returns
    target_return : float
        Target portfolio return (default: 0.01 = 1% monthly)
    long_only : bool
        If True, falls back to GMV if MV produces negative weights
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute key quantities
    A = ones_p @ Theta_hat @ ones_p  # 1' Theta 1
    B = ones_p @ Theta_hat @ mu       # 1' Theta mu  
    C = mu @ Theta_hat @ mu           # mu' Theta mu
    D = A * C - B * B                  # Determinant
    
    # Check for singularity
    if np.abs(D) < 1e-10:
        print('SINGULARITY')
        # System is singular, use GMV instead
        if np.abs(A) > 1e-10:
            w_star = (Theta_hat @ ones_p) / A
            return w_star
        else:
            return ones_p / p
    
    
    # Compute Lagrange multipliers
    lambda1 = (C - B * target_return) / D
    lambda2 = (A * target_return - B) / D
    
    # Compute weights: w = lambda1 * Theta^{-1} 1 + lambda2 * Theta^{-1} mu
    w_star = lambda1 * (Theta_hat @ ones_p) + lambda2 * (Theta_hat @ mu)
    
    return w_star

def msr_weights(Theta_hat, mu):
    """
    Compute Maximum Sharpe Ratio portfolio weights.
    
    The maximum Sharpe ratio portfolio solves:
    max (w' mu) / sqrt(w' Sigma w)
    
    Solution (when mu represents excess returns):
    w ∝ Sigma^{-1} mu = Theta mu
    
    Then normalize so that sum(w) = 1.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected excess returns
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights (sum to 1)
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute unnormalized weights: w ∝ Theta mu
    w_unnorm = Theta_hat @ mu
    
    # Normalize to sum to 1
    weight_sum = np.sum(w_unnorm)
    
    if np.abs(weight_sum) < 1e-10:
        print('WARNING: Weight sum near zero, returning equal weights')
        return ones_p / p
    
    w_star = w_unnorm / weight_sum
    
    return w_star


def load_yearly_signals(year, buys_path_template='buys_{}.csv', sells_path_template='sells_{}.csv'):
    """
    Load buy and sell signals for a specific year.
    
    Parameters:
    -----------
    year : int
        Year to load signals for
    buys_path_template : str
        Template for buys file path (use {} for year placeholder)
    sells_path_template : str
        Template for sells file path (use {} for year placeholder)
    
    Returns:
    --------
    permno_set : set
        Set of permnos in the buy and sell signals for this year
    """
    try:
        buys = pd.read_csv(buys_path_template.format(year), index_col=1)
        sells = pd.read_csv(sells_path_template.format(year), index_col=1)
        
        buys.index.name = 'permno'
        sells.index.name = 'permno'
        
        buys_index = buys.index.astype(int)
        sells_index = sells.index.astype(int)
        
        return set(buys_index.union(sells_index))
    except FileNotFoundError as e:
        print(f"  ⚠ Warning: Could not load signals for year {year}: {e}")
        return set()

def load_finbert_signals(signals_path):
    """Load FinBERT monthly signals from CSV file."""
    try:
        signals_df = pd.read_csv(signals_path)
        signals_df['date'] = pd.to_datetime(signals_df['year_month']) + pd.offsets.MonthEnd(0)
        return signals_df
    except FileNotFoundError as e:
        print(f"  ⚠ Warning: Could not load FinBERT signals: {e}")
        return pd.DataFrame(columns=['symbol', 'company', 'year_month', 'signal', 'date'])


def get_finbert_permnos_for_date(signals_df, ticker_to_permno, date):
    """Get set of permnos with 'buy' or 'sell' signals for a specific date."""
    date_signals = signals_df[signals_df['date'] == date]
    buy_signals = date_signals[date_signals['signal'] == 'buy']
    sell_signals = date_signals[date_signals['signal'] == 'sell']
    
    permnos = set()
    for ticker in buy_signals['symbol'].values:
        if ticker in ticker_to_permno:
            permnos.add(ticker_to_permno[ticker])
    for ticker in sell_signals['symbol'].values:
        if ticker in ticker_to_permno:
            permnos.add(ticker_to_permno[ticker])
    
    return permnos


def create_ticker_to_permno_mapping(df):
    """Create a mapping from ticker to permno."""
    if 'ticker' not in df.columns:
        raise ValueError("DataFrame must have 'ticker' column for mapping")
    
    valid_df = df[df['ticker'].notna()].copy()
    ticker_to_permno = valid_df.groupby('ticker')['permno'].last().to_dict()
    
    return ticker_to_permno


def calculate_exit_transaction_cost(prev_weights_dict, prev_oos_returns_dict, 
                                    prev_gross_return, transaction_cost, verbose=False):
    """Calculate transaction cost when exiting the market (liquidating all positions)."""
    if len(prev_weights_dict) == 0:
        return 0.0, 0.0, 0.0
    
    adjusted_prev = {}
    for asset, prev_w in prev_weights_dict.items():
        if asset in prev_oos_returns_dict:
            prev_r = prev_oos_returns_dict[asset]
            if abs(1 + prev_gross_return) > 1e-6:
                adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return)
            else:
                adjusted_prev[asset] = 0.0
        else:
            if abs(1 + prev_gross_return) > 1e-6:
                adjusted_prev[asset] = prev_w / (1 + prev_gross_return)
            else:
                adjusted_prev[asset] = 0.0
    
    turnover = sum(abs(w) for w in adjusted_prev.values())
    tc = transaction_cost * 1.0 * turnover
    net_return = -tc
    
    if verbose:
        print(f"  Liquidating positions | Turnover: {turnover:>6.4f} | TC: {tc:>8.6f}")
    
    return turnover, tc, net_return



def backtest_dnn_yearly(df, 
                          test_start_date='2020-01-31', 
                          test_end_date='2024-11-30',
                          lookback_window=180,
                          transaction_cost=0.001,
                          buys_path_template='buys_{}.csv',
                          sells_path_template='sells_{}.csv',
                          finbert_signals_path=None,  # NEW PARAMETER
                          data_factor=None,
                          verbose=True):
    """
    Backtest DNN-FM with FinBERT + LLM signals using GMV/MV/MSR strategies.
    
    KEY CHANGES:
    1. Added finbert_signals_path parameter
    2. Added ticker to permno mapping creation
    3. Added FinBERT signal loading and processing
    4. Combined yearly and FinBERT signals using union/intersection logic
    5. Added calculate_exit_transaction_cost for proper liquidation handling
    """
    
    # [Keep all existing setup code]
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])
    
    # NEW: Create ticker to permno mapping
    if verbose:
        print("Creating ticker to permno mapping...")
    ticker_to_permno = create_ticker_to_permno_mapping(df)
    if verbose:
        print(f"Mapped {len(ticker_to_permno)} unique tickers to permnos")
    
    # NEW: Load FinBERT signals if provided
    finbert_df = None
    if finbert_signals_path is not None:
        finbert_df = load_finbert_signals(finbert_signals_path)
        if verbose and len(finbert_df) > 0:
            print(f"Loaded FinBERT signals: {len(finbert_df)} monthly records")
            print(f"FinBERT signal distribution:")
            print(finbert_df['signal'].value_counts())
    
    # [Keep all existing date and storage setup]
    all_dates = sorted(df['datadate'].unique())
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}")
    
    # Storage for results - GMV
    portfolio_returns = []
    portfolio_dates = []
    portfolio_weights_list = []
    portfolio_turnover_list = []
    portfolio_gross_returns = []
    
    # Storage for results - MV
    portfolio_returns_2 = []
    portfolio_dates_2 = []
    portfolio_weights_list_2 = []
    portfolio_turnover_list_2 = []
    portfolio_gross_returns_2 = []
    
    # Storage for results - MSR
    portfolio_returns_3 = []
    portfolio_dates_3 = []
    portfolio_weights_list_3 = []
    portfolio_turnover_list_3 = []
    portfolio_gross_returns_3 = []
    
    # Track weights by permno - GMV
    prev_weights_dict = {}
    prev_oos_returns_dict = {}
    prev_gross_return = 0.0
    
    # Track weights by permno - MV
    prev_weights_dict_2 = {}
    prev_oos_returns_dict_2 = {}
    prev_gross_return_2 = 0.0
    
    # Track weights by permno - MSR
    prev_weights_dict_3 = {}
    prev_oos_returns_dict_3 = {}
    prev_gross_return_3 = 0.0
    
    # Cache for yearly signals
    yearly_signals_cache = {}
    
    # --- 2. Rolling Window Backtest ---
    if verbose:
        print("="*60)
        print("STARTING BACKTEST WITH DNN-FM + YEARLY + FINBERT SIGNALS")
        print("="*60)
        
    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        current_year = current_date.year
        
        # Load yearly signals
        if current_year not in yearly_signals_cache:
            yearly_signals_cache[current_year] = load_yearly_signals(
                current_year, buys_path_template, sells_path_template
            )
        
        yearly_permnos = yearly_signals_cache[current_year]
        
        # NEW: Get FinBERT signals for current date
        finbert_permnos = set()
        if finbert_df is not None and len(finbert_df) > 0:
            finbert_permnos = get_finbert_permnos_for_date(finbert_df, ticker_to_permno, current_date)
        
        # NEW: Combine yearly and FinBERT signals
        allowed_permnos = yearly_permnos.intersection(finbert_permnos)
        if len(allowed_permnos) <= 1:
            allowed_permnos = yearly_permnos.union(finbert_permnos)
        
        # NEW: Get OOS returns FIRST (critical for exit transaction cost calculation)
        oos_data = df[(df['datadate'] == current_date) & (df['permno'].isin(allowed_permnos))]
        oos_returns_series = oos_data.set_index('permno')['ret_fwd_1']
        oos_returns_series = oos_returns_series.dropna()
        oos_returns_dict = oos_returns_series.to_dict()
        
        # NEW: Handle no signals case with proper liquidation
        if len(allowed_permnos) == 0:
            if verbose:
                print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                      f"Date: {current_date.strftime('%Y-%m-%d')}")
                print(f"  ⚠ No signals, recording zero return for all strategies")
            
            # Liquidate all three strategies
            for idx, (pw, po, pg) in enumerate([
                (prev_weights_dict, prev_oos_returns_dict, prev_gross_return),
                (prev_weights_dict_2, prev_oos_returns_dict_2, prev_gross_return_2),
                (prev_weights_dict_3, prev_oos_returns_dict_3, prev_gross_return_3)
            ]):
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    pw, po, pg, transaction_cost, verbose=verbose
                )
                
                if idx == 0:  # GMV
                    portfolio_returns.append(net_return)
                    portfolio_dates.append(current_date)
                    portfolio_weights_list.append({})
                    portfolio_turnover_list.append(turnover)
                    portfolio_gross_returns.append(0.0)
                elif idx == 1:  # MV
                    portfolio_returns_2.append(net_return)
                    portfolio_dates_2.append(current_date)
                    portfolio_weights_list_2.append({})
                    portfolio_turnover_list_2.append(turnover)
                    portfolio_gross_returns_2.append(0.0)
                else:  # MSR
                    portfolio_returns_3.append(net_return)
                    portfolio_dates_3.append(current_date)
                    portfolio_weights_list_3.append({})
                    portfolio_turnover_list_3.append(turnover)
                    portfolio_gross_returns_3.append(0.0)
            
            # Reset all state
            prev_weights_dict = {}
            prev_oos_returns_dict = {}
            prev_gross_return = 0.0
            prev_weights_dict_2 = {}
            prev_oos_returns_dict_2 = {}
            prev_gross_return_2 = 0.0
            prev_weights_dict_3 = {}
            prev_oos_returns_dict_3 = {}
            prev_gross_return_3 = 0.0
            continue
        
        # Define the lookback window
        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]
        
        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date) &
                        (df['permno'].isin(allowed_permnos))]
        train_factor = data_factor.loc[window_start_date : window_end_date]
        
        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates)
        
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)
        
        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        n_train, p_current = Y.shape

        if verbose:
            print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                  f"Date: {current_date.strftime('%Y-%m-%d')} | Year: {current_year}")
            print(f"  Window: {window_start_date.strftime('%Y-%m-%d')} to "
                  f"{window_end_date.strftime('%Y-%m-%d')}")
            # NEW: Show both signal types
            print(f"  Yearly: {len(yearly_permnos)} | FinBERT: {len(finbert_permnos)} | "
                  f"Union/Intersection: {len(allowed_permnos)} | Assets w/ data: {p_current}")

        # Check for valid data
        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  ⚠ Insufficient data (n={n_train}, p={p_current}), recording 0 return")
            # Liquidate all three strategies
            for idx, (pw, po, pg) in enumerate([
                (prev_weights_dict, prev_oos_returns_dict, prev_gross_return),
                (prev_weights_dict_2, prev_oos_returns_dict_2, prev_gross_return_2),
                (prev_weights_dict_3, prev_oos_returns_dict_3, prev_gross_return_3)
            ]):
                turnover, tc, net_return = calculate_exit_transaction_cost(
                    pw, po, pg, transaction_cost, verbose=verbose
                )
                
                if idx == 0:  # GMV
                    portfolio_returns.append(net_return)
                    portfolio_dates.append(current_date)
                    portfolio_weights_list.append({})
                    portfolio_turnover_list.append(turnover)
                    portfolio_gross_returns.append(0.0)
                elif idx == 1:  # MV
                    portfolio_returns_2.append(net_return)
                    portfolio_dates_2.append(current_date)
                    portfolio_weights_list_2.append({})
                    portfolio_turnover_list_2.append(turnover)
                    portfolio_gross_returns_2.append(0.0)
                else:  # MSR
                    portfolio_returns_3.append(net_return)
                    portfolio_dates_3.append(current_date)
                    portfolio_weights_list_3.append({})
                    portfolio_turnover_list_3.append(turnover)
                    portfolio_gross_returns_3.append(0.0)
            
            # Reset all state
            prev_weights_dict = {}
            prev_oos_returns_dict = {}
            prev_gross_return = 0.0
            prev_weights_dict_2 = {}
            prev_oos_returns_dict_2 = {}
            prev_gross_return_2 = 0.0
            prev_weights_dict_3 = {}
            prev_oos_returns_dict_3 = {}
            prev_gross_return_3 = 0.0
            continue
        else:
            try:
                # Demean the returns
                Y_bar = Y.mean(axis=0)
                Y_star = Y - Y_bar
                
                if verbose:
                    print(f"  Running Deep Learning Regression...")
                F = train_factor.values.astype(float)
                Theta_hat = poet(Y_star, 8)
                
                if verbose:
                    print(f"  Computing GMV weights...")
                w_star = gmv_weights(Theta_hat)
                
                if verbose:
                    print(f"  Computing MV weights...")
                w_star_2 = mv_weights(Theta_hat, Y_bar, target_return=0.01)
                
                if verbose:
                    print(f"  Computing MSR weights...")
                w_star_3 = msr_weights(Theta_hat, Y_bar)
                
                # Create weights dictionaries
                new_weights_dict = {asset: w_star[i] for i, asset in enumerate(current_assets)}
                new_weights_dict_2 = {asset: w_star_2[i] for i, asset in enumerate(current_assets)}
                new_weights_dict_3 = {asset: w_star_3[i] for i, asset in enumerate(current_assets)}
                
            except Exception as e:
                if verbose:
                    print(f"  ✗ Error: {e}")
                    print(f"  Using previous weights")
                new_weights_dict = prev_weights_dict.copy()
                new_weights_dict_2 = prev_weights_dict_2.copy()
                new_weights_dict_3 = prev_weights_dict_3.copy()

        # Normalize weights to sum to 1 - GMV
        weight_sum = sum(new_weights_dict.values())
        if weight_sum > 1e-10:
            new_weights_dict = {k: v/weight_sum for k, v in new_weights_dict.items()}
        else:
            if verbose:
                print("  ⚠ GMV: Zero weight sum, using previous weights")
            new_weights_dict = prev_weights_dict.copy()
            weight_sum = sum(new_weights_dict.values())
            if weight_sum > 1e-10:
                new_weights_dict = {k: v/weight_sum for k, v in new_weights_dict.items()}
        
        # Normalize weights to sum to 1 - MV
        weight_sum_2 = sum(new_weights_dict_2.values())
        if weight_sum_2 > 1e-10:
            new_weights_dict_2 = {k: v/weight_sum_2 for k, v in new_weights_dict_2.items()}
        else:
            if verbose:
                print("  ⚠ MV: Zero weight sum, using previous weights")
            new_weights_dict_2 = prev_weights_dict_2.copy()
            weight_sum_2 = sum(new_weights_dict_2.values())
            if weight_sum_2 > 1e-10:
                new_weights_dict_2 = {k: v/weight_sum_2 for k, v in new_weights_dict_2.items()}
        
        # Normalize weights to sum to 1 - MSR
        weight_sum_3 = sum(new_weights_dict_3.values())
        if weight_sum_3 > 1e-10:
            new_weights_dict_3 = {k: v/weight_sum_3 for k, v in new_weights_dict_3.items()}
        else:
            if verbose:
                print("  ⚠ MSR: Zero weight sum, using previous weights")
            new_weights_dict_3 = prev_weights_dict_3.copy()
            weight_sum_3 = sum(new_weights_dict_3.values())
            if weight_sum_3 > 1e-10:
                new_weights_dict_3 = {k: v/weight_sum_3 for k, v in new_weights_dict_3.items()}
        
        # --- 3. OOS Returns & Transaction Costs ---
        
        # Get out-of-sample returns for current month (only for allowed permnos)
        oos_data = df[(df['datadate'] == current_date) & (df['permno'].isin(allowed_permnos))]
        oos_returns_series = oos_data.set_index('permno')['ret_fwd_1']
        
        # Filter out NaN returns
        oos_returns_series = oos_returns_series.dropna()
        oos_returns_dict = oos_returns_series.to_dict()
        
        # Find common assets between weights and returns
        common_assets = set(new_weights_dict.keys()) & set(oos_returns_dict.keys())
        common_assets_2 = set(new_weights_dict_2.keys()) & set(oos_returns_dict.keys())
        common_assets_3 = set(new_weights_dict_3.keys()) & set(oos_returns_dict.keys())
        
        if len(common_assets) == 0 or len(common_assets_2) == 0 or len(common_assets_3) == 0:
            if verbose:
                print("  ⚠ No common assets with valid returns, skipping period")
            continue
        
        # Filter to common assets and renormalize - GMV
        common_weights = {a: new_weights_dict[a] for a in common_assets}
        common_weight_sum = sum(common_weights.values())
        if common_weight_sum > 1e-10:
            common_weights = {k: v/common_weight_sum for k, v in common_weights.items()}
        else:
            if verbose:
                print("  ⚠ GMV: Zero weight sum after filtering, skipping period")
            continue
        
        # Filter to common assets and renormalize - MV
        common_weights_2 = {a: new_weights_dict_2[a] for a in common_assets_2}
        common_weight_sum_2 = sum(common_weights_2.values())
        if common_weight_sum_2 > 1e-10:
            common_weights_2 = {k: v/common_weight_sum_2 for k, v in common_weights_2.items()}
        else:
            if verbose:
                print("  ⚠ MV: Zero weight sum after filtering, skipping period")
            continue
        
        # Filter to common assets and renormalize - MSR
        common_weights_3 = {a: new_weights_dict_3[a] for a in common_assets_3}
        common_weight_sum_3 = sum(common_weights_3.values())
        if common_weight_sum_3 > 1e-10:
            common_weights_3 = {k: v/common_weight_sum_3 for k, v in common_weights_3.items()}
        else:
            if verbose:
                print("  ⚠ MSR: Zero weight sum after filtering, skipping period")
            continue
        
        # Compute gross portfolio returns
        gross_return = sum(common_weights[a] * oos_returns_dict[a] for a in common_assets)
        gross_return_2 = sum(common_weights_2[a] * oos_returns_dict[a] for a in common_assets_2)
        gross_return_3 = sum(common_weights_3[a] * oos_returns_dict[a] for a in common_assets_3)
        
        # Sanity checks
        if np.isnan(gross_return) or np.isinf(gross_return):
            if verbose:
                print(f"  ⚠ GMV: Invalid gross return: {gross_return}, skipping period")
            continue
        if np.isnan(gross_return_2) or np.isinf(gross_return_2):
            if verbose:
                print(f"  ⚠ MV: Invalid gross return: {gross_return_2}, skipping period")
            continue
        if np.isnan(gross_return_3) or np.isinf(gross_return_3):
            if verbose:
                print(f"  ⚠ MSR: Invalid gross return: {gross_return_3}, skipping period")
            continue
        
        # === IMPROVED TRANSACTION COST CALCULATION - GMV ===
        if len(prev_weights_dict) > 0:
            # Step 1: Adjust ALL previous weights for their returns
            adjusted_prev = {}
            
            for asset, prev_w in prev_weights_dict.items():
                if asset in prev_oos_returns_dict:
                    prev_r = prev_oos_returns_dict[asset]
                    if abs(1 + prev_gross_return) > 1e-6:
                        adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return)
                    else:
                        adjusted_prev[asset] = 0.0
                else:
                    # Asset had weight but no return data (exited)
                    if abs(1 + prev_gross_return) > 1e-6:
                        adjusted_prev[asset] = prev_w / (1 + prev_gross_return)
                    else:
                        adjusted_prev[asset] = 0.0
            
            # Step 2: Calculate turnover across all assets (old and new)
            all_assets = set(adjusted_prev.keys()) | set(common_weights.keys())
            
            turnover = 0.0
            for asset in all_assets:
                old_w = adjusted_prev.get(asset, 0.0)
                new_w = common_weights.get(asset, 0.0)
                turnover += abs(new_w - old_w)
            
            # Transaction cost on end-of-period portfolio value
            tc = transaction_cost * (1 + gross_return) * turnover
        else:
            # First period: buying into everything
            turnover = sum(abs(w) for w in common_weights.values())
            tc = transaction_cost * (1 + gross_return) * turnover
        
        # === IMPROVED TRANSACTION COST CALCULATION - MV ===
        if len(prev_weights_dict_2) > 0:
            adjusted_prev_2 = {}
            
            for asset, prev_w in prev_weights_dict_2.items():
                if asset in prev_oos_returns_dict_2:
                    prev_r = prev_oos_returns_dict_2[asset]
                    if abs(1 + prev_gross_return_2) > 1e-6:
                        adjusted_prev_2[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return_2)
                    else:
                        adjusted_prev_2[asset] = 0.0
                else:
                    if abs(1 + prev_gross_return_2) > 1e-6:
                        adjusted_prev_2[asset] = prev_w / (1 + prev_gross_return_2)
                    else:
                        adjusted_prev_2[asset] = 0.0
            
            all_assets_2 = set(adjusted_prev_2.keys()) | set(common_weights_2.keys())
            
            turnover_2 = 0.0
            for asset in all_assets_2:
                old_w = adjusted_prev_2.get(asset, 0.0)
                new_w = common_weights_2.get(asset, 0.0)
                turnover_2 += abs(new_w - old_w)
            
            tc_2 = transaction_cost * (1 + gross_return_2) * turnover_2
        else:
            turnover_2 = sum(abs(w) for w in common_weights_2.values())
            tc_2 = transaction_cost * (1 + gross_return_2) * turnover_2
        
        # === IMPROVED TRANSACTION COST CALCULATION - MSR ===
        if len(prev_weights_dict_3) > 0:
            adjusted_prev_3 = {}
            
            for asset, prev_w in prev_weights_dict_3.items():
                if asset in prev_oos_returns_dict_3:
                    prev_r = prev_oos_returns_dict_3[asset]
                    if abs(1 + prev_gross_return_3) > 1e-6:
                        adjusted_prev_3[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return_3)
                    else:
                        adjusted_prev_3[asset] = 0.0
                else:
                    if abs(1 + prev_gross_return_3) > 1e-6:
                        adjusted_prev_3[asset] = prev_w / (1 + prev_gross_return_3)
                    else:
                        adjusted_prev_3[asset] = 0.0
            
            all_assets_3 = set(adjusted_prev_3.keys()) | set(common_weights_3.keys())
            
            turnover_3 = 0.0
            for asset in all_assets_3:
                old_w = adjusted_prev_3.get(asset, 0.0)
                new_w = common_weights_3.get(asset, 0.0)
                turnover_3 += abs(new_w - old_w)
            
            tc_3 = transaction_cost * (1 + gross_return_3) * turnover_3
        else:
            turnover_3 = sum(abs(w) for w in common_weights_3.values())
            tc_3 = transaction_cost * (1 + gross_return_3) * turnover_3
        
        # Net returns
        net_return = gross_return - tc
        net_return_2 = gross_return_2 - tc_2
        net_return_3 = gross_return_3 - tc_3
        
        # Store results - GMV
        portfolio_returns.append(net_return)
        portfolio_dates.append(current_date)
        portfolio_weights_list.append(common_weights.copy())
        portfolio_turnover_list.append(turnover)
        portfolio_gross_returns.append(gross_return)
        
        # Store results - MV
        portfolio_returns_2.append(net_return_2)
        portfolio_dates_2.append(current_date)
        portfolio_weights_list_2.append(common_weights_2.copy())
        portfolio_turnover_list_2.append(turnover_2)
        portfolio_gross_returns_2.append(gross_return_2)
        
        # Store results - MSR
        portfolio_returns_3.append(net_return_3)
        portfolio_dates_3.append(current_date)
        portfolio_weights_list_3.append(common_weights_3.copy())
        portfolio_turnover_list_3.append(turnover_3)
        portfolio_gross_returns_3.append(gross_return_3)
        
        # Update previous values for next iteration
        prev_weights_dict = common_weights.copy()
        prev_oos_returns_dict = {a: oos_returns_dict[a] for a in common_assets}
        prev_gross_return = gross_return
        
        prev_weights_dict_2 = common_weights_2.copy()
        prev_oos_returns_dict_2 = {a: oos_returns_dict[a] for a in common_assets_2}
        prev_gross_return_2 = gross_return_2
        
        prev_weights_dict_3 = common_weights_3.copy()
        prev_oos_returns_dict_3 = {a: oos_returns_dict[a] for a in common_assets_3}
        prev_gross_return_3 = gross_return_3
        
        if verbose:
            print(f"  GMV  - Gross: {gross_return:>8.5f} | Turnover: {turnover:>6.4f} | "
                  f"TC: {tc:>8.6f} | Net: {net_return:>8.5f}")
            print(f"  MV   - Gross: {gross_return_2:>8.5f} | Turnover: {turnover_2:>6.4f} | "
                  f"TC: {tc_2:>8.6f} | Net: {net_return_2:>8.5f}")
            print(f"  MSR  - Gross: {gross_return_3:>8.5f} | Turnover: {turnover_3:>6.4f} | "
                  f"TC: {tc_3:>8.6f} | Net: {net_return_3:>8.5f}")

    if verbose:
        print("\n" + "="*60)
        print("BACKTEST COMPLETE")
        print("="*60)
    
    # --- 4. Compile Results ---
    results_df = pd.DataFrame({
        'date': portfolio_dates,
        'portfolio_return': portfolio_returns,
        'portfolio_gross_return': portfolio_gross_returns,
        'portfolio_weights': portfolio_weights_list,
        'portfolio_turnover': portfolio_turnover_list
    })
    results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
    
    results_df_2 = pd.DataFrame({
        'date': portfolio_dates_2,
        'portfolio_return': portfolio_returns_2,
        'portfolio_gross_return': portfolio_gross_returns_2,
        'portfolio_weights': portfolio_weights_list_2,
        'portfolio_turnover': portfolio_turnover_list_2
    })
    results_df_2['cumulative_return'] = (1 + results_df_2['portfolio_return']).cumprod() - 1
    
    results_df_3 = pd.DataFrame({
        'date': portfolio_dates_3,
        'portfolio_return': portfolio_returns_3,
        'portfolio_gross_return': portfolio_gross_returns_3,
        'portfolio_weights': portfolio_weights_list_3,
        'portfolio_turnover': portfolio_turnover_list_3
    })
    results_df_3['cumulative_return'] = (1 + results_df_3['portfolio_return']).cumprod() - 1
    
    # Helper function to compute metrics
    def compute_metrics(returns_list, turnover_list, results_df):
        if len(returns_list) > 0:
            mean_return = np.mean(returns_list)
            variance = np.var(returns_list, ddof=1)
            sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
            
            # Annualized metrics (monthly data)
            annual_return = mean_return * 12
            annual_volatility = np.sqrt(variance * 12)
            annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
            
            return {
                'mean_return': mean_return,
                'variance': variance,
                'sharpe_ratio': sharpe_ratio,
                'annual_return': annual_return,
                'annual_volatility': annual_volatility,
                'annual_sharpe_ratio': annual_sharpe,
                'total_return': results_df['cumulative_return'].iloc[-1],
                'avg_turnover': np.mean(turnover_list),
                'n_periods': len(returns_list)
            }
        else:
            return {
                'mean_return': 0, 'variance': 0, 'sharpe_ratio': 0,
                'annual_return': 0, 'annual_volatility': 0, 'annual_sharpe_ratio': 0,
                'total_return': 0, 'avg_turnover': 0, 'n_periods': 0
            }
    
    # Compute metrics for all three strategies
    metrics = compute_metrics(portfolio_returns, portfolio_turnover_list, results_df)
    metrics_2 = compute_metrics(portfolio_returns_2, portfolio_turnover_list_2, results_df_2)
    metrics_3 = compute_metrics(portfolio_returns_3, portfolio_turnover_list_3, results_df_3)
    
    return results_df, metrics, results_df_2, metrics_2, results_df_3, metrics_3

In [2]:
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = df.groupby('permno')['ret_excess'].shift(-1)

data_f=pd.read_csv('F-F_Research_Data_Factors.csv',sep=',')
data_f['Date']=pd.to_datetime(data_f['Date'], format="%Y%m")
data_f['Date']=data_f['Date']+pd.offsets.MonthEnd(0)
data_f = data_f.set_index('Date')
data_f = data_f[['Mkt-RF', 'SMB', 'HML', 'RF']].astype(float)

# Run backtest with yearly signals
results_df, metrics, results_df_2, metrics_2, results_df_3, metrics_3= backtest_dnn_yearly(
    df,
    test_start_date='2020-01-31',
    test_end_date='2024-04-30',
    lookback_window=180,
    transaction_cost=0.001,
    buys_path_template='buys_{}.csv',
    sells_path_template='sells_{}.csv',
    finbert_signals_path='../examples/monthly_signals_decay.csv',  # Your FinBERT signals
    data_factor=data_f,
    verbose=True
)

print(f"\n GMV")
print(f"\nSharpe Ratio: {metrics['sharpe_ratio']:.4f}")
print(f"Annualized Sharpe Ratio: {metrics['annual_sharpe_ratio']:.4f}")
print(f"Total Return: {metrics['total_return']:.4f}")
print(f"Average Turnover: {metrics['avg_turnover']:.4f}")

print(f"\n MV")
print(f"\nSharpe Ratio: {metrics_2['sharpe_ratio']:.4f}")
print(f"Annualized Sharpe Ratio: {metrics_2['annual_sharpe_ratio']:.4f}")
print(f"Total Return: {metrics_2['total_return']:.4f}")
print(f"Average Turnover: {metrics_2['avg_turnover']:.4f}")

print(f"\n MSR")
print(f"\nSharpe Ratio: {metrics_3['sharpe_ratio']:.4f}")
print(f"Annualized Sharpe Ratio: {metrics_3['annual_sharpe_ratio']:.4f}")
print(f"Total Return: {metrics_3['total_return']:.4f}")
print(f"Average Turnover: {metrics_3['avg_turnover']:.4f}")

Creating ticker to permno mapping...
Mapped 1664 unique tickers to permnos
Loaded FinBERT signals: 24780 monthly records
FinBERT signal distribution:
signal
hold    23840
sell      529
buy       411
Name: count, dtype: int64
STARTING BACKTEST WITH DNN-FM + YEARLY + FINBERT SIGNALS

[1/52] Date: 2020-01-31 | Year: 2020
  Window: 2005-01-31 to 2019-12-31
  Yearly: 40 | FinBERT: 8 | Union/Intersection: 47 | Assets w/ data: 34
  Running Deep Learning Regression...
  Computing GMV weights...
  Computing MV weights...
  Computing MSR weights...
  GMV  - Gross: -0.08401 | Turnover: 1.2170 | TC: 0.001115 | Net: -0.08513
  MV   - Gross: -0.08390 | Turnover: 1.2926 | TC: 0.001184 | Net: -0.08508
  MSR  - Gross: -0.08385 | Turnover: 1.3492 | TC: 0.001236 | Net: -0.08509

[2/52] Date: 2020-02-29 | Year: 2020
  Window: 2005-02-28 to 2020-01-31
  Yearly: 40 | FinBERT: 10 | Union/Intersection: 50 | Assets w/ data: 33
  Running Deep Learning Regression...
  Computing GMV weights...
  Computing MV weig

In [3]:
print(f"\n GMV")
print(f"Annualized Sharpe Ratio: {metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {metrics['mean_return']*12:.4f}")
print(f"Variance: {metrics['variance']*12:.4f}")
print(f"Avg Turnover: {metrics['avg_turnover']:.4f}")

print(f"\n MV")
print(f"Annualized Sharpe Ratio: {metrics_2['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {metrics_2['mean_return']*12:.4f}")
print(f"Variance: {metrics_2['variance']*12:.4f}")
print(f"Avg Turnover: {metrics_2['avg_turnover']:.4f}")

print(f"\n MSR")
print(f"Annualized Sharpe Ratio: {metrics_3['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {metrics_3['mean_return']*12:.4f}")
print(f"Variance: {metrics_3['variance']*12:.4f}")
print(f"Avg Turnover: {metrics_3['avg_turnover']:.4f}")


 GMV
Annualized Sharpe Ratio: 0.9339
Mean Return: 0.2117
Variance: 0.0514
Avg Turnover: 1.2252

 MV
Annualized Sharpe Ratio: 1.1673
Mean Return: 0.3402
Variance: 0.0850
Avg Turnover: 1.9413

 MSR
Annualized Sharpe Ratio: 0.9035
Mean Return: 0.1878
Variance: 0.0432
Avg Turnover: 1.3149


# Testing long-short on DL-MV

In [4]:
import numpy as np
import pandas as pd

def construct_long_short_portfolios(results_df_2, df, transaction_cost=0.001, verbose=True):
    """
    Construct equal-weighted and value-weighted long-short portfolios.
    Long top 10% of weights, short bottom 10% of weights (net-zero).
    
    Parameters:
    -----------
    results_df_2 : pd.DataFrame
        DataFrame with columns: 'date', 'portfolio_weights'
        portfolio_weights contains dict of {permno: weight}
    df : pd.DataFrame
        DataFrame with columns: 'datadate', 'permno', 'mve_m', 'ret_fwd_1'
    transaction_cost : float
        Transaction cost rate (default: 0.001 = 10 bps)
    verbose : bool
        Print progress information
    
    Returns:
    --------
    results_ew : pd.DataFrame
        Equal-weighted long-short portfolio results
    results_vw : pd.DataFrame
        Value-weighted long-short portfolio results
    metrics_ew : dict
        Performance metrics for equal-weighted portfolio
    metrics_vw : dict
        Performance metrics for value-weighted portfolio
    """
    
    # Storage for results
    ew_returns = []
    ew_gross_returns = []
    ew_turnovers = []
    ew_dates = []
    ew_long_weights_list = []
    ew_short_weights_list = []
    
    vw_returns = []
    vw_gross_returns = []
    vw_turnovers = []
    vw_dates = []
    vw_long_weights_list = []
    vw_short_weights_list = []
    
    # Previous period state for transaction cost calculation
    prev_ew_weights = {}  # Combined long and short positions
    prev_ew_returns_dict = {}
    prev_ew_gross_return = 0.0
    
    prev_vw_weights = {}
    prev_vw_returns_dict = {}
    prev_vw_gross_return = 0.0
    
    if verbose:
        print("="*80)
        print("CONSTRUCTING LONG-SHORT PORTFOLIOS (TOP 10% LONG, BOTTOM 10% SHORT)")
        print("="*80)
    
    for idx, row in results_df_2.iterrows():
        current_date = row['date'].strftime("%Y-%m-%d")
        weights_dict = row['portfolio_weights']
        
        if not weights_dict or len(weights_dict) == 0:
            if verbose:
                print(f"\n[{idx+1}/{len(results_df_2)}] Date: {current_date}")
                print(f"  ⚠ No assets, liquidating all positions")
            
            # Get returns for previous positions to properly calculate liquidation costs
            date_data = df[df['datadate'] == current_date].set_index('permno')
            returns_dict = {}
            for permno in set(list(prev_ew_weights.keys()) + list(prev_vw_weights.keys())):
                if permno in date_data.index:
                    ret = date_data.loc[permno, 'ret_fwd_1']
                    if pd.notna(ret):
                        returns_dict[permno] = ret
            
            # Calculate liquidation transaction costs for EW
            if len(prev_ew_weights) > 0:
                adjusted_prev_ew = {}
                for asset, prev_w in prev_ew_weights.items():
                    if asset in returns_dict:
                        prev_r = returns_dict[asset]
                        if abs(1 + prev_ew_gross_return) > 1e-6:
                            adjusted_prev_ew[asset] = prev_w * (1 + prev_r) / (1 + prev_ew_gross_return)
                        else:
                            adjusted_prev_ew[asset] = 0.0
                    else:
                        if abs(1 + prev_ew_gross_return) > 1e-6:
                            adjusted_prev_ew[asset] = prev_w / (1 + prev_ew_gross_return)
                        else:
                            adjusted_prev_ew[asset] = 0.0
                
                ew_turnover = sum(abs(w) for w in adjusted_prev_ew.values())
                ew_tc = transaction_cost * 1.0 * ew_turnover
                ew_net_return = -ew_tc
                
                ew_returns.append(ew_net_return)
                ew_gross_returns.append(0.0)
                ew_turnovers.append(ew_turnover)
                ew_dates.append(current_date)
                ew_long_weights_list.append({})
                ew_short_weights_list.append({})
                
                if verbose:
                    print(f"  EW - Liquidating | TO: {ew_turnover:>6.4f} | TC: {ew_tc:>8.6f} | Net: {ew_net_return:>8.5f}")
            else:
                ew_returns.append(0.0)
                ew_gross_returns.append(0.0)
                ew_turnovers.append(0.0)
                ew_dates.append(current_date)
                ew_long_weights_list.append({})
                ew_short_weights_list.append({})
            
            # Calculate liquidation transaction costs for VW
            if len(prev_vw_weights) > 0:
                adjusted_prev_vw = {}
                for asset, prev_w in prev_vw_weights.items():
                    if asset in returns_dict:
                        prev_r = returns_dict[asset]
                        if abs(1 + prev_vw_gross_return) > 1e-6:
                            adjusted_prev_vw[asset] = prev_w * (1 + prev_r) / (1 + prev_vw_gross_return)
                        else:
                            adjusted_prev_vw[asset] = 0.0
                    else:
                        if abs(1 + prev_vw_gross_return) > 1e-6:
                            adjusted_prev_vw[asset] = prev_w / (1 + prev_vw_gross_return)
                        else:
                            adjusted_prev_vw[asset] = 0.0
                
                vw_turnover = sum(abs(w) for w in adjusted_prev_vw.values())
                vw_tc = transaction_cost * 1.0 * vw_turnover
                vw_net_return = -vw_tc
                
                vw_returns.append(vw_net_return)
                vw_gross_returns.append(0.0)
                vw_turnovers.append(vw_turnover)
                vw_dates.append(current_date)
                vw_long_weights_list.append({})
                vw_short_weights_list.append({})
                
                if verbose:
                    print(f"  VW - Liquidating | TO: {vw_turnover:>6.4f} | TC: {vw_tc:>8.6f} | Net: {vw_net_return:>8.5f}")
            else:
                vw_returns.append(0.0)
                vw_gross_returns.append(0.0)
                vw_turnovers.append(0.0)
                vw_dates.append(current_date)
                vw_long_weights_list.append({})
                vw_short_weights_list.append({})
            
            # Reset state
            prev_ew_weights = {}
            prev_ew_returns_dict = {}
            prev_ew_gross_return = 0.0
            prev_vw_weights = {}
            prev_vw_returns_dict = {}
            prev_vw_gross_return = 0.0
            
            continue
        
        # Sort assets by weight
        sorted_assets = sorted(weights_dict.items(), key=lambda x: x[1], reverse=True)
        n_assets = len(sorted_assets)
        
        # If < 10 firms, just long/short one firm each
        if n_assets < 10:
            n_long = 1
            n_short = 1
        else:
            n_long = max(1, int(np.ceil(0.1 * n_assets)))  # Top 10%
            n_short = max(1, int(np.ceil(0.1 * n_assets)))  # Bottom 10%
        
        # Select long and short assets
        long_assets = [permno for permno, _ in sorted_assets[:n_long]]
        short_assets = [permno for permno, _ in sorted_assets[-n_short:]]
        
        # Get market values and returns for these assets
        date_data = df[df['datadate'] == current_date].set_index('permno')
        
        # Filter to assets with valid data
        long_assets_valid = [p for p in long_assets if p in date_data.index 
                            and pd.notna(date_data.loc[p, 'ret_fwd_1'])]
        short_assets_valid = [p for p in short_assets if p in date_data.index 
                             and pd.notna(date_data.loc[p, 'ret_fwd_1'])]
        
        if len(long_assets_valid) == 0 or len(short_assets_valid) == 0:
            if verbose:
                print(f"\n[{idx+1}/{len(results_df_2)}] Date: {current_date}")
                print(f"  ⚠ No valid long ({len(long_assets_valid)}) or short ({len(short_assets_valid)}) assets")
            continue
        
        # === EQUAL-WEIGHTED PORTFOLIO ===
        # Long: +1/n_long each, Short: -1/n_short each
        ew_long_weight = 1.0 / len(long_assets_valid)
        ew_short_weight = -1.0 / len(short_assets_valid)
        
        ew_weights_dict = {}
        for permno in long_assets_valid:
            ew_weights_dict[permno] = ew_long_weight
        for permno in short_assets_valid:
            ew_weights_dict[permno] = ew_short_weight
        
        # === VALUE-WEIGHTED PORTFOLIO ===
        # Get market values
        long_mve = {}
        short_mve = {}
        
        for permno in long_assets_valid:
            mve = date_data.loc[permno, 'mve_m']
            if pd.notna(mve) and mve > 0:
                long_mve[permno] = mve
        
        for permno in short_assets_valid:
            mve = date_data.loc[permno, 'mve_m']
            if pd.notna(mve) and mve > 0:
                short_mve[permno] = mve
        
        if len(long_mve) == 0 or len(short_mve) == 0:
            if verbose:
                print(f"\n[{idx+1}/{len(results_df_2)}] Date: {current_date}")
                print(f"  ⚠ No valid market values for long ({len(long_mve)}) or short ({len(short_mve)})")
            continue
        
        # Value-weighted: normalize to sum to +1 for long, -1 for short
        total_long_mve = sum(long_mve.values())
        total_short_mve = sum(short_mve.values())
        
        vw_weights_dict = {}
        for permno, mve in long_mve.items():
            vw_weights_dict[permno] = mve / total_long_mve
        for permno, mve in short_mve.items():
            vw_weights_dict[permno] = -mve / total_short_mve
        
        # === COMPUTE RETURNS ===
        # Get returns for all assets
        returns_dict = {}
        for permno in set(list(ew_weights_dict.keys()) + list(vw_weights_dict.keys())):
            ret = date_data.loc[permno, 'ret_fwd_1']
            if pd.notna(ret):
                returns_dict[permno] = ret
        
        # Equal-weighted gross return
        ew_gross_return = sum(ew_weights_dict[p] * returns_dict[p] 
                             for p in ew_weights_dict.keys() if p in returns_dict)
        
        # Value-weighted gross return
        vw_gross_return = sum(vw_weights_dict[p] * returns_dict[p] 
                             for p in vw_weights_dict.keys() if p in returns_dict)
        
        # === TRANSACTION COSTS ===
        # Equal-weighted
        if len(prev_ew_weights) > 0:
            # Adjust previous weights for returns
            adjusted_prev_ew = {}
            for asset, prev_w in prev_ew_weights.items():
                if asset in prev_ew_returns_dict:
                    prev_r = prev_ew_returns_dict[asset]
                    if abs(1 + prev_ew_gross_return) > 1e-6:
                        adjusted_prev_ew[asset] = prev_w * (1 + prev_r) / (1 + prev_ew_gross_return)
                    else:
                        adjusted_prev_ew[asset] = 0.0
                else:
                    if abs(1 + prev_ew_gross_return) > 1e-6:
                        adjusted_prev_ew[asset] = prev_w / (1 + prev_ew_gross_return)
                    else:
                        adjusted_prev_ew[asset] = 0.0
            
            # Calculate turnover
            all_assets_ew = set(adjusted_prev_ew.keys()) | set(ew_weights_dict.keys())
            ew_turnover = sum(abs(ew_weights_dict.get(a, 0.0) - adjusted_prev_ew.get(a, 0.0)) 
                            for a in all_assets_ew)
            ew_tc = transaction_cost * (1 + ew_gross_return) * ew_turnover
        else:
            # First period: sum of absolute weights (should be 2.0 for net-zero portfolio)
            ew_turnover = sum(abs(w) for w in ew_weights_dict.values())
            ew_tc = transaction_cost * (1 + ew_gross_return) * ew_turnover
        
        # Value-weighted
        if len(prev_vw_weights) > 0:
            adjusted_prev_vw = {}
            for asset, prev_w in prev_vw_weights.items():
                if asset in prev_vw_returns_dict:
                    prev_r = prev_vw_returns_dict[asset]
                    if abs(1 + prev_vw_gross_return) > 1e-6:
                        adjusted_prev_vw[asset] = prev_w * (1 + prev_r) / (1 + prev_vw_gross_return)
                    else:
                        adjusted_prev_vw[asset] = 0.0
                else:
                    if abs(1 + prev_vw_gross_return) > 1e-6:
                        adjusted_prev_vw[asset] = prev_w / (1 + prev_vw_gross_return)
                    else:
                        adjusted_prev_vw[asset] = 0.0
            
            all_assets_vw = set(adjusted_prev_vw.keys()) | set(vw_weights_dict.keys())
            vw_turnover = sum(abs(vw_weights_dict.get(a, 0.0) - adjusted_prev_vw.get(a, 0.0)) 
                            for a in all_assets_vw)
            vw_tc = transaction_cost * (1 + vw_gross_return) * vw_turnover
        else:
            vw_turnover = sum(abs(w) for w in vw_weights_dict.values())
            vw_tc = transaction_cost * (1 + vw_gross_return) * vw_turnover
        
        # Net returns
        ew_net_return = ew_gross_return - ew_tc
        vw_net_return = vw_gross_return - vw_tc
        
        # Store results
        ew_returns.append(ew_net_return)
        ew_gross_returns.append(ew_gross_return)
        ew_turnovers.append(ew_turnover)
        ew_dates.append(current_date)
        ew_long_weights_list.append({p: w for p, w in ew_weights_dict.items() if w > 0})
        ew_short_weights_list.append({p: w for p, w in ew_weights_dict.items() if w < 0})
        
        vw_returns.append(vw_net_return)
        vw_gross_returns.append(vw_gross_return)
        vw_turnovers.append(vw_turnover)
        vw_dates.append(current_date)
        vw_long_weights_list.append({p: w for p, w in vw_weights_dict.items() if w > 0})
        vw_short_weights_list.append({p: w for p, w in vw_weights_dict.items() if w < 0})
        
        # Update previous state
        prev_ew_weights = ew_weights_dict.copy()
        prev_ew_returns_dict = returns_dict.copy()
        prev_ew_gross_return = ew_gross_return
        
        prev_vw_weights = vw_weights_dict.copy()
        prev_vw_returns_dict = returns_dict.copy()
        prev_vw_gross_return = vw_gross_return
        
        if verbose:
            print(f"\n[{idx+1}/{len(results_df_2)}] Date: {current_date}")
            print(f"  Long: {len(long_assets_valid)} assets | Short: {len(short_assets_valid)} assets")
            
            # Print EW positions
            print(f"\n  EQUAL-WEIGHTED:")
            print(f"    Long positions:")
            for permno, weight in sorted([(p, w) for p, w in ew_weights_dict.items() if w > 0], 
                                        key=lambda x: x[1], reverse=True):
                print(f"      PERMNO {permno}: {weight:>8.5f}")
            print(f"    Short positions:")
            for permno, weight in sorted([(p, w) for p, w in ew_weights_dict.items() if w < 0], 
                                        key=lambda x: x[1]):
                print(f"      PERMNO {permno}: {weight:>8.5f}")
            print(f"    Gross: {ew_gross_return:>8.5f} | TO: {ew_turnover:>6.4f} | "
                  f"TC: {ew_tc:>8.6f} | Net: {ew_net_return:>8.5f}")
            
            # Print VW positions
            print(f"\n  VALUE-WEIGHTED:")
            print(f"    Long positions:")
            for permno, weight in sorted([(p, w) for p, w in vw_weights_dict.items() if w > 0], 
                                        key=lambda x: x[1], reverse=True):
                print(f"      PERMNO {permno}: {weight:>8.5f}")
            print(f"    Short positions:")
            for permno, weight in sorted([(p, w) for p, w in vw_weights_dict.items() if w < 0], 
                                        key=lambda x: x[1]):
                print(f"      PERMNO {permno}: {weight:>8.5f}")
            print(f"    Gross: {vw_gross_return:>8.5f} | TO: {vw_turnover:>6.4f} | "
                  f"TC: {vw_tc:>8.6f} | Net: {vw_net_return:>8.5f}")
    
    # === CREATE RESULTS DATAFRAMES ===
    results_ew = pd.DataFrame({
        'date': ew_dates,
        'net_return': ew_returns,
        'gross_return': ew_gross_returns,
        'turnover': ew_turnovers,
        'long_weights': ew_long_weights_list,
        'short_weights': ew_short_weights_list
    })
    results_ew['cumulative_return'] = (1 + results_ew['net_return']).cumprod() - 1
    
    results_vw = pd.DataFrame({
        'date': vw_dates,
        'net_return': vw_returns,
        'gross_return': vw_gross_returns,
        'turnover': vw_turnovers,
        'long_weights': vw_long_weights_list,
        'short_weights': vw_short_weights_list
    })
    results_vw['cumulative_return'] = (1 + results_vw['net_return']).cumprod() - 1
    
    # === COMPUTE METRICS ===
    def compute_metrics(returns_list, turnovers_list, results_df):
        if len(returns_list) > 0:
            mean_return = np.mean(returns_list)
            variance = np.var(returns_list, ddof=1)
            sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
            
            # Annualized metrics (assuming monthly data)
            annual_return = mean_return * 12
            annual_volatility = np.sqrt(variance * 12)
            annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
            
            return {
                'mean_return': mean_return,
                'std_dev': np.sqrt(variance),
                'sharpe_ratio': sharpe_ratio,
                'annual_return': annual_return,
                'annual_volatility': annual_volatility,
                'annual_sharpe_ratio': annual_sharpe,
                'total_return': results_df['cumulative_return'].iloc[-1] if len(results_df) > 0 else 0,
                'avg_turnover': np.mean(turnovers_list),
                'n_periods': len(returns_list)
            }
        else:
            return {
                'mean_return': 0, 'std_dev': 0, 'sharpe_ratio': 0,
                'annual_return': 0, 'annual_volatility': 0, 'annual_sharpe_ratio': 0,
                'total_return': 0, 'avg_turnover': 0, 'n_periods': 0
            }
    
    metrics_ew = compute_metrics(ew_returns, ew_turnovers, results_ew)
    metrics_vw = compute_metrics(vw_returns, vw_turnovers, results_vw)
    
    if verbose:
        print("\n" + "="*80)
        print("LONG-SHORT PORTFOLIO RESULTS")
        print("="*80)
        print("\nEQUAL-WEIGHTED:")
        print(f"  Sharpe Ratio:         {metrics_ew['sharpe_ratio']:.4f}")
        print(f"  Annual Sharpe Ratio:  {metrics_ew['annual_sharpe_ratio']:.4f}")
        print(f"  Mean Return:          {metrics_ew['mean_return']:.4%}")
        print(f"  Annual Return:        {metrics_ew['annual_return']:.4%}")
        print(f"  Annual Volatility:    {metrics_ew['annual_volatility']:.4%}")
        print(f"  Total Return:         {metrics_ew['total_return']:.4%}")
        print(f"  Avg Turnover:         {metrics_ew['avg_turnover']:.4f}")
        print(f"  N Periods:            {metrics_ew['n_periods']}")
        
        print("\nVALUE-WEIGHTED:")
        print(f"  Sharpe Ratio:         {metrics_vw['sharpe_ratio']:.4f}")
        print(f"  Annual Sharpe Ratio:  {metrics_vw['annual_sharpe_ratio']:.4f}")
        print(f"  Mean Return:          {metrics_vw['mean_return']:.4%}")
        print(f"  Annual Return:        {metrics_vw['annual_return']:.4%}")
        print(f"  Annual Volatility:    {metrics_vw['annual_volatility']:.4%}")
        print(f"  Total Return:         {metrics_vw['total_return']:.4%}")
        print(f"  Avg Turnover:         {metrics_vw['avg_turnover']:.4f}")
        print(f"  N Periods:            {metrics_vw['n_periods']}")
    
    return results_ew, metrics_ew, results_vw, metrics_vw


In [5]:
# Example usage:
results_ew, metrics_ew, results_vw, metrics_vw = construct_long_short_portfolios(
    results_df_2, df, transaction_cost=0.001, verbose=True
)

CONSTRUCTING LONG-SHORT PORTFOLIOS (TOP 10% LONG, BOTTOM 10% SHORT)

[1/52] Date: 2020-01-31
  Long: 4 assets | Short: 4 assets

  EQUAL-WEIGHTED:
    Long positions:
      PERMNO 11404:  0.25000
      PERMNO 27959:  0.25000
      PERMNO 52090:  0.25000
      PERMNO 46578:  0.25000
    Short positions:
      PERMNO 12060: -0.25000
      PERMNO 82775: -0.25000
      PERMNO 89195: -0.25000
      PERMNO 70519: -0.25000
    Gross:  0.07002 | TO: 2.0000 | TC: 0.002140 | Net:  0.06788

  VALUE-WEIGHTED:
    Long positions:
      PERMNO 27959:  0.48731
      PERMNO 11404:  0.21928
      PERMNO 52090:  0.15292
      PERMNO 46578:  0.14050
    Short positions:
      PERMNO 70519: -0.56446
      PERMNO 12060: -0.31543
      PERMNO 82775: -0.07088
      PERMNO 89195: -0.04923
    Gross:  0.06834 | TO: 2.0000 | TC: 0.002137 | Net:  0.06621

[2/52] Date: 2020-02-29
  Long: 4 assets | Short: 4 assets

  EQUAL-WEIGHTED:
    Long positions:
      PERMNO 27959:  0.25000
      PERMNO 11404:  0.25000
   