In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')


def naive_nodewise_regression(Y_star, lambda_grid=None):
    """
    Implements Naive Nodewise Regression (Section 5.1.2).
    Uses GIC (Generalized Information Criterion) as in the paper.
    
    Parameters:
    -----------
    Y_star : np.ndarray, shape (n, p)
        Demeaned returns matrix (time x assets)
    lambda_grid : list or None
        Grid of lambda values to try. If None, creates default grid.
    
    Returns:
    --------
    Theta_hat : np.ndarray, shape (p, p)
        Estimated precision matrix
    """
    n, p = Y_star.shape

    # Initialize matrices
    Theta_hat = np.zeros((p, p))
    tau_squared = np.zeros(p)
    
    # Create lambda grid if not provided
    if lambda_grid is None:
        lambda_grid = np.logspace(-3, 1, 50)
    
    # For each asset j
    for j in range(p):
        # Step 1: Get y_j (target) and Y_{-j} (predictors)
        y_j = Y_star[:, j]
        Y_minus_j = np.delete(Y_star, j, axis=1)
        
        # Step 2-3: Estimate gamma_j using Lasso with GIC
        # GIC(λ) = log(σ²_λ) + |S_λ| * (log(p-1) / n) * log(log(n))
        best_gic = np.inf
        best_lambda = lambda_grid[0]
        best_gamma = None
        best_ssr = None
        
        for lam in lambda_grid:
            lasso = Lasso(alpha=2*lam, fit_intercept=False, max_iter=10000)
            lasso.fit(Y_minus_j, y_j)
            gamma_j = lasso.coef_
            
            # Compute SSR and number of non-zero coefficients
            residuals = y_j - Y_minus_j @ gamma_j
            ssr = np.sum(residuals ** 2)
            sigma_sq_lambda = ssr / n
            q_lambda = np.sum(np.abs(gamma_j) > 1e-8)
            
            # Compute GIC
            if sigma_sq_lambda > 1e-10: # Check for non-zero variance
                # GIC formula from paper
                gic = np.log(sigma_sq_lambda) + q_lambda * (np.log(p) / n) * np.log(np.log(n))
            else:
                gic = np.inf
            
            if gic < best_gic:
                best_gic = gic
                best_lambda = lam
                best_gamma = gamma_j.copy()
                best_ssr = ssr
        
        gamma_j_star = best_gamma
        

        tau_squared[j] = best_ssr / n + best_lambda * np.sum(np.abs(gamma_j_star))

        # [cite_start]Step 5: Form the j-th row of Theta_hat [cite: 579, 543-547]
        Theta_hat[j, j] = 1 / tau_squared[j]
        off_diag = -gamma_j_star / tau_squared[j]
        Theta_hat[j, :j] = off_diag[:j]
        Theta_hat[j, j+1:] = off_diag[j:]
    
    # Step 6: Symmetrize
    Theta_hat_sym = (Theta_hat + Theta_hat.T) / 2
    
    return Theta_hat_sym


def gmv_weights(Theta_hat):
    """
    Compute Global Minimum Variance (GMV) portfolio weights (Section 6.1).
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # w* = (Θ 1_p) / (1_p' Θ 1_p)
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    if np.abs(denominator) < 1e-10:
        # Fallback to equal weights if precision matrix is near-singular
        return ones_p / p
    
    w_star = numerator / denominator
    
    return w_star


def compute_portfolio_metrics(returns, weights):
    """
    Compute portfolio return, variance, and Sharpe ratio.
    """

    portfolio_returns = returns @ weights
    mean_return = np.mean(portfolio_returns)
    variance = np.var(portfolio_returns, ddof=1)
    sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
    
    return {
        'return': mean_return,
        'variance': variance,
        'sharpe_ratio': sharpe_ratio
    }


def backtest_nodewise_gmv(df, 
                          test_start_date='2000-01-31', 
                          test_end_date='2003-12-31',
                          lookback_window=180,
                          transaction_cost=0.005,
                          verbose=True):
    """
    Backtest Nodewise + GMV strategy with monthly rebalancing,
    180-month rolling window, and NaN filtering as per the paper.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: permno, datadate, ret_fwd_1
    test_start_date : str
        First date for out-of-sample returns (format: 'YYYY-MM-DD')
    test_end_date : str
        Last date for out-of-sample returns (format: 'YYYY-MM-DD')
    lookback_window : int
        Number of months in rolling training window (default: 180)
    transaction_cost : float
        Proportional transaction cost (default: 0.005 = 50 bps)
    verbose : bool
        If True, prints detailed log at each time step.
    
    Returns:
    --------
    results_df : pd.DataFrame
        DataFrame with columns: date, portfolio_return, cumulative_return
    metrics : dict
        Overall performance metrics
    """
    # --- 1. Setup ---
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])
    
    # Get unique dates
    all_dates = sorted(df['datadate'].unique())
    
    # Convert test dates to datetime
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    # Find date indices
    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}")
    
    if test_start_idx < lookback_window:
        raise ValueError(f"Not enough data for lookback. Test start date {test_start_date} "
                         f"requires data back to {all_dates[test_start_idx - lookback_window]}, "
                         f"but only {test_start_idx} periods are available.")
    
    # Storage for results
    portfolio_returns = []
    portfolio_dates = []
    portfolio_weights_list = []
    portfolio_turnover_list = []
    portfolio_gross_returns = []
    
    # FIX 7: Use dictionary to track weights by permno (handles entry/exit)
    prev_weights_dict = {}  # Maps permno -> weight
    prev_oos_returns_dict = {}  # Maps permno -> return
    prev_gross_return = 0.0
    
    # --- 2. Rolling Window Backtest ---
    if verbose:
        print("="*60)
        print("STARTING BACKTEST")
        print("="*60)
        
    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        
        # Define the lookback window
        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]
        
        # Get training data for this window
        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date)]
        
        # Pivot to get returns matrix (time x assets)
        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        
        # Reindex to ensure all dates are present
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates)
        
        # Filter assets with any NaNs in this window
        # (This follows the paper's approach of requiring complete data)
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)
        
        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        n_train, p_current = Y.shape

        if verbose:
            print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                  f"Date: {current_date.strftime('%Y-%m-%d')}")
            print(f"  Window: {window_start_date.strftime('%Y-%m-%d')} to "
                  f"{window_end_date.strftime('%Y-%m-%d')}")
            print(f"  Assets: {p_current} with complete data")

        # Check for valid data
        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  ⚠ Insufficient data (n={n_train}, p={p_current}), using prev weights")
            new_weights_dict = prev_weights_dict.copy()
        else:
            try:
                # Demean the returns
                Y_bar = Y.mean(axis=0)
                Y_star = Y - Y_bar
                
                if verbose:
                    print(f"  Running Nodewise Regression...")
                Theta_hat = naive_nodewise_regression(Y_star)
                
                if verbose:
                    print(f"  Computing GMV weights...")
                w_star = gmv_weights(Theta_hat)
                
                # Create weights dictionary
                new_weights_dict = {asset: w_star[i] for i, asset in enumerate(current_assets)}
                
            except Exception as e:
                if verbose:
                    print(f"  ✗ Error: {e}")
                    print(f"  Using previous weights")
                new_weights_dict = prev_weights_dict.copy()

        # Normalize weights to sum to 1
        weight_sum = sum(new_weights_dict.values())
        if weight_sum > 1e-10:
            new_weights_dict = {k: v/weight_sum for k, v in new_weights_dict.items()}
        else:
            if verbose:
                print("  ⚠ Zero weight sum, using previous weights")
            new_weights_dict = prev_weights_dict.copy()
        
        # --- 3. OOS Returns & Transaction Costs ---
        
        # Get out-of-sample returns for current month
        oos_data = df[df['datadate'] == current_date]
        oos_returns_series = oos_data.set_index('permno')['ret_fwd_1']
        
        # FIX 8: Filter out NaN returns and create dictionary
        oos_returns_series = oos_returns_series.dropna()
        oos_returns_dict = oos_returns_series.to_dict()
        
        # Find common assets between weights and returns (both existing and non-NaN)
        common_assets = set(new_weights_dict.keys()) & set(oos_returns_dict.keys())
        
        if len(common_assets) == 0:
            if verbose:
                print("  ⚠ No common assets with valid returns, skipping period")
            continue
        
        # Filter to common assets and renormalize
        common_weights = {a: new_weights_dict[a] for a in common_assets}
        common_weight_sum = sum(common_weights.values())
        if common_weight_sum > 1e-10:
            common_weights = {k: v/common_weight_sum for k, v in common_weights.items()}
        else:
            if verbose:
                print("  ⚠ Zero weight sum after filtering, skipping period")
            continue
        
        # Compute gross portfolio return (all returns should be valid now)
        gross_return = sum(common_weights[a] * oos_returns_dict[a] for a in common_assets)
        
        # Sanity check
        if np.isnan(gross_return) or np.isinf(gross_return):
            if verbose:
                print(f"  ⚠ Invalid gross return: {gross_return}, skipping period")
            continue
        
        # FIX 9: Calculate transaction costs with proper weight adjustment
        if len(prev_weights_dict) > 0:
            # Get the universe of all assets (current + previous)
            all_traded_assets = set(common_weights.keys()) | set(prev_weights_dict.keys())
            
            # Adjust previous weights for all assets that were held
            # w+_{t,j} = w_{t,j} * (1 + r_{t,j}) / (1 + r_p,t)
            adjusted_prev = {}
            for asset in all_traded_assets:
                prev_w = prev_weights_dict.get(asset, 0.0)
                
                if asset in prev_oos_returns_dict:
                    prev_r = prev_oos_returns_dict[asset]
                    # Avoid division by zero
                    if prev_gross_return > -0.99:  # Allow for up to 99% loss
                        adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_return)
                    else:
                        adjusted_prev[asset] = 0.0
                else:
                    # Asset wasn't in portfolio last period, so adjusted weight is 0
                    adjusted_prev[asset] = 0.0
            
            # Renormalize adjusted weights (only over assets that still exist)
            adj_sum = sum(adjusted_prev.get(a, 0.0) for a in common_weights.keys())
            if adj_sum > 1e-10:
                adjusted_prev_normalized = {k: adjusted_prev.get(k, 0.0)/adj_sum 
                                           for k in common_weights.keys()}
            else:
                adjusted_prev_normalized = {k: 0.0 for k in common_weights.keys()}
            
            # Turnover: sum over ALL assets (current and previous)
            # This captures: rebalancing existing positions + exiting old + entering new
            turnover = sum(abs(common_weights.get(a, 0.0) - adjusted_prev_normalized.get(a, 0.0)) 
                          for a in all_traded_assets)
            
            # Transaction cost: c * (1 + gross_return) * turnover
            tc = transaction_cost * (1 + gross_return) * turnover
        else:
            # First period: full turnover (entering all positions)
            turnover = sum(abs(w) for w in common_weights.values())
            tc = transaction_cost * turnover
        
        # Net return
        net_return = gross_return - tc
        
        # Store results
        portfolio_returns.append(net_return)
        portfolio_dates.append(current_date)
        portfolio_weights_list.append(common_weights.copy())
        portfolio_turnover_list.append(turnover)
        portfolio_gross_returns.append(gross_return)
        
        # Update previous values for next iteration
        prev_weights_dict = common_weights.copy()
        prev_oos_returns_dict = {a: oos_returns_dict[a] for a in common_assets}
        prev_gross_return = gross_return
        
        if verbose:
            print(f"  Gross: {gross_return:>8.5f} | Turnover: {turnover:>6.4f} | "
                  f"TC: {tc:>8.6f} | Net: {net_return:>8.5f}")

    if verbose:
        print("\n" + "="*60)
        print("BACKTEST COMPLETE")
        print("="*60)
    
    # --- 4. Compile Results ---
    results_df = pd.DataFrame({
        'date': portfolio_dates,
        'portfolio_return': portfolio_returns,
        'portfolio_gross_return': portfolio_gross_returns,
        'portfolio_weights': portfolio_weights_list,
        'portfolio_turnover': portfolio_turnover_list
    })
    results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
    
    # Compute overall metrics
    if len(portfolio_returns) > 0:
        mean_return = np.mean(portfolio_returns)
        variance = np.var(portfolio_returns, ddof=1)
        sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
        
        # Annualized metrics (monthly data)
        annual_return = mean_return * 12
        annual_volatility = np.sqrt(variance * 12)
        annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
        
        metrics = {
            'mean_return': mean_return,
            'variance': variance,
            'sharpe_ratio': sharpe_ratio,
            'annual_return': annual_return,
            'annual_volatility': annual_volatility,
            'annual_sharpe_ratio': annual_sharpe,
            'total_return': results_df['cumulative_return'].iloc[-1],
            'avg_turnover': np.mean(portfolio_turnover_list),
            'n_periods': len(portfolio_returns)
        }
    else:
        metrics = {
            'mean_return': 0,
            'variance': 0,
            'sharpe_ratio': 0,
            'annual_return': 0,
            'annual_volatility': 0,
            'annual_sharpe_ratio': 0,
            'total_return': 0,
            'avg_turnover': 0,
            'n_periods': 0
        }
    
    return results_df, metrics

In [2]:
# Assuming you have loaded your dataframe as 'df'
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = (df.groupby('permno')['ret_excess'].shift(-1) )

In [3]:
buys = pd.read_csv('buys.csv', index_col=1)
sells = pd.read_csv('sells.csv', index_col=1)
buys.index.name='permno'
sells.index.name='permno'

In [6]:
buys_index = buys.index.astype(int)
sells_index = sells.index.astype(int)

index = buys_index.union(sells_index)
index

Index([11404, 12369, 12558, 14776, 15707, 21020, 24053, 26710, 27959, 28484,
       34746, 36003, 46578, 47896, 49373, 52708, 57817, 57904, 59408, 59459,
       60442, 64282, 66093, 66800, 69032, 70519, 71563, 75107, 78916, 79323,
       81055, 82598, 82775, 85269, 85459, 86868, 86964, 87842, 89195, 92322],
      dtype='int64', name='permno')

In [7]:
df_filtered = df[df['permno'].isin(index)]

In [8]:
df_filtered

Unnamed: 0,datadate,permno,comnam,ncusip,shrcd,exchcd,siccd,industry,ticker,gvkey_x,...,BETA,betasq,rsq1,pricedelay,idiovol,year,mom6,indmom,industry_return,ret_fwd_1
9178,1980-01-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.566032,0.223568,0.177811,-1.690356,-1.343182,1980,,0.000000,-0.001083,-0.069674
9179,1980-02-29,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.660300,0.195325,0.166120,-0.019633,-1.398064,1980,,0.000000,-0.040298,-0.006218
9180,1980-03-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.677291,0.182257,0.164222,-0.152879,-1.427993,1980,,0.000000,-0.060439,0.127751
9181,1980-04-30,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.751379,0.161145,0.131506,0.849219,-1.362464,1980,,0.000000,0.108856,0.034772
9182,1980-05-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.739403,0.137143,0.126378,1.555308,-1.363175,1980,,0.000000,0.037701,0.034304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267174,2024-08-31,92322,ULTA BEAUTY INC,90384S30,11,3,5999,Shops,ULTA,178704,...,-0.264944,0.667977,0.164541,-0.549524,0.562486,2024,-0.293240,-1.020220,-0.009293,0.098823
267175,2024-09-30,92322,ULTA BEAUTY INC,90384S30,11,3,5999,Shops,ULTA,178704,...,-0.301633,0.657322,0.159835,-0.277798,0.572679,2024,-0.375227,-1.229321,0.010808,-0.055658
267176,2024-10-31,92322,ULTA BEAUTY INC,90384S30,11,3,5999,Shops,ULTA,178704,...,-0.053729,0.795199,0.177672,0.015673,0.662640,2024,-0.276504,-1.263369,-0.028337,0.043862
267177,2024-11-30,92322,ULTA BEAUTY INC,90384S30,11,3,5999,Shops,ULTA,178704,...,-0.122171,0.742341,0.174624,-0.113983,0.716745,2024,-0.112173,-1.567353,0.073804,0.121196


In [9]:
results_df, metrics = backtest_nodewise_gmv(
    df_filtered,
    test_start_date='2020-01-31',  # Last date of training period
    test_end_date='2024-11-30',   # Last date of testing period
    lookback_window=180,
    transaction_cost=0.001
)
print(f"Sharpe Ratio: {metrics['sharpe_ratio']:.4f}")
print(f"Annualized Sharpe Ratio: {metrics['annual_sharpe_ratio']:.4f}")

STARTING BACKTEST

[1/59] Date: 2020-01-31
  Window: 2005-01-31 to 2019-12-31
  Assets: 28 with complete data
  Running Nodewise Regression...
  Computing GMV weights...
  Gross: -0.09034 | Turnover: 1.1206 | TC: 0.001121 | Net: -0.09146

[2/59] Date: 2020-02-29
  Window: 2005-02-28 to 2020-01-31
  Assets: 28 with complete data
  Running Nodewise Regression...
  Computing GMV weights...
  Gross: -0.16274 | Turnover: 0.0519 | TC: 0.000043 | Net: -0.16279

[3/59] Date: 2020-03-31
  Window: 2005-03-31 to 2020-02-29
  Assets: 28 with complete data
  Running Nodewise Regression...
  Computing GMV weights...
  Gross:  0.08449 | Turnover: 0.1283 | TC: 0.000139 | Net:  0.08435

[4/59] Date: 2020-04-30
  Window: 2005-04-30 to 2020-03-31
  Assets: 28 with complete data
  Running Nodewise Regression...
  Computing GMV weights...
  Gross:  0.02032 | Turnover: 0.0769 | TC: 0.000078 | Net:  0.02024

[5/59] Date: 2020-05-31
  Window: 2005-05-31 to 2020-04-30
  Assets: 27 with complete data
  Running 

In [133]:

with pd.option_context("display.max_rows", None):
    print(results_df['portfolio_weights'][0])

{71563: 0.046143226137337984, 11404: 0.12685064089769094, 59408: -0.012886094656159485, 85269: 0.0141167746368508, 47896: 0.06642555612494488, 60442: 0.06450228507596885, 81055: 0.0248985479751954, 87842: 0.05850088324891889, 69032: 0.017580664924571717, 66093: 0.095159059312691, 57904: 0.04705663787272832, 27959: 0.14337562372941687, 34746: -0.005332003750249765, 59459: 0.0973302505871284, 28484: 0.017675852878690914, 78916: 0.03966567853146244, 86868: 0.03237267882707811, 26710: 0.10338734372557407, 82775: -0.010512069999471836, 79323: 0.07209574324787528, 89195: -0.02184729992856469, 66800: -0.007948713766046368, 70519: -0.008611265633632726}


In [134]:
metrics['total_return']

0.5472443390053199

In [135]:
metrics['variance']

0.0030387906845717964

In [136]:
results_df['portfolio_turnover'].mean()

0.07637773521915324