In [112]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')


def naive_nodewise_regression(Y_star, lambda_grid=None):
    """
    Implements Naive Nodewise Regression (Section 5.1.2).
    Uses GIC (Generalized Information Criterion) as in the paper.
    
    Parameters:
    -----------
    Y_star : np.ndarray, shape (n, p)
        Demeaned returns matrix (time x assets)
    lambda_grid : list or None
        Grid of lambda values to try. If None, creates default grid.
    
    Returns:
    --------
    Theta_hat : np.ndarray, shape (p, p)
        Estimated precision matrix
    """
    n, p = Y_star.shape

    # Initialize matrices
    Theta_hat = np.zeros((p, p))
    tau_squared = np.zeros(p)
    
    # Create lambda grid if not provided
    if lambda_grid is None:
        lambda_grid = np.logspace(-3, 1, 50)
    
    # For each asset j
    for j in range(p):
        # Step 1: Get y_j (target) and Y_{-j} (predictors)
        y_j = Y_star[:, j]
        Y_minus_j = np.delete(Y_star, j, axis=1)
        
        # Step 2-3: Estimate gamma_j using Lasso with GIC
        # GIC(λ) = log(σ²_λ) + |S_λ| * (log(p-1) / n) * log(log(n))
        best_gic = np.inf
        best_lambda = lambda_grid[0]
        best_gamma = None
        best_ssr = None
        
        for lam in lambda_grid:
            lasso = Lasso(alpha=2*lam, fit_intercept=False, max_iter=10000)
            lasso.fit(Y_minus_j, y_j)
            gamma_j = lasso.coef_
            
            # Compute SSR and number of non-zero coefficients
            residuals = y_j - Y_minus_j @ gamma_j
            ssr = np.sum(residuals ** 2)
            sigma_sq_lambda = ssr / n
            q_lambda = np.sum(gamma_j != 0)
            
            # Compute GIC
            if sigma_sq_lambda > 1e-10: # Check for non-zero variance
                # GIC formula from paper
                gic = np.log(sigma_sq_lambda) + q_lambda * (np.log(p - 1) / n) * np.log(np.log(n))
            else:
                gic = np.inf
            
            if gic < best_gic:
                best_gic = gic
                best_lambda = lam
                best_gamma = gamma_j.copy()
                best_ssr = ssr
        
        gamma_j_star = best_gamma
        

        tau_squared[j] = best_ssr / n + best_lambda * np.sum(np.abs(gamma_j_star))

        # [cite_start]Step 5: Form the j-th row of Theta_hat [cite: 579, 543-547]
        Theta_hat[j, j] = 1 / tau_squared[j]
        off_diag = -gamma_j_star / tau_squared[j]
        Theta_hat[j, :j] = off_diag[:j]
        Theta_hat[j, j+1:] = off_diag[j:]
    
    # Step 6: Symmetrize
    #Theta_hat_sym = (Theta_hat + Theta_hat.T) / 2
    
    return Theta_hat


def gmv_weights(Theta_hat, p):
    """
    Compute Global Minimum Variance (GMV) portfolio weights (Section 6.1).
    """
    ones_p = np.ones(p)
    
    # [cite_start]w* = (Θ 1_p) / (1_p' Θ 1_p) [cite: 818-819]
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    w_star = numerator / denominator
    
    return w_star


def compute_portfolio_metrics(returns, weights):
    """
    Compute portfolio return, variance, and Sharpe ratio.
    """

    portfolio_returns = returns @ weights
    mean_return = np.mean(portfolio_returns)
    variance = np.var(portfolio_returns, ddof=1)
    sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
    
    return {
        'return': mean_return,
        'variance': variance,
        'sharpe_ratio': sharpe_ratio
    }


def backtest_nodewise_gmv(df, test_start_date='2000-01-31', test_end_date='2003-12-31',
                          lookback_window=180,
                          transaction_cost=0.001,
                          verbose=True): # <-- Added verbose parameter
    """
    Backtest Nodewise + GMV strategy with monthly rebalancing,
    180-month rolling window, and NaN filtering as per the paper.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: permno, datadate, ret_fwd_1
    test_start_date : str
        First date for *out-of-sample* returns (format: 'YYYY-MM-DD')
    test_end_date : str
        Last date for *out-of-sample* returns (format: 'YYYY-MM-DD')
    lookback_window : int
        Number of months in rolling training window (default: 180)
    transaction_cost : float
        Proportional transaction cost (default: 0.005 = 50 bps)
    verbose : bool
        If True, prints detailed log at each time step.
    
    Returns:
    --------
    results_df : pd.DataFrame
        DataFrame with columns: date, portfolio_return, cumulative_return
    metrics : dict
        Overall performance metrics
    """
    # --- 1. Setup ---
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])
    
    # Get unique dates and all assets
    all_dates = sorted(df['datadate'].unique())
    all_assets = sorted(df['permno'].unique())
    p_full = len(all_assets)
    
    # Convert test dates to datetime
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    # Find date indices
    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}")
    
    if test_start_idx < lookback_window:
        raise ValueError(f"Not enough data for lookback. Test start date {test_start_date} "
                         f"requires data back to {all_dates[test_start_idx - lookback_window]}, "
                         f"but only {test_start_idx} periods are available.")
    
    # Storage for results
    portfolio_returns = []
    portfolio_dates = []
    portfolio_weights = []
    portfolio_turnover= []
    
    # Initial weights (full universe, equal-weighted)
    prev_weights = np.ones(p_full) / p_full
    last_oos_returns = np.zeros(p_full)
    last_gross_return = 0.0
    
    # --- 2. Rolling Window Backtest ---
    if verbose:
        print("--- Starting Backtest ---")
        
    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        
        # Define the 180-month (or lookback_window) in-sample period
        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]
        
        # Get training data for this window
        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date)]
        
        # Pivot to get returns matrix (time x assets)
        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        
        # Reindex to ensure all dates are present (fills missing dates with NaNs)
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates, columns=all_assets)
        
        # **Filter assets with any NaNs in this window**
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)
        
        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        n_train, p_current = Y.shape
        
        new_weights = np.zeros(p_full) # Start with 0 weights for all assets

        if verbose:
            print("\n" + "="*50)
            print(f"Processing Date: {current_date.strftime('%Y-%m-%d')}")
            print(f"  In-Sample Window: {window_start_date.strftime('%Y-%m-%d')} to {window_end_date.strftime('%Y-%m-%d')}")
            print(f"  Asset Universe: {p_full} total, {p_current} with complete data in window")

        # Check for valid data panel
        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  Skipping estimation: insufficient data (n={n_train}, p={p_current}). Carrying forward weights.")
            new_weights = prev_weights 
        else:
            try:
                # Demean the returns
                Y_bar = Y.mean(axis=0)
                Y_star = Y - Y_bar
                
                if verbose: print(f"  Running Naive Nodewise for {p_current} assets...")
                Theta_hat = naive_nodewise_regression(Y_star)
                
                if verbose: print("  Calculating GMV weights...")
                w_star_filtered = gmv_weights(Theta_hat, p_current)
                
                # Map weights back to full asset universe
                new_weights_series = pd.Series(0.0, index=all_assets)
                new_weights_series.loc[current_assets] = w_star_filtered
                new_weights = new_weights_series.values
                
            except Exception as e:
                if verbose:
                    print(f"  Error at {current_date}: {e}. Carrying forward weights.")
                new_weights = prev_weights

        # Normalize weights to sum to 1
        weight_sum = np.sum(new_weights)
        if np.abs(weight_sum) > 1e-6:
            new_weights = new_weights / weight_sum
        else:
            if verbose: print("  Warning: Zero weight sum, carrying forward weights.")
            new_weights = prev_weights
        
        # --- 3. OOS Calculation & Transaction Costs ---
        
        # Get out-of-sample returns for current month (full universe)
        oos_data = df[df['datadate'] == current_date]
        oos_returns_pivot = oos_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        oos_returns_pivot = oos_returns_pivot.reindex(columns=all_assets)
        oos_returns = oos_returns_pivot.values[0]
        valid = ~np.isnan(oos_returns)
        oos_returns = np.nan_to_num(oos_returns, nan=0.0)
        
        # Set weights of missing-return stocks to 0 and re-normalize
        new_weights = new_weights * valid
        if new_weights.sum() != 0:
            new_weights /= new_weights.sum()
                
        # Compute gross portfolio return (y_P,t+1)
        gross_return = new_weights @ oos_returns
        
        # Adjust previous weights (w_t,j+) as per paper
        adj_prev_weights = prev_weights * (1 + last_oos_returns) / (1 + last_gross_return)
        
        # Compute turnover and transaction costs
        turnover = np.sum(np.abs(new_weights - adj_prev_weights))
        tc = transaction_cost * (1 + gross_return) * turnover
        
        # Net return (after transaction costs)
        net_return = gross_return - tc
        
        # Store results
        portfolio_returns.append(net_return)
        portfolio_dates.append(current_date)
        w_temp = pd.Series(new_weights, index=all_assets)
        w_temp = w_temp[w_temp != 0]
        portfolio_weights.append(w_temp)
        portfolio_turnover.append(turnover)
        
        # Update "previous" variables for next loop
        prev_weights = new_weights.copy()
        last_oos_returns = oos_returns.copy()
        last_gross_return = gross_return
        
        if verbose:
            print(f"  Gross Return: {gross_return: .6f}")
            print(f"  Turnover:     {turnover: .6f}")
            print(f"  Trans. Cost:  {tc: .6f}")
            print(f"  Net Return:   {net_return: .6f}")
            
            print(portfolio_weights[-1])

    if verbose:
        print("="*50)
        print("--- Backtest Complete ---")
    
    # --- 4. Final Metrics ---
    results_df = pd.DataFrame({
        'date': portfolio_dates,
        'portfolio_return': portfolio_returns,
        'portfolio_weights': portfolio_weights,
        'portfolio_turnover': portfolio_turnover
    })
    results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
    
    # Compute overall metrics
    mean_return = np.mean(portfolio_returns)
    variance = np.var(portfolio_returns, ddof=1)
    sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
    
    # Annualized metrics (monthly data)
    annual_return = mean_return * 12
    annual_volatility = np.sqrt(variance * 12)
    annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
    
    metrics = {
        'mean_return': mean_return,
        'variance': variance,
        'sharpe_ratio': sharpe_ratio,
        'annual_return': annual_return,
        'annual_volatility': annual_volatility,
        'annual_sharpe_ratio': annual_sharpe,
        'total_return': results_df['cumulative_return'].iloc[-1] if len(results_df) > 0 else 0
    }
    
    return results_df, metrics

In [78]:
# Assuming you have loaded your dataframe as 'df'
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = (df.groupby('permno')['ret_excess'].shift(-1) )

In [108]:
buys = pd.read_csv('buys.csv', index_col=1)
sells = pd.read_csv('sells.csv', index_col=1)
buys.index.name='permno'
sells.index.name='permno'

In [109]:
buys_index = buys.index.astype(int)
sells_index = sells.index.astype(int)

index = buys_index #.union(sells_index)
index

Index([11404, 12369, 12558, 14776, 15707, 24053, 26710, 27959, 28484, 34746,
       47896, 52708, 57904, 59408, 59459, 60442, 66093, 66800, 69032, 70519,
       71563, 78916, 79323, 81055, 82775, 85269, 86868, 87842, 89195],
      dtype='int64', name='permno')

In [110]:
df_filtered = df[df['permno'].isin(index)]

In [111]:
df_filtered

Unnamed: 0,datadate,permno,comnam,ncusip,shrcd,exchcd,siccd,industry,ticker,gvkey_x,...,BETA,betasq,rsq1,pricedelay,idiovol,year,mom6,indmom,industry_return,ret_fwd_1
9178,1980-01-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.566032,0.223568,0.177811,-1.690356,-1.343182,1980,,0.000000,-0.001083,-0.069674
9179,1980-02-29,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.660300,0.195325,0.166120,-0.019633,-1.398064,1980,,0.000000,-0.040298,-0.006218
9180,1980-03-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.677291,0.182257,0.164222,-0.152879,-1.427993,1980,,0.000000,-0.060439,0.127751
9181,1980-04-30,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.751379,0.161145,0.131506,0.849219,-1.362464,1980,,0.000000,0.108856,0.034772
9182,1980-05-31,11404,CONSOLIDATED EDISON CO NY INC,20911110,11,1,4932,Utils,ED,3413,...,-1.739403,0.137143,0.126378,1.555308,-1.363175,1980,,0.000000,0.037701,0.034304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257113,2024-08-31,89195,PRINCIPAL FINANCIAL GROUP INC,74251V10,11,3,6321,Money,PFG,145701,...,0.498810,1.267343,0.468943,-0.405290,-0.787679,2024,0.021091,1.093926,0.038455,0.059866
257114,2024-09-30,89195,PRINCIPAL FINANCIAL GROUP INC,74251V10,11,3,6321,Money,PFG,145701,...,0.522321,1.285254,0.465807,-0.071802,-0.771478,2024,-0.002934,1.438417,0.004272,-0.044645
257115,2024-10-31,89195,PRINCIPAL FINANCIAL GROUP INC,74251V10,11,3,6321,Money,PFG,145701,...,0.597026,1.325183,0.483067,-0.030404,-0.810549,2024,-0.013883,0.950104,-0.001965,0.052917
257116,2024-11-30,89195,PRINCIPAL FINANCIAL GROUP INC,74251V10,11,3,6321,Money,PFG,145701,...,0.638789,1.362859,0.505670,0.054618,-0.836565,2024,0.032738,0.975252,0.081336,-0.106467


In [113]:
results_df, metrics = backtest_nodewise_gmv(
    df_filtered,
    test_start_date='2020-01-31',  # Last date of training period
    test_end_date='2024-11-30',   # Last date of testing period
    lookback_window=180,
    transaction_cost=0.001
)
print(f"Sharpe Ratio: {metrics['sharpe_ratio']:.4f}")
print(f"Annualized Sharpe Ratio: {metrics['annual_sharpe_ratio']:.4f}")

--- Starting Backtest ---

Processing Date: 2020-01-31
  In-Sample Window: 2005-01-31 to 2019-12-31
  Asset Universe: 29 total, 23 with complete data in window
  Running Naive Nodewise for 23 assets...
  Calculating GMV weights...
  Gross Return: -0.103981
  Turnover:      0.901052
  Trans. Cost:   0.000807
  Net Return:   -0.104788
11404    0.126851
26710    0.093014
27959    0.143376
28484    0.016323
34746    0.006899
47896    0.049573
57904    0.036617
59408    0.009008
59459    0.093313
60442    0.050682
66093    0.095159
66800   -0.000696
69032    0.017454
70519    0.011435
71563    0.036523
78916    0.039666
79323    0.059619
81055    0.023652
82775    0.005366
85269    0.012565
86868    0.030313
87842    0.039927
89195    0.003361
dtype: float64

Processing Date: 2020-02-29
  In-Sample Window: 2005-02-28 to 2020-01-31
  Asset Universe: 29 total, 23 with complete data in window
  Running Naive Nodewise for 23 assets...
  Calculating GMV weights...
  Gross Return: -0.182392
  Tur

In [87]:

with pd.option_context("display.max_rows", None):
    print(results_df['portfolio_weights'][0])

10104    0.014157
10107    0.013370
10145    0.020299
10696    0.018943
11308    0.032671
11850    0.021576
12060    0.007817
12490    0.017076
13856    0.035770
13901    0.020899
14008    0.011004
14541    0.017758
14593    0.006272
15579    0.012480
17005    0.013289
17830    0.022510
18163    0.032076
18542    0.006204
19393    0.012741
19561    0.010228
20482    0.022690
21178    0.019130
21936    0.020964
22111    0.036158
22592    0.017865
22752    0.016301
24205    0.025857
24643    0.003239
26403    0.015860
27828    0.008145
27983    0.005226
38703    0.010461
43449    0.030656
46578    0.027846
47896    0.012712
48725    0.013920
49373    0.008603
50876    0.020438
52230    0.007426
54148    0.001972
55976    0.024941
57665    0.014694
57817    0.004701
59010    0.004981
59176    0.005870
59328    0.012143
59408    0.001888
60097    0.017366
61399    0.008981
62092    0.015380
64282    0.003536
65875    0.023569
66093    0.024109
66157    0.016225
66181    0.018219
69032    0

In [114]:
metrics['total_return']

0.5492286216286506

In [116]:
metrics['variance']

0.0031702314687692373

In [117]:
results_df['portfolio_turnover'].mean()

0.06106678375597444