In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')


def naive_nodewise_regression(Y_star, lambda_grid=None):
    """
    Implements Naive Nodewise Regression (Section 5.1.2).
    Uses GIC (Generalized Information Criterion) as in the paper.
    
    Parameters:
    -----------
    Y_star : np.ndarray, shape (n, p)
        Demeaned returns matrix (time x assets)
    lambda_grid : list or None
        Grid of lambda values to try. If None, creates default grid.
    
    Returns:
    --------
    Theta_hat : np.ndarray, shape (p, p)
        Estimated precision matrix
    """
    n, p = Y_star.shape

    # Initialize matrices
    Theta_hat = np.zeros((p, p))
    tau_squared = np.zeros(p)
    
    # Create lambda grid if not provided
    if lambda_grid is None:
        lambda_grid = np.logspace(-3, 1, 50)
    
    # For each asset j
    for j in range(p):
        # Step 1: Get y_j (target) and Y_{-j} (predictors)
        y_j = Y_star[:, j]
        Y_minus_j = np.delete(Y_star, j, axis=1)
        
        # Step 2-3: Estimate gamma_j using Lasso with GIC
        best_gic = np.inf
        best_lambda = lambda_grid[0]
        best_gamma = None
        best_ssr = None
        
        for lam in lambda_grid:
            lasso = Lasso(alpha=2*lam, fit_intercept=False, max_iter=10000)
            lasso.fit(Y_minus_j, y_j)
            gamma_j = lasso.coef_
            
            # Compute SSR and number of non-zero coefficients
            residuals = y_j - Y_minus_j @ gamma_j
            ssr = np.sum(residuals ** 2)
            sigma_sq_lambda = ssr / n
            q_lambda = np.sum(np.abs(gamma_j) > 1e-8)
            
            # Compute GIC
            if sigma_sq_lambda > 1e-10:
                gic = np.log(sigma_sq_lambda) + q_lambda * (np.log(p) / n) * np.log(np.log(n))
            else:
                gic = np.inf
            
            if gic < best_gic:
                best_gic = gic
                best_lambda = lam
                best_gamma = gamma_j.copy()
                best_ssr = ssr
        
        gamma_j_star = best_gamma
        tau_squared[j] = best_ssr / n + best_lambda * np.sum(np.abs(gamma_j_star))

        # Form the j-th row of Theta_hat
        Theta_hat[j, j] = 1 / tau_squared[j]
        off_diag = -gamma_j_star / tau_squared[j]
        Theta_hat[j, :j] = off_diag[:j]
        Theta_hat[j, j+1:] = off_diag[j:]
    
    # Symmetrize
    Theta_hat_sym = (Theta_hat + Theta_hat.T) / 2
    
    return Theta_hat_sym


def gmv_weights(Theta_hat):
    """
    Compute Global Minimum Variance (GMV) portfolio weights.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # w* = (Θ 1_p) / (1_p' Θ 1_p)
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    if np.abs(denominator) < 1e-10:
        return ones_p / p
    
    w_star = numerator / denominator
    return w_star


def mv_weights(Theta_hat, mu, target_return=0.01):
    """
    Compute Mean-Variance portfolio weights with target return.
    
    Solves: min w' Sigma w  subject to  w' mu = target_return  and  w' 1 = 1
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected returns
    target_return : float
        Target portfolio return (default: 0.01 = 1% monthly)
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute key quantities
    A = ones_p @ Theta_hat @ ones_p
    B = ones_p @ Theta_hat @ mu
    C = mu @ Theta_hat @ mu
    D = A * C - B * B
    
    # Check for singularity
    if np.abs(D) < 1e-10:
        if np.abs(A) > 1e-10:
            w_star = (Theta_hat @ ones_p) / A
            return w_star
        else:
            return ones_p / p
    
    # Compute Lagrange multipliers
    lambda1 = (C - B * target_return) / D
    lambda2 = (A * target_return - B) / D
    
    # Compute weights
    w_star = lambda1 * (Theta_hat @ ones_p) + lambda2 * (Theta_hat @ mu)
    
    return w_star


def msr_weights(Theta_hat, mu):
    """
    Compute Maximum Sharpe Ratio portfolio weights.
    
    Solution: w ∝ Theta mu, normalized to sum to 1
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected excess returns
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights (sum to 1)
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute unnormalized weights
    w_unnorm = Theta_hat @ mu
    
    # Normalize to sum to 1
    weight_sum = np.sum(w_unnorm)
    
    if np.abs(weight_sum) < 1e-10:
        return ones_p / p
    
    w_star = w_unnorm / weight_sum
    return w_star


def compute_portfolio_metrics(returns, weights):
    """
    Compute portfolio return, variance, and Sharpe ratio.
    """
    portfolio_returns = returns @ weights
    mean_return = np.mean(portfolio_returns)
    variance = np.var(portfolio_returns, ddof=1)
    sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
    
    return {
        'return': mean_return,
        'variance': variance,
        'sharpe_ratio': sharpe_ratio
    }


def backtest_nodewise_gmv(df, 
                                  test_start_date='2000-01-31', 
                                  test_end_date='2003-12-31',
                                  lookback_window=180,
                                  transaction_cost=0.005,
                                  mv_target_return=0.01,
                                  verbose=True):
    """
    Backtest GMV, MV, and MSR portfolios with monthly rebalancing,
    180-month rolling window, and NaN filtering.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: permno, datadate, ret_fwd_1
    test_start_date : str
        First date for out-of-sample returns
    test_end_date : str
        Last date for out-of-sample returns
    lookback_window : int
        Number of months in rolling training window (default: 180)
    transaction_cost : float
        Proportional transaction cost (default: 0.005 = 50 bps)
    mv_target_return : float
        Target return for MV portfolio (default: 0.01 = 1% monthly)
    verbose : bool
        If True, prints detailed log at each time step.
    
    Returns:
    --------
    results_df : pd.DataFrame
        DataFrame with results for all three portfolios
    metrics : dict
        Performance metrics for all three portfolios
    """
    # --- 1. Setup ---
    df = df.copy()
    if 'datadate' not in df.columns or 'permno' not in df.columns:
        raise ValueError("DataFrame must have 'datadate' and 'permno' columns")
    df['datadate'] = pd.to_datetime(df['datadate'])
    
    # Get unique dates
    all_dates = sorted(df['datadate'].unique())
    
    # Convert test dates to datetime
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    # Find date indices
    try:
        test_start_idx = all_dates.index(test_start_dt)
        test_end_idx = all_dates.index(test_end_dt)
    except ValueError as e:
        raise ValueError(f"Date not found in DataFrame: {e}")
    
    if test_start_idx < lookback_window:
        raise ValueError(f"Not enough data for lookback.")
    
    # Storage for results - one dict per portfolio type
    portfolios = {
        'GMV': {'returns': [], 'weights': [], 'turnover': [], 'gross_returns': [],
                'prev_weights': {}, 'prev_oos_returns': {}, 'prev_gross_return': 0.0},
        'MV': {'returns': [], 'weights': [], 'turnover': [], 'gross_returns': [],
               'prev_weights': {}, 'prev_oos_returns': {}, 'prev_gross_return': 0.0},
        'MSR': {'returns': [], 'weights': [], 'turnover': [], 'gross_returns': [],
                'prev_weights': {}, 'prev_oos_returns': {}, 'prev_gross_return': 0.0}
    }
    portfolio_dates = []
    
    # --- 2. Rolling Window Backtest ---
    if verbose:
        print("="*60)
        print("STARTING BACKTEST - GMV, MV, MSR PORTFOLIOS")
        print("="*60)
        
    for t in range(test_start_idx, test_end_idx + 1):
        current_date = all_dates[t]
        
        # Define the lookback window
        window_start_date = all_dates[t - lookback_window]
        window_end_date = all_dates[t - 1]
        
        # Get training data
        train_data = df[(df['datadate'] >= window_start_date) & 
                        (df['datadate'] <= window_end_date)]
        
        # Pivot to get returns matrix
        returns_pivot = train_data.pivot(index='datadate', columns='permno', values='ret_fwd_1')
        
        # Reindex to ensure all dates are present
        window_dates = all_dates[t - lookback_window : t]
        returns_pivot = returns_pivot.reindex(index=window_dates)
        
        # Filter assets with any NaNs
        nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
        filtered_pivot = returns_pivot.drop(columns=nan_assets)
        
        current_assets = filtered_pivot.columns.tolist()
        Y = filtered_pivot.values
        n_train, p_current = Y.shape

        if verbose:
            print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                  f"Date: {current_date.strftime('%Y-%m-%d')}")
            print(f"  Window: {window_start_date.strftime('%Y-%m-%d')} to "
                  f"{window_end_date.strftime('%Y-%m-%d')}")
            print(f"  Assets: {p_current} with complete data")

        # Check for valid data
        if n_train < lookback_window or p_current < 2:
            if verbose:
                print(f"  ⚠ Insufficient data, using prev weights")
            weights_dict = {
                'GMV': portfolios['GMV']['prev_weights'].copy(),
                'MV': portfolios['MV']['prev_weights'].copy(),
                'MSR': portfolios['MSR']['prev_weights'].copy()
            }
        else:
            try:
                # Demean the returns
                Y_bar = Y.mean(axis=0)
                Y_star = Y - Y_bar
                
                if verbose:
                    print(f"  Running Nodewise Regression...")
                Theta_hat = naive_nodewise_regression(Y_star)
                
                if verbose:
                    print(f"  Computing portfolio weights...")
                
                # GMV weights
                w_gmv = gmv_weights(Theta_hat)
                
                # MV weights (need expected returns)
                mu = Y_bar  # Use sample mean as expected return
                w_mv = mv_weights(Theta_hat, mu, target_return=mv_target_return)
                
                # MSR weights
                w_msr = msr_weights(Theta_hat, mu)
                
                # Create weights dictionaries
                weights_dict = {
                    'GMV': {asset: w_gmv[i] for i, asset in enumerate(current_assets)},
                    'MV': {asset: w_mv[i] for i, asset in enumerate(current_assets)},
                    'MSR': {asset: w_msr[i] for i, asset in enumerate(current_assets)}
                }
                
            except Exception as e:
                if verbose:
                    print(f"  ✗ Error: {e}")
                    print(f"  Using previous weights")
                weights_dict = {
                    'GMV': portfolios['GMV']['prev_weights'].copy(),
                    'MV': portfolios['MV']['prev_weights'].copy(),
                    'MSR': portfolios['MSR']['prev_weights'].copy()
                }

        # --- 3. Process each portfolio type ---
        
        # Get OOS returns for current month
        oos_data = df[df['datadate'] == current_date]
        oos_returns_series = oos_data.set_index('permno')['ret_fwd_1']
        oos_returns_series = oos_returns_series.dropna()
        oos_returns_dict = oos_returns_series.to_dict()
        
        for port_name in ['GMV', 'MV', 'MSR']:
            new_weights_dict = weights_dict[port_name]
            port = portfolios[port_name]
            
            # Normalize weights to sum to 1
            weight_sum = sum(new_weights_dict.values())
            if weight_sum > 1e-10:
                new_weights_dict = {k: v/weight_sum for k, v in new_weights_dict.items()}
            else:
                new_weights_dict = port['prev_weights'].copy()
            
            # Find common assets
            common_assets = set(new_weights_dict.keys()) & set(oos_returns_dict.keys())
            
            if len(common_assets) == 0:
                continue
            
            # Filter to common assets and renormalize
            common_weights = {a: new_weights_dict[a] for a in common_assets}
            common_weight_sum = sum(common_weights.values())
            if common_weight_sum > 1e-10:
                common_weights = {k: v/common_weight_sum for k, v in common_weights.items()}
            else:
                continue
            
            # Compute gross portfolio return
            gross_return = sum(common_weights[a] * oos_returns_dict[a] for a in common_assets)
            
            if np.isnan(gross_return) or np.isinf(gross_return):
                continue
            
            # Calculate transaction costs
            if len(port['prev_weights']) > 0:
                # Adjust previous weights for returns
                adjusted_prev = {}
                for asset, prev_w in port['prev_weights'].items():
                    if asset in port['prev_oos_returns']:
                        prev_r = port['prev_oos_returns'][asset]
                        if abs(1 + port['prev_gross_return']) > 1e-6:
                            adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + port['prev_gross_return'])
                        else:
                            adjusted_prev[asset] = 0.0
                    else:
                        if abs(1 + port['prev_gross_return']) > 1e-6:
                            adjusted_prev[asset] = prev_w / (1 + port['prev_gross_return'])
                        else:
                            adjusted_prev[asset] = 0.0
                
                # Calculate turnover
                all_assets = set(adjusted_prev.keys()) | set(common_weights.keys())
                turnover = sum(abs(common_weights.get(a, 0.0) - adjusted_prev.get(a, 0.0)) 
                             for a in all_assets)
                tc = transaction_cost * (1 + gross_return) * turnover
            else:
                turnover = sum(abs(w) for w in common_weights.values())
                tc = transaction_cost * (1 + gross_return) * turnover
            
            # Net return
            net_return = gross_return - tc
            
            # Store results
            port['returns'].append(net_return)
            port['gross_returns'].append(gross_return)
            port['weights'].append(common_weights.copy())
            port['turnover'].append(turnover)
            
            # Update previous values
            port['prev_weights'] = common_weights.copy()
            port['prev_oos_returns'] = {a: oos_returns_dict[a] for a in common_assets}
            port['prev_gross_return'] = gross_return
            
            if verbose and port_name == 'GMV':  # Print once per timestep
                print(f"  {port_name}: Gross={gross_return:>8.5f} | Turnover={turnover:>6.4f} | "
                      f"TC={tc:>8.6f} | Net={net_return:>8.5f}")
        
        portfolio_dates.append(current_date)

    if verbose:
        print("\n" + "="*60)
        print("BACKTEST COMPLETE")
        print("="*60)
    
    # --- 4. Compile Results ---
    results_dfs = {}
    all_metrics = {}
    
    for port_name in ['GMV', 'MV', 'MSR']:
        port = portfolios[port_name]
        
        results_df = pd.DataFrame({
            'date': portfolio_dates[:len(port['returns'])],
            'portfolio_return': port['returns'],
            'portfolio_gross_return': port['gross_returns'],
            'portfolio_weights': port['weights'],
            'portfolio_turnover': port['turnover']
        })
        results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
        results_dfs[port_name] = results_df
        
        # Compute metrics
        if len(port['returns']) > 0:
            mean_return = np.mean(port['returns'])
            variance = np.var(port['returns'], ddof=1)
            sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
            
            annual_return = mean_return * 12
            annual_volatility = np.sqrt(variance * 12)
            annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
            
            all_metrics[port_name] = {
                'mean_return': mean_return,
                'variance': variance,
                'sharpe_ratio': sharpe_ratio,
                'annual_return': annual_return,
                'annual_volatility': annual_volatility,
                'annual_sharpe_ratio': annual_sharpe,
                'total_return': results_df['cumulative_return'].iloc[-1],
                'avg_turnover': np.mean(port['turnover']),
                'n_periods': len(port['returns'])
            }
        else:
            all_metrics[port_name] = {
                'mean_return': 0, 'variance': 0, 'sharpe_ratio': 0,
                'annual_return': 0, 'annual_volatility': 0, 'annual_sharpe_ratio': 0,
                'total_return': 0, 'avg_turnover': 0, 'n_periods': 0
            }
    
    return results_dfs, all_metrics

ERROR! Session/line number was not unique in database. History logging moved to new session 594


In [8]:
# Assuming you have loaded your dataframe as 'df'
df = pd.read_csv('../../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = (df.groupby('permno')['ret_excess'].shift(-1) )

In [11]:
results_dfs, all_metrics = backtest_nodewise_gmv(
    df,
    test_start_date='2015-01-31',  # Last date of training period
    test_end_date='2024-04-30',   # Last date of testing period
    lookback_window=180,
    transaction_cost=0.001
)

# Access GMV results
gmv_df = results_dfs['GMV']
gmv_metrics = all_metrics['GMV']

# Access MV results
mv_df = results_dfs['MV']
mv_metrics = all_metrics['MV']

# Access MSR results
msr_df = results_dfs['MSR']
msr_metrics = all_metrics['MSR']

# Compare Sharpe ratios
print(f"GMV Sharpe: {gmv_metrics['annual_sharpe_ratio']:.4f}")
print(f"MV Sharpe: {mv_metrics['annual_sharpe_ratio']:.4f}")
print(f"MSR Sharpe: {msr_metrics['annual_sharpe_ratio']:.4f}")

STARTING BACKTEST - GMV, MV, MSR PORTFOLIOS

[1/112] Date: 2015-01-31
  Window: 2000-01-31 to 2014-12-31
  Assets: 242 with complete data
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV: Gross= 0.03767 | Turnover=1.1208 | TC=0.001163 | Net= 0.03651

[2/112] Date: 2015-02-28
  Window: 2000-02-29 to 2015-01-31
  Assets: 242 with complete data
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV: Gross=-0.01051 | Turnover=0.0904 | TC=0.000089 | Net=-0.01060

[3/112] Date: 2015-03-31
  Window: 2000-03-31 to 2015-02-28
  Assets: 240 with complete data
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV: Gross=-0.00385 | Turnover=0.0730 | TC=0.000073 | Net=-0.00392

[4/112] Date: 2015-04-30
  Window: 2000-04-30 to 2015-03-31
  Assets: 242 with complete data
  Running Nodewise Regression...
  Computing portfolio weights...
  GMV: Gross= 0.01231 | Turnover=0.0644 | TC=0.000065 | Net= 0.01225

[5/112] Date: 2015-05-31
  Window: 2000

In [4]:
metrics['mean_return']*12

0.09262884549437062

In [5]:
metrics['variance']*12

0.01879919367787151

In [6]:
results_df['portfolio_turnover'].mean()

0.07873411540966938