In [1]:
def mv_weights(Theta_hat, mu, target_return=0.01):
    """
    Compute Mean-Variance portfolio weights with target return.
    
    Solves the constrained optimization:
    min w' Sigma w  subject to  w' mu = target_return  and  w' 1 = 1
    
    Solution uses Lagrange multipliers with two constraints.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected returns
    target_return : float
        Target portfolio return (default: 0.01 = 1% monthly)
    long_only : bool
        If True, falls back to GMV if MV produces negative weights
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute key quantities
    A = ones_p @ Theta_hat @ ones_p  # 1' Theta 1
    B = ones_p @ Theta_hat @ mu       # 1' Theta mu  
    C = mu @ Theta_hat @ mu           # mu' Theta mu
    D = A * C - B * B                  # Determinant
    
    # Check for singularity
    if np.abs(D) < 1e-10:
        print('SINGULARITY')
        # System is singular, use GMV instead
        if np.abs(A) > 1e-10:
            w_star = (Theta_hat @ ones_p) / A
            return w_star
        else:
            return ones_p / p
    
    
    # Compute Lagrange multipliers
    lambda1 = (C - B * target_return) / D
    lambda2 = (A * target_return - B) / D
    
    # Compute weights: w = lambda1 * Theta^{-1} 1 + lambda2 * Theta^{-1} mu
    w_star = lambda1 * (Theta_hat @ ones_p) + lambda2 * (Theta_hat @ mu)
    
    return w_star

def msr_weights(Theta_hat, mu):
    """
    Compute Maximum Sharpe Ratio portfolio weights.
    
    The maximum Sharpe ratio portfolio solves:
    max (w' mu) / sqrt(w' Sigma w)
    
    Solution (when mu represents excess returns):
    w ∝ Sigma^{-1} mu = Theta mu
    
    Then normalize so that sum(w) = 1.
    
    Parameters:
    -----------
    Theta_hat : np.ndarray, shape (p, p)
        Precision matrix (Sigma^{-1})
    mu : np.ndarray, shape (p,)
        Expected excess returns
    
    Returns:
    --------
    w_star : np.ndarray, shape (p,)
        Portfolio weights (sum to 1)
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    # Compute unnormalized weights: w ∝ Theta mu
    w_unnorm = Theta_hat @ mu
    
    # Normalize to sum to 1
    weight_sum = np.sum(w_unnorm)
    
    if np.abs(weight_sum) < 1e-10:
        print('WARNING: Weight sum near zero, returning equal weights')
        return ones_p / p
    
    w_star = w_unnorm / weight_sum
    
    return w_star

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
import warnings
warnings.filterwarnings('ignore')


def train_logistic_regression(df, train_start, train_end, features=['mom12m', 'mve', 'bm']):
    """
    Train logistic regression on historical data to predict positive returns.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Full dataset with features and returns
    train_start : str or pd.Timestamp
        Start date for training window
    train_end : str or pd.Timestamp
        End date for training window
    features : list
        List of feature column names
    
    Returns:
    --------
    log_reg : LogisticRegression
        Trained model
    scaler : StandardScaler
        Fitted scaler for features
    """
    train_df = df[(df['datadate'] >= train_start) & (df['datadate'] <= train_end)].copy()
    
    # Create binary target: 1 if positive return, 0 otherwise
    train_df['target'] = (train_df['ret_fwd_1'] > 0).astype(int)
    
    # Remove rows with missing values
    train_df = train_df.dropna(subset=features + ['target'])
    
    # Prepare features
    X_train = train_df[features]
    y_train = train_df['target']
    
    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Train logistic regression
    log_reg = LogisticRegression(random_state=42, max_iter=1000)
    log_reg.fit(X_train_scaled, y_train)
    
    print(f"  Training samples: {len(train_df)}")
    print(f"  Training accuracy: {log_reg.score(X_train_scaled, y_train):.4f}")
    
    return log_reg, scaler


def select_stocks_with_logistic(df, predict_date, log_reg, scaler, 
                                 features=['mom12m', 'mve', 'bm'],
                                 method='top_n', n_stocks=100, threshold=0.5):
    """
    Use trained logistic regression to select stocks for a given date.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Full dataset
    predict_date : str or pd.Timestamp
        Date to generate predictions for
    log_reg : LogisticRegression
        Trained model
    scaler : StandardScaler
        Fitted scaler
    features : list
        List of feature column names
    method : str
        Selection method: 'top_n', 'threshold', or 'top_and_bottom'
    n_stocks : int
        Number of stocks to select (used if method='top_n' or 'top_and_bottom')
    threshold : float
        Probability threshold (used if method='threshold')
    
    Returns:
    --------
    selected_permnos : list
        List of selected PERMNOs
    """
    predict_df = df[df['datadate'] == predict_date].copy()
    predict_df = predict_df.dropna(subset=features)
    
    if len(predict_df) == 0:
        print(f"  ⚠ No stocks with complete data on {predict_date}")
        return []
    
    # Prepare features for prediction
    X_predict = predict_df[features]
    X_predict_scaled = scaler.transform(X_predict)
    
    # Generate buy probabilities
    predict_df['buy_probability'] = log_reg.predict_proba(X_predict_scaled)[:, 1]
    
    # Select stocks based on method
    if method == 'top_n':
        selected_df = predict_df.nlargest(n_stocks, 'buy_probability')
    elif method == 'top_and_bottom':
        top_n = predict_df.nlargest(n_stocks, 'buy_probability')
        bottom_n = predict_df.nsmallest(n_stocks, 'buy_probability')
        selected_df = pd.concat([top_n, bottom_n])
    elif method == 'threshold':
        selected_df = predict_df[predict_df['buy_probability'] >= threshold]
    else:
        raise ValueError(f"Unknown method: {method}")
    
    selected_permnos = selected_df['permno'].tolist()
    
    print(f"  Stocks evaluated: {len(predict_df)}")
    print(f"  Stocks selected: {len(selected_permnos)}")
    print(f"  Buy probability range: [{predict_df['buy_probability'].min():.4f}, "
          f"{predict_df['buy_probability'].max():.4f}]")
    
    return selected_permnos


def est_ndwcov_factor(Y, factors, ic, lambda_min=True):
    """
    Estimate nodewise covariance with factor models using LASSO.
    
    Parameters:
    -----------
    Y : numpy.ndarray
        n x p matrix of observations
    factors : numpy.ndarray
        n x k matrix of factors
    ic : str
        Information criterion: 'WIC', 'BIC', 'GIC', 'AIC', or 'cv'
    lambda_min : bool
        If True and ic='cv', use lambda.min; otherwise use lambda.1se
        
    Returns:
    --------
    TAU : numpy.ndarray
        p x p precision matrix estimate
    """
    # Initialization
    p = Y.shape[1]
    n = Y.shape[0]
    C = np.zeros((p, p))
    np.fill_diagonal(C, 1)
    tau = []
    ns1 = np.ones((n, 1))
    
    # Fit factor model: Y = factors * beta + u
    # Add intercept to factors
    factors_with_intercept = np.column_stack([np.ones(n), factors])
    
    # Fit linear regression for each column of Y
    factormodel = LinearRegression(fit_intercept=False)
    factormodel.fit(factors_with_intercept, Y)
    
    # Get residuals and beta coefficients (excluding intercept)
    u = Y - factormodel.predict(factors_with_intercept)
    beta = factormodel.coef_[:, 1:]  # p x k matrix (excluding intercept)
    
    # Loop over the assets
    for j in range(p):
        # Create design matrix excluding column j
        X_j = np.delete(u, j, axis=1)
        y_j = u[:, j]
        
        if ic != 'cv':
            # Fit LASSO path
            alphas = np.logspace(-4, 1, 100)  # Create lambda sequence
            df_list = []
            sig_list = []
            bic_list = []
            coef_list = []
            res_list = []
            
            for alpha in alphas:
                model = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
                model.fit(X_j, y_j)
                
                # Predictions and residuals
                y_pred = model.predict(X_j)
                res = y_j - y_pred
                
                # Degrees of freedom (number of non-zero coefficients)
                df = np.sum(np.abs(model.coef_) > 1e-8)
                
                # Variance of residuals
                sig = np.sum(res**2) / n
                
                # Compute information criterion
                if ic == 'WIC':
                    bic_val = np.log(sig) + df * np.log(n) / n * np.log(np.log(p))
                elif ic == 'BIC':
                    bic_val = np.log(sig) + df * np.log(n) / n
                elif ic == 'GIC':
                    bic_val = np.log(sig) + df * np.log(p) * np.log(np.log(n)) / n
                elif ic == 'AIC':
                    bic_val = np.log(sig) + 2 * df
                else:
                    raise ValueError(f"Unknown IC: {ic}")
                
                df_list.append(df)
                sig_list.append(sig)
                bic_list.append(bic_val)
                coef_list.append(model.coef_.copy())
                res_list.append(res)
            
            # Select model with minimum IC
            jind = np.argmin(bic_list)
            jpar = coef_list[jind]
            jres = res_list[jind]
            jtau = np.sum(y_j * jres) / n
            
        else:  # Cross-validation
            lasso_cv = LassoCV(cv=5, fit_intercept=False, max_iter=10000, n_alphas=100)
            lasso_cv.fit(X_j, y_j)
            
            if lambda_min:
                # Use alpha that minimizes CV error (lambda.min equivalent)
                jfit = lasso_cv.predict(X_j)
                jpar = lasso_cv.coef_
            else:
                # Use alpha within 1 SE of minimum (lambda.1se equivalent)
                cv_scores = lasso_cv.mse_path_.mean(axis=1)
                cv_std = lasso_cv.mse_path_.std(axis=1)
                min_idx = np.argmin(cv_scores)
                threshold = cv_scores[min_idx] + cv_std[min_idx]
                
                # Find largest alpha with CV score below threshold
                valid_indices = np.where(cv_scores <= threshold)[0]
                se_idx = valid_indices[0] if len(valid_indices) > 0 else min_idx
                
                selected_alpha = lasso_cv.alphas_[se_idx]
                model_1se = Lasso(alpha=selected_alpha, fit_intercept=False, max_iter=10000)
                model_1se.fit(X_j, y_j)
                jfit = model_1se.predict(X_j)
                jpar = model_1se.coef_
            
            jres = y_j - jfit
            jtau = np.sum(y_j * jres) / n
        
        # Fill in C matrix
        # Insert coefficients back (accounting for missing j-th position)
        C_row = np.insert(-jpar / jtau, j, 0)
        C[j, :] = C_row
        tau.append(jtau)
    
    # Set diagonal
    np.fill_diagonal(C, 1 / np.array(tau))
    omega = C.copy()
    omegasym = (C + C.T) / 2
    
    # Compute factor covariance - ensure float64
    covft = (1/n) * (factors.T @ factors) - (1/(n**2)) * (factors.T @ ns1 @ ns1.T @ factors)
    covft = covft.astype(np.float64)
    
    # Ensure beta and omegasym are float64
    beta = beta.astype(np.float64)
    omegasym = omegasym.astype(np.float64)
    

    covft_inv = np.linalg.inv(covft)
    p1 = np.linalg.inv(covft_inv + beta.T @ omegasym @ beta)
    TAU = omega - omega @ beta @ p1 @ beta.T @ omega
    
    return TAU


def gmv_weights(Theta_hat):
    """
    Compute Global Minimum Variance portfolio weights.
    """
    p = Theta_hat.shape[0]
    ones_p = np.ones(p)
    
    numerator = Theta_hat @ ones_p
    denominator = ones_p @ Theta_hat @ ones_p
    
    if np.abs(denominator) < 1e-10:
        return ones_p / p
    
    w_star = numerator / denominator
    return w_star

def load_ff_factors(factors_path='factors_ff_monthly_raw.csv'):
    """
    Load Fama-French factors from CSV file.
    
    Parameters:
    -----------
    factors_path : str
        Path to the factors CSV file
    
    Returns:
    --------
    factors_df : pd.DataFrame
        DataFrame with date index and factor columns
    """
    factors_df = pd.read_csv(factors_path)
    
    # Convert month column (e.g., 192707) to datetime
    # This gives us the first day of the month (1927-07-01)
    factors_df['date'] = pd.to_datetime(factors_df.iloc[:, 0].astype(str), format='%Y%m')
    
    # Convert to end of month to match returns data
    factors_df['date'] = factors_df['date'] + pd.offsets.MonthEnd(0)
    
    # Set date as index and keep only factor columns
    factors_df = factors_df.set_index('date')[['Mkt-RF', 'SMB', 'HML']]
    
    # Convert to decimal form (assuming factors are in percentage points)
    factors_df = factors_df / 100
    
    return factors_df

def integrated_backtest(df,
                        factors_path='factors_ff_monthly_raw.csv',
                        ic='GIC',
                       test_start_date='2020-01-31',
                       test_end_date='2024-04-30',
                       logistic_train_years=15,
                       logistic_features=['mom12m', 'mve', 'bm'],
                       stock_selection_method='top_n',
                       n_stocks=100,
                       lookback_window=180,
                       transaction_cost=0.005,
                       portfolio_type='all',  # 'gmv', 'mv', 'msr', or 'all'
                       mv_target_return=0.01,
                       verbose=True):
    """
    Integrated backtest combining logistic regression stock selection 
    with nodewise regression portfolio optimization.
    
    Annual workflow:
    1. Train logistic regression on past 15 years (every January)
    2. Select stocks based on buy probability
    3. Run nodewise regression on selected stocks monthly until next retrain
    
    Parameters:
    -----------
    df : pd.DataFrame
        Full dataset with columns: datadate, permno, ret_fwd_1, mom12m, mve, bm
    test_start_date : str
        Start date for testing (format: 'YYYY-MM-DD')
    test_end_date : str
        End date for testing (format: 'YYYY-MM-DD')
    logistic_train_years : int
        Number of years to use for training logistic regression
    logistic_features : list
        Features for logistic regression
    stock_selection_method : str
        'top_n', 'threshold', or 'top_and_bottom'
    n_stocks : int
        Number of stocks to select
    lookback_window : int
        Number of months for nodewise regression rolling window
    transaction_cost : float
        Proportional transaction cost
    portfolio_type : str
        'gmv', 'mv', 'msr', or 'all' (compute all three portfolios)
    mv_target_return : float
        Target return for MV portfolio (default: 0.01 = 1% monthly)
    verbose : bool
        Print detailed logs
    
    Returns:
    --------
    results_dict : dict
        Dictionary with keys 'gmv', 'mv', 'msr' (depending on portfolio_type)
        Each contains: {'results_df': DataFrame, 'metrics': dict}
    """
    df = df.copy()
    df['datadate'] = pd.to_datetime(df['datadate'])

    # Load Fama-French factors
    factors_df = load_ff_factors(factors_path)
    
    all_dates = sorted(df['datadate'].unique())
    
    # Parse test dates
    test_start_dt = pd.to_datetime(test_start_date)
    test_end_dt = pd.to_datetime(test_end_date)
    
    # Extract years that need logistic regression retraining (every January in test period)
    start_year = test_start_dt.year
    end_year = test_end_dt.year
    test_years = list(range(start_year, end_year + 1))
    
    # Determine which portfolios to compute
    if portfolio_type == 'all':
        portfolio_types = ['gmv', 'mv', 'msr']
    else:
        portfolio_types = [portfolio_type]
    
    # Storage for each portfolio type
    results_storage = {ptype: {
        'returns': [],
        'dates': [],
        'weights_list': [],
        'turnover_list': [],
        'gross_returns': [],
        'prev_weights_dict': {},
        'prev_oos_returns_dict': {},
        'prev_gross_return': 0.0
    } for ptype in portfolio_types}
    
    # Track current stock universe
    current_permnos = []
    
    if verbose:
        print("="*70)
        print("INTEGRATED BACKTEST: LOGISTIC REGRESSION + NODEWISE")
        print(f"Test Period: {test_start_date} to {test_end_date}")
        print("="*70)
    
    for year in test_years:
        # Retrain logistic regression in January
        retrain_date = pd.to_datetime(f'{year}-01-31')
        
        if retrain_date not in all_dates:
            print(f"\n⚠ Warning: {retrain_date} not in dataset, skipping year {year}")
            continue
        
        if verbose:
            print(f"\n{'='*70}")
            print(f"YEAR {year}: RETRAINING LOGISTIC REGRESSION")
            print(f"{'='*70}")
        
        # Define training window for logistic regression
        train_end = pd.to_datetime(f'{year-1}-12-31')
        train_start = pd.to_datetime(f'{year - logistic_train_years}-01-31')
        
        if verbose:
            print(f"Logistic training period: {train_start.strftime('%Y-%m-%d')} to "
                  f"{train_end.strftime('%Y-%m-%d')}")
        
        # Train logistic regression
        log_reg, scaler = train_logistic_regression(
            df, train_start, train_end, features=logistic_features
        )
        
        # Select stocks for this year
        current_permnos = select_stocks_with_logistic(
            df, retrain_date, log_reg, scaler,
            features=logistic_features,
            method=stock_selection_method,
            n_stocks=n_stocks
        )
        
        if len(current_permnos) == 0:
            print(f"  ⚠ No stocks selected for {year}, skipping")
            continue
        
        # Determine date range for this year
        year_start_date = retrain_date
        
        # Determine end date for this year's strategy
        if year == end_year:
            # Last year: use test_end_date
            year_end_date = test_end_dt
        else:
            # Use December of current year
            year_end_date = pd.to_datetime(f'{year}-12-31')
            if year_end_date not in all_dates:
                # Find last available date in this year
                year_dates = [d for d in all_dates if d.year == year]
                year_end_date = max(year_dates) if year_dates else year_start_date
        
        # For the first year, respect test_start_date
        if year == start_year and test_start_dt > year_start_date:
            year_start_date = test_start_dt
        
        try:
            year_start_idx = all_dates.index(year_start_date)
            year_end_idx = all_dates.index(year_end_date)
        except ValueError as e:
            print(f"  ⚠ Date error: {e}")
            continue
        
        if verbose:
            print(f"\nRunning monthly rebalancing from {year_start_date.strftime('%Y-%m-%d')} "
                  f"to {year_end_date.strftime('%Y-%m-%d')}")
            print(f"{'='*70}")
        
        # Monthly loop for this year
        for t in range(year_start_idx, year_end_idx + 1):
            current_date = all_dates[t]
            
            if t < lookback_window:
                if verbose:
                    print(f"\n[{current_date.strftime('%Y-%m-%d')}] "
                          f"Skipping: insufficient lookback")
                continue
            
            # Define training window
            window_start_date = all_dates[t - lookback_window]
            window_end_date = all_dates[t - 1]
            
            # Filter data to selected stocks only
            train_data = df[
                (df['datadate'] >= window_start_date) & 
                (df['datadate'] <= window_end_date) &
                (df['permno'].isin(current_permnos))
            ]
            
            # Pivot returns
            returns_pivot = train_data.pivot(
                index='datadate', columns='permno', values='ret_fwd_1'
            )
            
            # Reindex to ensure all dates
            window_dates = all_dates[t - lookback_window : t]
            returns_pivot = returns_pivot.reindex(index=window_dates)

            # Align factors with return realization dates
            factor_dates = [(d + pd.DateOffset(months=1) + pd.offsets.MonthEnd(0)) for d in window_dates]
            
            try:
                factors_window = factors_df.loc[factor_dates]
            except KeyError as e:
                raise ValueError(f"Factor dates not found in factors file. Missing dates: {e}")
            
            if factors_window.isna().any().any():
                if verbose:
                    print(f"\n[{t - test_start_idx + 1}/{test_end_idx - test_start_idx + 1}] "
                          f"Date: {current_date.strftime('%Y-%m-%d')}")
                    print(f"  ⚠ Missing factor data in window, skipping period")
                continue
            
            # Filter assets with any NaNs
            nan_assets = returns_pivot.columns[returns_pivot.isna().any()]
            filtered_pivot = returns_pivot.drop(columns=nan_assets)
            
            current_assets = filtered_pivot.columns.tolist()
            Y = filtered_pivot.values
            factors = factors_window.values
            n_train, p_current = Y.shape
            
            if verbose:
                month_num = t - year_start_idx + 1
                print(f"\n[Month {month_num}] {current_date.strftime('%Y-%m-%d')}")
                print(f"  Assets: {p_current}/{len(current_permnos)} with complete data")
            
            # Check validity
            if n_train < lookback_window or p_current < 2:
                if verbose:
                    print(f"  ⚠ Insufficient data, using previous weights")
                
                # Use previous weights for all portfolio types
                new_weights_dict = {ptype: results_storage[ptype]['prev_weights_dict'].copy() 
                                   for ptype in portfolio_types}
            else:
                try:
                    # Demean
                    Y_bar = Y.mean(axis=0)
                    Y_star = Y - Y_bar
                    
                    # Run nodewise regression
                    Theta_hat = est_ndwcov_factor(Y, factors, ic=ic, lambda_min=True)
                    
                    # Compute weights for each portfolio type
                    new_weights_dict = {}
                    
                    if 'gmv' in portfolio_types:
                        w_gmv = gmv_weights(Theta_hat)
                        new_weights_dict['gmv'] = {
                            asset: w_gmv[i] for i, asset in enumerate(current_assets)
                        }
                    
                    if 'mv' in portfolio_types or 'msr' in portfolio_types:
                        # Need expected returns (use sample mean from training window)
                        mu = Y_bar
                        
                        if 'mv' in portfolio_types:
                            w_mv = mv_weights(Theta_hat, mu, target_return=mv_target_return)
                            new_weights_dict['mv'] = {
                                asset: w_mv[i] for i, asset in enumerate(current_assets)
                            }
                        
                        if 'msr' in portfolio_types:
                            w_msr = msr_weights(Theta_hat, mu)
                            new_weights_dict['msr'] = {
                                asset: w_msr[i] for i, asset in enumerate(current_assets)
                            }
                    
                    if verbose:
                        print(f"  ✓ Nodewise completed for {', '.join(portfolio_types)}")
                    
                except Exception as e:
                    if verbose:
                        print(f"  ✗ Error: {e}")
                    new_weights_dict = {ptype: results_storage[ptype]['prev_weights_dict'].copy() 
                                       for ptype in portfolio_types}
            
            # Process each portfolio type
            for ptype in portfolio_types:
                # Normalize weights
                weights = new_weights_dict[ptype]
                weight_sum = sum(weights.values())
                if weight_sum > 1e-10:
                    weights = {k: v/weight_sum for k, v in weights.items()}
                else:
                    weights = results_storage[ptype]['prev_weights_dict'].copy()
            
            # Get OOS returns (common for all portfolio types)
            oos_data = df[df['datadate'] == current_date]
            oos_returns_series = oos_data.set_index('permno')['ret_fwd_1'].dropna()
            oos_returns_dict = oos_returns_series.to_dict()
            
            # Process each portfolio type
            for ptype in portfolio_types:
                # Get weights for this portfolio
                weights = new_weights_dict[ptype]
                prev_weights = results_storage[ptype]['prev_weights_dict']
                prev_oos_returns = results_storage[ptype]['prev_oos_returns_dict']
                prev_gross_ret = results_storage[ptype]['prev_gross_return']
                
                # Common assets
                common_assets = set(weights.keys()) & set(oos_returns_dict.keys())
                
                if len(common_assets) == 0:
                    continue
                
                # Filter and renormalize
                common_weights = {a: weights[a] for a in common_assets}
                common_weight_sum = sum(common_weights.values())
                if common_weight_sum > 1e-10:
                    common_weights = {k: v/common_weight_sum for k, v in common_weights.items()}
                else:
                    continue
                
                # Gross return
                gross_return = sum(
                    common_weights[a] * oos_returns_dict[a] for a in common_assets
                )
                
                if np.isnan(gross_return) or np.isinf(gross_return):
                    continue
                
                # Transaction costs
                if len(prev_weights) > 0:
                    adjusted_prev = {}
                    for asset, prev_w in prev_weights.items():
                        if asset in prev_oos_returns:
                            prev_r = prev_oos_returns[asset]
                            if abs(1 + prev_gross_ret) > 1e-6:
                                adjusted_prev[asset] = prev_w * (1 + prev_r) / (1 + prev_gross_ret)
                            else:
                                adjusted_prev[asset] = 0.0
                        else:
                            if abs(1 + prev_gross_ret) > 1e-6:
                                adjusted_prev[asset] = prev_w / (1 + prev_gross_ret)
                            else:
                                adjusted_prev[asset] = 0.0
                    
                    all_assets = set(adjusted_prev.keys()) | set(common_weights.keys())
                    turnover = sum(
                        abs(common_weights.get(a, 0.0) - adjusted_prev.get(a, 0.0))
                        for a in all_assets
                    )
                    tc = transaction_cost * (1 + gross_return) * turnover
                else:
                    turnover = sum(abs(w) for w in common_weights.values())
                    tc = transaction_cost * (1 + gross_return) * turnover
                
                net_return = gross_return - tc
                
                # Store results for this portfolio
                results_storage[ptype]['returns'].append(net_return)
                results_storage[ptype]['dates'].append(current_date)
                results_storage[ptype]['weights_list'].append(common_weights.copy())
                results_storage[ptype]['turnover_list'].append(turnover)
                results_storage[ptype]['gross_returns'].append(gross_return)
                
                # Update state
                results_storage[ptype]['prev_weights_dict'] = common_weights.copy()
                results_storage[ptype]['prev_oos_returns_dict'] = {a: oos_returns_dict[a] for a in common_assets}
                results_storage[ptype]['prev_gross_return'] = gross_return
            
            if verbose:
                # Print summary for all portfolios
                print(f"  Portfolio Returns:")
                for ptype in portfolio_types:
                    if len(results_storage[ptype]['returns']) > 0:
                        last_idx = len(results_storage[ptype]['returns']) - 1
                        gross_ret = results_storage[ptype]['gross_returns'][last_idx]
                        net_ret = results_storage[ptype]['returns'][last_idx]
                        to = results_storage[ptype]['turnover_list'][last_idx]
                        tc = gross_ret - net_ret
                        print(f"    {ptype.upper()}: Gross={gross_ret:>7.4f} | TO={to:>5.3f} | "
                              f"TC={tc:>7.5f} | Net={net_ret:>7.4f}")
    
    if verbose:
        print("\n" + "="*70)
        print("BACKTEST COMPLETE")
        print("="*70)
    
    # Compile results for each portfolio type
    results_dict = {}
    
    for ptype in portfolio_types:
        portfolio_returns = results_storage[ptype]['returns']
        portfolio_dates = results_storage[ptype]['dates']
        portfolio_turnover_list = results_storage[ptype]['turnover_list']
        portfolio_gross_returns = results_storage[ptype]['gross_returns']
        
        if len(portfolio_returns) == 0:
            results_dict[ptype] = {
                'results_df': pd.DataFrame(),
                'metrics': {}
            }
            continue
        
        results_df = pd.DataFrame({
            'date': portfolio_dates,
            'portfolio_return': portfolio_returns,
            'portfolio_gross_return': portfolio_gross_returns,
            'portfolio_turnover': portfolio_turnover_list
        })
        results_df['cumulative_return'] = (1 + results_df['portfolio_return']).cumprod() - 1
        
        # Overall metrics
        mean_return = np.mean(portfolio_returns)
        variance = np.var(portfolio_returns, ddof=1)
        sharpe_ratio = mean_return / np.sqrt(variance) if variance > 0 else 0
        
        annual_return = mean_return * 12
        annual_volatility = np.sqrt(variance * 12)
        annual_sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0
        
        overall_metrics = {
            'mean_return': mean_return,
            'variance': variance,
            'sharpe_ratio': sharpe_ratio,
            'annual_return': annual_return,
            'annual_volatility': annual_volatility,
            'annual_sharpe_ratio': annual_sharpe,
            'total_return': results_df['cumulative_return'].iloc[-1],
            'avg_turnover': np.mean(portfolio_turnover_list),
            'n_periods': len(portfolio_returns)
        }
        
        results_dict[ptype] = {
            'results_df': results_df,
            'metrics': overall_metrics
        }
    
    return results_dict

In [3]:
df = pd.read_csv('../green cleaned.csv', dtype={'ncusip': 'string'})
df['ret_fwd_1'] = (df.groupby('permno')['ret_excess'].shift(-1) )

In [6]:
results = integrated_backtest(
    df,
    factors_path='../AI Portfolio Selection/factors_ff_monthly_raw.csv',
    ic='GIC',  # or 'BIC', 'WIC', 'AIC', 'cv'
    test_start_date='2020-01-31',
    test_end_date='2024-04-30',
    logistic_train_years=15,
    stock_selection_method='top_and_bottom',
    n_stocks=50,
    lookback_window=180,
    transaction_cost=0.001,
    verbose=True
)

# Access results for each portfolio
gmv_results = results['gmv']['results_df']
gmv_metrics = results['gmv']['metrics']

mv_results = results['mv']['results_df']
mv_metrics = results['mv']['metrics']

msr_results = results['msr']['results_df']
msr_metrics = results['msr']['metrics']

# Compare Sharpe ratios
print(f"GMV Sharpe: {gmv_metrics['annual_sharpe_ratio']:.4f}")
print(f"MV Sharpe:  {mv_metrics['annual_sharpe_ratio']:.4f}")
print(f"MSR Sharpe: {msr_metrics['annual_sharpe_ratio']:.4f}")

INTEGRATED BACKTEST: LOGISTIC REGRESSION + NODEWISE
Test Period: 2020-01-31 to 2024-04-30

YEAR 2020: RETRAINING LOGISTIC REGRESSION
Logistic training period: 2005-01-31 to 2019-12-31
  Training samples: 89793
  Training accuracy: 0.5590
  Stocks evaluated: 497
  Stocks selected: 100
  Buy probability range: [0.5322, 0.5826]

Running monthly rebalancing from 2020-01-31 to 2020-12-31

[Month 1] 2020-01-31
  Assets: 64/100 with complete data
  ✓ Nodewise completed for gmv, mv, msr
  Portfolio Returns:
    GMV: Gross=-0.0861 | TO=1.910 | TC=0.00175 | Net=-0.0878
    MV: Gross=-0.0803 | TO=1.929 | TC=0.00177 | Net=-0.0821
    MSR: Gross=-0.0399 | TO=3.411 | TC=0.00327 | Net=-0.0432

[Month 2] 2020-02-29
  Assets: 64/100 with complete data
  ✓ Nodewise completed for gmv, mv, msr
  Portfolio Returns:
    GMV: Gross=-0.0590 | TO=0.182 | TC=0.00017 | Net=-0.0592
    MV: Gross=-0.0373 | TO=0.229 | TC=0.00022 | Net=-0.0375
    MSR: Gross= 0.1106 | TO=0.790 | TC=0.00088 | Net= 0.1097

[Month 3] 2

In [8]:
print(f"\n GMV")
print(f"Annualized Sharpe Ratio: {gmv_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {gmv_metrics['mean_return']*12:.4f}")
print(f"Variance: {gmv_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {gmv_metrics['avg_turnover']:.4f}")

print(f"\n MV")
print(f"Annualized Sharpe Ratio: {mv_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {mv_metrics['mean_return']*12:.4f}")
print(f"Variance: {mv_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {mv_metrics['avg_turnover']:.4f}")

print(f"\n MSR")
print(f"Annualized Sharpe Ratio: {msr_metrics['annual_sharpe_ratio']:.4f}")
print(f"Mean Return: {msr_metrics['mean_return']*12:.4f}")
print(f"Variance: {msr_metrics['variance']*12:.4f}")
print(f"Avg Turnover: {msr_metrics['avg_turnover']:.4f}")


 GMV
Annualized Sharpe Ratio: 0.4170
Mean Return: 0.0583
Variance: 0.0195
Avg Turnover: 0.3606

 MV
Annualized Sharpe Ratio: 0.4646
Mean Return: 0.0646
Variance: 0.0194
Avg Turnover: 0.3696

 MSR
Annualized Sharpe Ratio: 0.4156
Mean Return: 0.1048
Variance: 0.0637
Avg Turnover: 0.9097
