<h2>Back-test MAC3-style Regression vs. ICA</h2>

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import FastICA
from scipy.stats import pearsonr
import warnings

In [35]:
# Step 1
warnings.filterwarnings("ignore")

# Read the file (adjust the path if needed)
df = pd.read_csv(
    "book1 cleaned.csv",
    parse_dates=["Date"],
    na_values=["#N/A Invalid Security", "N/A", "NA", ""]
)
df.set_index("Date", inplace=True)

# keep cols with >=90% real data (tune if needed)
df = df.dropna(axis=1, thresh=int(0.9 * len(df)))

# forward-fill the rest and drop any remaining NaNs
df = df.ffill().dropna(how="any")

# compute daily returns
returns = df.pct_change().replace([np.inf, -np.inf], np.nan).dropna(how="any")

print(f"Retained {returns.shape[1]} columns.")

Retained 4 columns.


In [37]:
# Step 2 Set up parameters & containers

# 30-day rolling window for volatility forecasts
WINDOW = 30

# Get the list of columns once
all_cols = returns.columns.tolist()

In [39]:
Z_full = (returns - returns.mean()) / returns.std(ddof=0)
ica    = FastICA(n_components=2, random_state=0, max_iter=500, tol=1e-3)
Z_reconstruct = pd.DataFrame(
        ica.fit_transform(Z_full) @ ica.mixing_.T,
        index=returns.index, columns=returns.columns)

In [41]:
# Step 3 Loop over each target asset
# treat every column once as “the asset we’re forecasting” and the rest as regressors.
from tqdm import tqdm   # pip install tqdm

# Prepare result holders
bias_records = []
q_records   = []

# ---------- pre-fit ICA once (unchanged) ----------
Z_full = (returns - returns.mean()) / returns.std(ddof=0)
ica    = FastICA(n_components=2, random_state=0, max_iter=500, tol=1e-3)
Z_reconstruct = pd.DataFrame(
        ica.fit_transform(Z_full) @ ica.mixing_.T,
        index=returns.index, columns=returns.columns)

# ---------- rolling back-test ----------------------
bias_records = []
q_records   = []

for asset in tqdm(all_cols[:10], desc="Assets"):
    factor_cols = [c for c in all_cols if c != asset]
    if len(factor_cols) < 2:
        continue

    y = returns[asset]
    X = returns[factor_cols]

    fcst_var_mac3 = []
    fcst_var_ica  = []
    real_var      = []

    for i in range(WINDOW, len(returns)):
        # integer slice, so use .iloc everywhere
        X_win = X.iloc[i-WINDOW:i]
        y_win = y.iloc[i-WINDOW:i]

        # ---- MAC3 regression
        reg = LinearRegression().fit(X_win, y_win)
        fcst_var_mac3.append(np.var(reg.predict(X_win)))

        # ---- ICA variance (already reconstructed)
        ica_slice = Z_reconstruct.iloc[i-WINDOW:i, :]
        fcst_var_ica.append(np.var(ica_slice[asset]))

        # ---- realised variance
        real_var.append(np.var(y_win))

    # convert to arrays
    fcst_var_mac3 = np.array(fcst_var_mac3, dtype=float)
    fcst_var_ica  = np.array(fcst_var_ica,  dtype=float)
    real_var      = np.array(real_var,      dtype=float)

    # masks (exclude NaNs)
    mask_mac = np.isfinite(fcst_var_mac3) & np.isfinite(real_var)
    mask_ica = np.isfinite(fcst_var_ica)  & np.isfinite(real_var)

    # ---- Bias
    if mask_mac.any():
        bias_records.append({
            "Asset": asset, "Model": "MAC3",
            "Bias": np.mean(np.sqrt(fcst_var_mac3[mask_mac]) /
                            np.sqrt(real_var[mask_mac]) - 1)
        })
    if mask_ica.any():
        bias_records.append({
            "Asset": asset, "Model": "ICA",
            "Bias": np.mean(np.sqrt(fcst_var_ica[mask_ica]) /
                            np.sqrt(real_var[mask_ica]) - 1)
        })

    # ---- Q-statistic (needs ≥4 points)
    if mask_mac.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "MAC3",
            "Q": pearsonr(fcst_var_mac3[mask_mac], real_var[mask_mac])[0]
        })
    if mask_ica.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "ICA",
            "Q": pearsonr(fcst_var_ica[mask_ica], real_var[mask_ica])[0]
        })

# ---------- display results ----------
bias_df = pd.DataFrame(bias_records)
q_df    = pd.DataFrame(q_records)

print("\nBias (30-day window)")
print(bias_df.pivot(index="Asset", columns="Model", values="Bias").round(4))

print("\nQ-statistic (30-day window)")
print(q_df.pivot(index="Asset", columns="Model", values="Q").round(4))


Assets: 100%|████████████████████████████████████████████████████████████████████████████| 4/4 [02:02<00:00, 30.51s/it]


Bias (30-day window)
Model                ICA    MAC3
Asset                           
LUACTRUU INDEX  272.5520 -0.1257
LUMSTRUU INDEX  489.1575 -0.1267
RMS G INDEX          inf     NaN
SPGCCITR INDEX   40.5462 -0.6439

Q-statistic (30-day window)
Model              ICA    MAC3
Asset                         
LUACTRUU INDEX  0.7691  0.8734
LUMSTRUU INDEX  0.9304  0.9678
RMS G INDEX     0.9982  0.8184
SPGCCITR INDEX  0.0526  0.7359





<h2>Test >= 60% non-NaNs</h2>

In [17]:
# Step 1
warnings.filterwarnings("ignore")

# Read the file (adjust the path if needed)
df = pd.read_csv(
    "book1 cleaned.csv",
    parse_dates=["Date"],
    na_values=["#N/A Invalid Security", "N/A", "NA", ""]
)
df.set_index("Date", inplace=True)

# keep cols with >=60% real data (tune if needed)
df = df.dropna(axis=1, thresh=int(0.6 * len(df)))

# forward-fill the rest and drop any remaining NaNs
df = df.ffill().dropna(how="any")

# compute daily returns
returns = df.pct_change().replace([np.inf, -np.inf], np.nan).dropna(how="any")

print(f"Retained {returns.shape[1]} columns.")

Retained 19 columns.


In [13]:
# Step 2 Set up parameters & containers

# 30-day rolling window for volatility forecasts
WINDOW = 30

# Get the list of columns once
all_cols = returns.columns.tolist()

In [27]:
Z_full = (returns - returns.mean()) / returns.std(ddof=0)
ica    = FastICA(n_components=2, random_state=0, max_iter=500, tol=1e-3)
Z_reconstruct = pd.DataFrame(
        ica.fit_transform(Z_full) @ ica.mixing_.T,
        index=returns.index, columns=returns.columns)

In [31]:
# Step 3 Loop over each target asset
# treat every column once as “the asset we’re forecasting” and the rest as regressors.
from tqdm import tqdm   # pip install tqdm

# Prepare result holders
bias_records = []
q_records   = []

# ---------- pre-fit ICA once (unchanged) ----------
Z_full = (returns - returns.mean()) / returns.std(ddof=0)
ica    = FastICA(n_components=2, random_state=0, max_iter=500, tol=1e-3)
Z_reconstruct = pd.DataFrame(
        ica.fit_transform(Z_full) @ ica.mixing_.T,
        index=returns.index, columns=returns.columns)

# ---------- rolling back-test ----------------------
bias_records = []
q_records   = []

for asset in tqdm(all_cols[:10], desc="Assets"):
    factor_cols = [c for c in all_cols if c != asset]
    if len(factor_cols) < 2:
        continue

    y = returns[asset]
    X = returns[factor_cols]

    fcst_var_mac3 = []
    fcst_var_ica  = []
    real_var      = []

    for i in range(WINDOW, len(returns)):
        # integer slice, so use .iloc everywhere
        X_win = X.iloc[i-WINDOW:i]
        y_win = y.iloc[i-WINDOW:i]

        # ---- MAC3 regression
        reg = LinearRegression().fit(X_win, y_win)
        fcst_var_mac3.append(np.var(reg.predict(X_win)))

        # ---- ICA variance (already reconstructed)
        ica_slice = Z_reconstruct.iloc[i-WINDOW:i, :]
        fcst_var_ica.append(np.var(ica_slice[asset]))

        # ---- realised variance
        real_var.append(np.var(y_win))

    # convert to arrays
    fcst_var_mac3 = np.array(fcst_var_mac3, dtype=float)
    fcst_var_ica  = np.array(fcst_var_ica,  dtype=float)
    real_var      = np.array(real_var,      dtype=float)

    # masks (exclude NaNs)
    mask_mac = np.isfinite(fcst_var_mac3) & np.isfinite(real_var)
    mask_ica = np.isfinite(fcst_var_ica)  & np.isfinite(real_var)

    # ---- Bias
    if mask_mac.any():
        bias_records.append({
            "Asset": asset, "Model": "MAC3",
            "Bias": np.mean(np.sqrt(fcst_var_mac3[mask_mac]) /
                            np.sqrt(real_var[mask_mac]) - 1)
        })
    if mask_ica.any():
        bias_records.append({
            "Asset": asset, "Model": "ICA",
            "Bias": np.mean(np.sqrt(fcst_var_ica[mask_ica]) /
                            np.sqrt(real_var[mask_ica]) - 1)
        })

    # ---- Q-statistic (needs ≥4 points)
    if mask_mac.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "MAC3",
            "Q": pearsonr(fcst_var_mac3[mask_mac], real_var[mask_mac])[0]
        })
    if mask_ica.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "ICA",
            "Q": pearsonr(fcst_var_ica[mask_ica], real_var[mask_ica])[0]
        })

# ---------- display results ----------
bias_df = pd.DataFrame(bias_records)
q_df    = pd.DataFrame(q_records)

print("\nBias (30-day window)")
print(bias_df.pivot(index="Asset", columns="Model", values="Bias").round(4))

print("\nQ-statistic (30-day window)")
print(q_df.pivot(index="Asset", columns="Model", values="Q").round(4))


Assets: 100%|██████████████████████████████████████████████████████████████████████████| 10/10 [05:36<00:00, 33.68s/it]


Bias (30-day window)
Model                ICA    MAC3
Asset                           
EFA US EQUITY        inf     NaN
GDDUUS INDEX     70.8588 -0.0243
LD12TRUU INDEX       inf     NaN
LF98TRUU INDEX  228.5931 -0.0871
LQD US EQUITY        inf     NaN
LUABTRUU INDEX       inf     NaN
LUACTRUU INDEX  243.4206 -0.0347
NDDUEAFE INDEX   56.6333 -0.0468
NDUEEGF INDEX        inf     NaN
SPGCCITR INDEX   19.4144 -0.2617

Q-statistic (30-day window)
Model              ICA    MAC3
Asset                         
EFA US EQUITY   0.9455  0.9999
GDDUUS INDEX    0.9273  0.9991
LD12TRUU INDEX -0.1752  0.9998
LF98TRUU INDEX  0.4852  0.9976
LQD US EQUITY   0.5998  0.9985
LUABTRUU INDEX  0.4015  0.9892
LUACTRUU INDEX  0.7895  0.9966
NDDUEAFE INDEX  0.7553  0.9985
NDUEEGF INDEX   0.8652  0.9978
SPGCCITR INDEX  0.6968  0.9411





In [None]:
'''
Interpretations: 
| Setting         | Kept columns | ICA Bias                          | MAC3 Bias                         | ICA Q                 | MAC3 Q                 | Main take-away                                                                                            |
| --------------- | ------------ | --------------------------------- | --------------------------------- | --------------------- | ---------------------- | --------------------------------------------------------------------------------------------------------- |
| **≥ 90 % data** | 4 assets     | extremely large (40–490) or **∞** | small & negative (-0.13 to -0.64) | moderate (0.05–0.93)  | decent (0.74–0.97)     | small universe ⇒ ICA has near-zero variance in many 30-day windows, exploding the bias; MAC3 still usable |
| **≥ 60 % data** | 10 assets    | many **∞** or 20–240              | tiny (-0.03 to -0.26)             | mixed (-0.18 to 0.95) | > 0.97 for 8/10 assets | richer universe ⇒ MAC3 forecasts excellent; ICA still unstable (∞ and one negative Q)                     |

1.Bias (ideal ≈ 0)
    - MAC3: consistently close to zero → forecasts are almost unbiased.
    - ICA: huge positive or inf → the reconstructed ICA factor series often has near-zero variance in a 30-day window. Dividing by √σ (realised) shoots the ratio to infinity.
2.Q-statistic (ideal → 1)
    - MAC3: 0.74–1.00 → forecasted variance co-moves strongly with realised variance.
    - ICA: ranges from -0.18 to 0.99 → erratic tracking; negative Q on LD12TRUU INDEX shows anticorrelation.

ICA back-test issue: 
    1. Pre-fitting ICA (due to missing values) once on the full sample fixes the mixing matrix. In short rolling windows many assets become almost flat after standardisation ⇒ ICA’s reconstructed signal has ~0 variance.
    2. Only two ICs for 10+ macro/bond series means most short-run variance is pushed into residuals; if residual variance is close to zero, bias goes to ∞.
MACs is robust.
'''

<h2>Tweaks to the pre-fit approach</h2>

In [43]:
# Step 1
warnings.filterwarnings("ignore")

# Read the file (adjust the path if needed)
df = pd.read_csv(
    "book1 cleaned.csv",
    parse_dates=["Date"],
    na_values=["#N/A Invalid Security", "N/A", "NA", ""]
)
df.set_index("Date", inplace=True)

# keep cols with >=60% real data (tune if needed)
df = df.dropna(axis=1, thresh=int(0.6 * len(df)))

# forward-fill the rest and drop any remaining NaNs
df = df.ffill().dropna(how="any")

# compute daily returns
returns = df.pct_change().replace([np.inf, -np.inf], np.nan).dropna(how="any")

print(f"Retained {returns.shape[1]} columns.")

Retained 19 columns.


In [45]:
# Step 2 Set up parameters & containers

# 30-day rolling window for volatility forecasts
WINDOW = 30

# Get the list of columns once
all_cols = returns.columns.tolist()

In [47]:
# single global ICA fit (now with up to 4 comps)
Z_full  = (returns - returns.mean()) / returns.std(ddof=0)
ncomp   = min(4, Z_full.shape[1]-1)          # <-- more components
ica     = FastICA(n_components=ncomp,
                  random_state=0, max_iter=500, tol=1e-3)
S_full  = ica.fit_transform(Z_full)
A_full  = ica.mixing_

Z_reconstruct = pd.DataFrame(S_full @ A_full.T,
                             index=returns.index,
                             columns=returns.columns)

In [59]:
# ------------------------------------------------------------------
# ROLLING BACK-TEST : MAC3-style regression  vs  ICA (4 components)
# ------------------------------------------------------------------
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

bias_records = []
q_records    = []

for asset in tqdm(all_cols[:10], desc="Assets"):      # limit to 10 while testing
    factor_cols = [c for c in all_cols if c != asset]
    if len(factor_cols) < 2:
        continue

    y = returns[asset]
    X = returns[factor_cols]

    asset_idx = Z_reconstruct.columns.get_loc(asset)  # integer col position

    fcst_var_mac3 = []
    fcst_var_ica  = []
    real_var      = []

    # 30-day rolling window
    for i in range(WINDOW, len(returns)):
        idx = slice(i - WINDOW, i)

        # ----- 1) MAC3 regression forecast variance -----
        X_win = X.iloc[idx]
        y_win = y.iloc[idx]
        reg   = LinearRegression().fit(X_win, y_win)
        y_hat = reg.predict(X_win)                     # factor component
        fcst_var_mac3.append(np.var(y_hat))

        # ----- 2) ICA forecast variance (rescaled) -----
        slice_recon  = Z_reconstruct.iloc[idx, asset_idx]
        scale        = returns[asset].iloc[idx].std(ddof=0)  # window-std of asset
        recon_window = slice_recon * scale
        var_hat      = max(np.var(recon_window), 1e-8)       # floor to avoid 0
        fcst_var_ica.append(var_hat)

        # ----- 3) realised variance (actual) -----------
        real_var.append(np.var(y_win))

    # ----------- convert to arrays ---------------------
    real_var      = np.maximum(real_var, 1e-8)            # floor to avoid 0
    fcst_var_mac3 = np.array(fcst_var_mac3, dtype=float)
    fcst_var_ica  = np.array(fcst_var_ica,  dtype=float)
    real_var      = np.array(real_var,      dtype=float)

    # ----------- masks to drop NaNs --------------------
    mask_mac = np.isfinite(fcst_var_mac3) & np.isfinite(real_var)
    mask_ica = np.isfinite(fcst_var_ica ) & np.isfinite(real_var)

    # ---------------- Bias -----------------------------
    if mask_mac.any():
        bias_records.append({
            "Asset": asset, "Model": "MAC3",
            "Bias": np.mean(np.sqrt(fcst_var_mac3[mask_mac]) /
                            np.sqrt(real_var[mask_mac]) - 1)
        })
    if mask_ica.any():
        bias_records.append({
            "Asset": asset, "Model": "ICA",
            "Bias": np.mean(np.sqrt(fcst_var_ica[mask_ica]) /
                            np.sqrt(real_var[mask_ica]) - 1)
        })

    # ---------------- Q-statistic ----------------------
    if mask_mac.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "MAC3",
            "Q": pearsonr(fcst_var_mac3[mask_mac], real_var[mask_mac])[0]
        })
    if mask_ica.sum() > 3:
        q_records.append({
            "Asset": asset, "Model": "ICA",
            "Q": pearsonr(fcst_var_ica[mask_ica], real_var[mask_ica])[0]
        })

# ----------------- Results tables --------------------
bias_df = pd.DataFrame(bias_records)
q_df    = pd.DataFrame(q_records)

print("\nBias (30-day window)")
print(bias_df.pivot(index="Asset", columns="Model", values="Bias").round(4))

print("\nQ-statistic (30-day window)")
print(q_df.pivot(index="Asset", columns="Model", values="Q").round(4))

Assets: 100%|██████████████████████████████████████████████████████████████████████████| 10/10 [06:12<00:00, 37.24s/it]


Bias (30-day window)
Model              ICA    MAC3
Asset                         
EFA US EQUITY  -0.1025 -0.3488
GDDUUS INDEX   -0.3275 -0.0258
LD12TRUU INDEX  0.0817 -0.4978
LF98TRUU INDEX -0.3655 -0.0886
LQD US EQUITY  -0.1323 -0.3812
LUABTRUU INDEX -0.3349 -0.0915
LUACTRUU INDEX -0.2369 -0.0347
NDDUEAFE INDEX -0.3158 -0.0480
NDUEEGF INDEX  -0.1772 -0.3254
SPGCCITR INDEX -0.5586 -0.2617

Q-statistic (30-day window)
Model              ICA    MAC3
Asset                         
EFA US EQUITY   0.9223  0.9999
GDDUUS INDEX    0.8905  0.9991
LD12TRUU INDEX  0.9289  0.9999
LF98TRUU INDEX  0.8471  0.9976
LQD US EQUITY   0.8883  0.9985
LUABTRUU INDEX  0.9537  0.9892
LUACTRUU INDEX  0.8872  0.9966
NDDUEAFE INDEX  0.7763  0.9985
NDUEEGF INDEX   0.8724  0.9978
SPGCCITR INDEX  0.6421  0.9411



