In [31]:
# Depression Predictor, Validation test to identify optimal hyper-parameters (Gamma, Dim, & Lambda)
# Data sourced from https://www.globalmacrodata.com/
# train_validate_depression_rff.py
# Train on TestTrainML2.csv, validate on TestValidation2.csv.
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2 / ridge-like)
# Target: binary column "Depression"
# Outputs: AUC, Brier score, and predicted probabilities for the validation rows.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

# ---------- Config ----------
# Input files: training split and the held-out split that is scored below.
TRAIN_CSV = "TestTrainML2.csv"
VAL_CSV   = "TestValidation2.csv"

# Hyper-parameters being evaluated in this run.
RFF_GAMMA = 0.15     # RBF kernel coefficient gamma in exp(-gamma * ||x - y||^2); larger = narrower kernel
RFF_DIM   = 4000     # number of random Fourier features
LOGIT_C   = 20.0     # inverse of L2 strength (larger C = weaker penalty)
SEED      = 42       # random_state passed to the RFF sampler and the classifier

LABEL_COL = "Depression"  # binary outcome

# Model inputs, grouped by theme ("Cons/GDP" is deliberately not in the list).
FEATURES = [
    # Circulation
    "M0/GDP","M1/GDP","M0/M1",
    # Creation
    "∆ Debt/GDP","Debt Growth","Invest/GDP",
    # Valuation
    "REER","Inflation Rate","ST Interest","HPI","Curve",
    # Efficiency
    "UR","CreditSprd","Nom GDPg"
]

# Output CSV for validation predictions.
# Named after the scored input (VAL_CSV) so this run does not overwrite the
# "ValidationML_with_probs.csv" written by the other validation cells.
OUT_CSV = "TestValidation2_with_probs.csv"

# ---------- Load ----------
# Read both splits and keep complete cases only: a row is used when every
# feature and the label are present.
required_cols = FEATURES + [LABEL_COL]
train_df = pd.read_csv(TRAIN_CSV).dropna(subset=required_cols).copy()
val_df = pd.read_csv(VAL_CSV).dropna(subset=required_cols).copy()

# Feature matrix and integer label vector per split.
X_tr, y_tr = train_df[FEATURES].values, train_df[LABEL_COL].astype(int).values
X_va, y_va = val_df[FEATURES].values, val_df[LABEL_COL].astype(int).values

print(f"Train rows: {X_tr.shape[0]}")
print(f"Validation rows: {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Every transformer is fitted on the training split only; the fitted objects
# (scaler, rff, clf) are reused unchanged when scoring other rows.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier feature map approximating an RBF kernel on the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Validate (AUC + Brier on PROBABILITIES) ----------
# Score the validation rows with the transforms fitted on the training split;
# nothing is re-fitted here.
Zva   = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
p_va  = clf.predict_proba(Zva_r)[:, 1]  # probability Depression = 1

auc   = roc_auc_score(y_va, p_va)
brier = brier_score_loss(y_va, p_va)

# Report the file that was actually scored (VAL_CSV) rather than a hard-coded
# name, so the banner cannot drift from the configuration.
print(f"\n=== Validation performance ({VAL_CSV}) ===")
print(f"AUC   : {auc:.6f}")
print(f"Brier : {brier:.6f}")

# ---------- Save validation probabilities ----------
# Write the validation rows plus one extra column holding the predicted
# probability; val_df itself is left untouched.
val_out = val_df.assign(Depression_prob=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(
    f"\nSaved validation probabilities to '{OUT_CSV}'.",
    "Column added: 'Depression_prob' (predicted probability of Depression = 1).",
    sep="\n",
)


Train rows: 610
Validation rows: 396

=== Validation performance (ValidationML.csv) ===
AUC   : 0.758146
Brier : 0.105898

Saved validation probabilities to 'ValidationML_with_probs.csv'.
Column added: 'Depression_prob' (predicted probability of Depression = 1).


In [3]:
# train_predict_depression_rff.py
# Train on Train2ML.csv, validate on PredictML2.csv.
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2 / ridge-like)
# Target: binary column "Depression"
# Outputs: AUC, Brier score, and predicted probabilities for the PredictML2 rows.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

# ---------- Config ----------
TRAIN_CSV, VAL_CSV = "Train2ML.csv", "PredictML2.csv"   # training split / held-out split
OUT_CSV = "CheckML_with_probs.csv"                      # held-out rows plus predicted probability

LABEL_COL = "Depression"  # binary outcome column
SEED = 42                 # random_state passed to the RFF sampler and the classifier

# Hyper-parameters
RFF_GAMMA = 0.2    # RBF kernel coefficient gamma (larger = narrower kernel)
RFF_DIM = 3000     # number of random Fourier features
LOGIT_C = 20.0     # inverse of L2 strength

# Feature columns by theme; FEATURES is the flattened list in this order
# ("Cons/GDP" is deliberately not included).
FEATURE_GROUPS = {
    "Circulation": ["M0/GDP", "M1/GDP", "M0/M1"],
    "Creation": ["∆ Debt/GDP", "Debt Growth", "Invest/GDP"],
    "Valuation": ["REER", "Inflation Rate", "ST Interest", "HPI", "Curve"],
    "Efficiency": ["UR", "CreditSprd", "Nom GDPg"],
}
FEATURES = [column for group in FEATURE_GROUPS.values() for column in group]

# ---------- Load ----------
# Complete-case filter: rows lacking any feature or the label are discarded
# from both splits before the matrices are built.
required_cols = FEATURES + [LABEL_COL]
train_df = pd.read_csv(TRAIN_CSV).dropna(subset=required_cols).copy()
val_df = pd.read_csv(VAL_CSV).dropna(subset=required_cols).copy()

# Training design matrix / labels, then the same for the held-out split.
X_tr, y_tr = train_df[FEATURES].values, train_df[LABEL_COL].astype(int).values
X_va, y_va = val_df[FEATURES].values, val_df[LABEL_COL].astype(int).values

print(f"Train rows: {X_tr.shape[0]}")
print(f"Validation rows: {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Scaling statistics come from the training split only.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier features (RBF kernel approximation) of the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on top of the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Validate (AUC + Brier on PROBABILITIES) ----------
# Score the held-out rows with the scaler / RFF map / classifier fitted on the
# training split; nothing is re-fitted here.
Zva   = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
p_va  = clf.predict_proba(Zva_r)[:, 1]  # probability Depression = 1

auc   = roc_auc_score(y_va, p_va)
brier = brier_score_loss(y_va, p_va)

# Name the file that was actually scored (VAL_CSV) instead of a hard-coded
# "ValidationML.csv", which is not the input of this cell.
print(f"\n=== Validation performance ({VAL_CSV}) ===")
print(f"AUC   : {auc:.6f}")
print(f"Brier : {brier:.6f}")

# ---------- Save validation probabilities ----------
# Held-out rows with the predicted probability appended as a new column.
val_out = val_df.assign(Depression_prob=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(
    f"\nSaved validation probabilities to '{OUT_CSV}'.",
    "Column added: 'Depression_prob' (predicted probability of Depression = 1).",
    sep="\n",
)

Train rows: 1037
Validation rows: 60

=== Validation performance (ValidationML.csv) ===
AUC   : 0.915254
Brier : 0.085230

Saved validation probabilities to 'CheckML_with_probs.csv'.
Column added: 'Depression_prob' (predicted probability of Depression = 1).


In [4]:
# train_predict_depression_rff.py
# Train on Train2ML.csv, score ForecastML3.csv as 2026 probabilities (from 2025 features).
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2)
# Target column: "Depression" (binary) in TRAIN only. We do not evaluate on Forecast.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression

# ---------- Config ----------
TRAIN_CSV = "Train2ML.csv"
VAL_CSV = "ForecastML3.csv"             # rows to score (described by the author as 2025 rows)
OUT_CSV = "ForecastML3_with_probs.csv"  # scored rows plus the probability column

LABEL_COL = "Depression"  # binary outcome in TRAIN
SEED = 42                 # random_state passed to the RFF sampler and the classifier

# Hyper-parameters
RFF_GAMMA = 0.2    # RBF kernel coefficient gamma (larger = narrower kernel)
RFF_DIM = 3000     # number of random Fourier features
LOGIT_C = 20.0     # inverse of L2 strength (larger C = weaker L2)

# Feature columns by theme ("Cons/GDP" is deliberately not included).
CIRCULATION = ["M0/GDP", "M1/GDP", "M0/M1"]
CREATION = ["∆ Debt/GDP", "Debt Growth", "Invest/GDP"]
VALUATION = ["REER", "Inflation Rate", "ST Interest", "HPI", "Curve"]
EFFICIENCY = ["UR", "CreditSprd", "Nom GDPg"]
FEATURES = [*CIRCULATION, *CREATION, *VALUATION, *EFFICIENCY]

# ---------- Load ----------
# TRAIN needs every feature and the label; the forecast file only needs the
# features because it is scored, not evaluated.
train_df = pd.read_csv(TRAIN_CSV).dropna(subset=FEATURES + [LABEL_COL]).copy()
val_df = pd.read_csv(VAL_CSV).dropna(subset=FEATURES).copy()

# Matrices
X_tr, y_tr = train_df[FEATURES].values, train_df[LABEL_COL].astype(int).values
X_va = val_df[FEATURES].values

print(f"Train rows: {X_tr.shape[0]}")
print(f"Forecast rows (2025 features -> 2026 prob): {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Scaling statistics come from the training rows only.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier features (RBF kernel approximation) of the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on top of the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Predict Forecast (as 2026 probability) ----------
# NOTE(review): in this cell the classifier is fitted on the same-row
# "Depression" label. Confirm that the Train2ML.csv features are already lagged
# before reading these scores as next-year (2026) probabilities.
Zva = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
class_probs = clf.predict_proba(Zva_r)
p_va = class_probs[:, 1]  # probability Depression=1 (for 2026)

# ---------- Save ----------
# Forecast rows with the probability appended; val_df itself is not modified.
val_out = val_df.assign(Depression_prob_2026=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(f"\nSaved probabilities to '{OUT_CSV}' (column: Depression_prob_2026).")


Train rows: 1037
Forecast rows (2025 features -> 2026 prob): 25

Saved probabilities to 'ForecastML3_with_probs.csv' (column: Depression_prob_2026).


In [27]:
# Depression Predictor, Validation test to identify optimal hyper-parameters (Gamma, Dim, & Lambda)
# Train on TrainML.csv, validate on ValidationML.csv.
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2 / ridge-like)
# Target: Depression in year t+1 (Depression_next) using features at time t.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

# ---------- Config ----------
TRAIN_CSV, VAL_CSV = "TrainML.csv", "ValidationML.csv"   # training split / validation split
OUT_CSV = "ValidationML_with_probs.csv"                  # validation rows plus predicted probability

LABEL_COL = "Depression_next"  # binary outcome: Depression in year t+1
SEED = 42                      # random_state passed to the RFF sampler and the classifier

# Hyper-parameters
RFF_GAMMA = 0.15   # RBF kernel coefficient gamma (larger = narrower kernel)
RFF_DIM = 3000     # number of random Fourier features
LOGIT_C = 20.0     # inverse of L2 strength

# Feature columns by theme; FEATURES is the flattened list in this order
# ("Cons/GDP" is deliberately not included).
FEATURE_GROUPS = {
    "Circulation": ["M0/GDP", "M1/GDP", "M0/M1"],
    "Creation": ["∆ Debt/GDP", "Debt Growth", "Invest/GDP"],
    "Valuation": ["REER", "Inflation Rate", "ST Interest", "HPI", "Curve"],
    "Efficiency": ["UR", "CreditSprd", "Nom GDPg"],
}
FEATURES = [column for group in FEATURE_GROUPS.values() for column in group]

# ---------- Load ----------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)

# ---------- Build next-year labels (Depression_{t+1}) ----------
# Sort by Country & Year so shift(-1) looks at the following row of the same country.
train_df = train_df.sort_values(["Country", "Year"])
val_df   = val_df.sort_values(["Country", "Year"])

# Year and Depression of the following row within each country.
train_next = train_df.groupby("Country")[["Year", "Depression"]].shift(-1)
val_next   = val_df.groupby("Country")[["Year", "Depression"]].shift(-1)

# The following row is only "year t+1" when its Year is exactly Year + 1.
# If a country has a gap in its years, a plain shift(-1) would label year t
# with a later year's outcome, so those rows get NaN and are dropped below.
# Assumes "Year" is numeric.
train_df[LABEL_COL] = train_next["Depression"].where(train_next["Year"].eq(train_df["Year"] + 1))
val_df[LABEL_COL]   = val_next["Depression"].where(val_next["Year"].eq(val_df["Year"] + 1))

# Drop rows where we don't have full features or next-year label
train_df = train_df.dropna(subset=FEATURES + [LABEL_COL]).copy()
val_df   = val_df.dropna(subset=FEATURES + [LABEL_COL]).copy()

X_tr = train_df[FEATURES].values                 # features at year t
y_tr = train_df[LABEL_COL].astype(int).values    # Depression at year t+1

X_va = val_df[FEATURES].values
y_va = val_df[LABEL_COL].astype(int).values

print(f"Train rows (with next-year labels): {X_tr.shape[0]}")
print(f"Validation rows (with next-year labels): {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Scaling statistics come from the training split only.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier features (RBF kernel approximation) of the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on top of the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Validate (AUC + Brier on PROBABILITIES for Depression_{t+1}) ----------
# Reuse the transforms fitted on the training split; nothing is re-fitted here.
Zva = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
class_probs = clf.predict_proba(Zva_r)
p_va = class_probs[:, 1]  # P(Depression_next = 1)

auc, brier = roc_auc_score(y_va, p_va), brier_score_loss(y_va, p_va)

print(
    "\n=== Validation performance (predicting next-year Depression) ===",
    f"AUC   : {auc:.6f}",
    f"Brier : {brier:.6f}",
    sep="\n",
)

# ---------- Save validation probabilities ----------
# New column = P(Depression in Year+1 | features at Year); val_df is not modified.
val_out = val_df.assign(Depression_prob_next=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(
    f"\nSaved validation probabilities to '{OUT_CSV}'.",
    "Column added: 'Depression_prob_next' (P(Depression in next year = 1)).",
    sep="\n",
)


Train rows (with next-year labels): 610
Validation rows (with next-year labels): 396

=== Validation performance (predicting next-year Depression) ===
AUC   : 0.719733
Brier : 0.107820

Saved validation probabilities to 'ValidationML_with_probs.csv'.
Column added: 'Depression_prob_next' (P(Depression in next year = 1)).


In [38]:
# Depression Predictor, Validation test to identify optimal hyper-parameters (Gamma, Dim, & Lambda)
# Train on Train2ML.csv, validate on PredictML2.csv.
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2 / ridge-like)
# Target: Depression in year t+1 (Depression_next) using features at time t.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

# ---------- Config ----------
# Input files: training split and the held-out split that is scored below.
TRAIN_CSV = "Train2ML.csv"
VAL_CSV   = "PredictML2.csv"

# Hyper-parameters being evaluated in this run.
RFF_GAMMA = 0.15     # RBF kernel coefficient gamma in exp(-gamma * ||x - y||^2); larger = narrower kernel
RFF_DIM   = 3000     # number of random Fourier features
LOGIT_C   = 20.0     # inverse of L2 strength (larger C = weaker penalty)
SEED      = 42       # random_state passed to the RFF sampler and the classifier

LABEL_COL = "Depression_next"  # binary outcome: Depression in year t+1

# Model inputs, grouped by theme ("Cons/GDP" is deliberately not in the list).
FEATURES = [
    # Circulation
    "M0/GDP","M1/GDP","M0/M1",
    # Creation
    "∆ Debt/GDP","Debt Growth","Invest/GDP",
    # Valuation
    "REER","Inflation Rate","ST Interest","HPI","Curve",
    # Efficiency
    "UR","CreditSprd","Nom GDPg"
]

# Output CSV for the held-out predictions.
# Named after the scored input (VAL_CSV): the previous name,
# "ValidationML_with_probs.csv", is also written by the TrainML/ValidationML
# cells, so running this cell silently replaced their results.
OUT_CSV = "PredictML2_with_probs_next.csv"

# ---------- Load ----------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)

# ---------- Build next-year labels (Depression_{t+1}) ----------
# Sort by Country & Year so shift(-1) looks at the following row of the same country.
train_df = train_df.sort_values(["Country", "Year"])
val_df   = val_df.sort_values(["Country", "Year"])

# Year and Depression of the following row within each country.
train_next = train_df.groupby("Country")[["Year", "Depression"]].shift(-1)
val_next   = val_df.groupby("Country")[["Year", "Depression"]].shift(-1)

# The following row is only "year t+1" when its Year is exactly Year + 1.
# If a country has a gap in its years, a plain shift(-1) would label year t
# with a later year's outcome, so those rows get NaN and are dropped below.
# Assumes "Year" is numeric.
train_df[LABEL_COL] = train_next["Depression"].where(train_next["Year"].eq(train_df["Year"] + 1))
val_df[LABEL_COL]   = val_next["Depression"].where(val_next["Year"].eq(val_df["Year"] + 1))

# Drop rows where we don't have full features or next-year label
train_df = train_df.dropna(subset=FEATURES + [LABEL_COL]).copy()
val_df   = val_df.dropna(subset=FEATURES + [LABEL_COL]).copy()

X_tr = train_df[FEATURES].values                 # features at year t
y_tr = train_df[LABEL_COL].astype(int).values    # Depression at year t+1

X_va = val_df[FEATURES].values
y_va = val_df[LABEL_COL].astype(int).values

print(f"Train rows (with next-year labels): {X_tr.shape[0]}")
print(f"Validation rows (with next-year labels): {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Scaling statistics come from the training split only.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier features (RBF kernel approximation) of the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on top of the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Validate (AUC + Brier on PROBABILITIES for Depression_{t+1}) ----------
# Reuse the transforms fitted on the training split; nothing is re-fitted here.
Zva = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
class_probs = clf.predict_proba(Zva_r)
p_va = class_probs[:, 1]  # P(Depression_next = 1)

auc, brier = roc_auc_score(y_va, p_va), brier_score_loss(y_va, p_va)

print(
    "\n=== Validation performance (predicting next-year Depression) ===",
    f"AUC   : {auc:.6f}",
    f"Brier : {brier:.6f}",
    sep="\n",
)

# ---------- Save validation probabilities ----------
# New column = P(Depression in Year+1 | features at Year); val_df is not modified.
val_out = val_df.assign(Depression_prob_next=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(
    f"\nSaved validation probabilities to '{OUT_CSV}'.",
    "Column added: 'Depression_prob_next' (P(Depression in next year = 1)).",
    sep="\n",
)

Train rows (with next-year labels): 1021
Validation rows (with next-year labels): 53

=== Validation performance (predicting next-year Depression) ===
AUC   : 0.750000
Brier : 0.115521

Saved validation probabilities to 'ValidationML_with_probs.csv'.
Column added: 'Depression_prob_next' (P(Depression in next year = 1)).


In [37]:
# train_predict_depression_rff_next.py
# Train on Train2ML.csv, using year t features to predict Depression in year t+1.
# Then score ForecastML3.csv (2025 features) as 2026 depression probabilities.
# Model: Standardize -> RFF (RBF) -> Logistic Regression (L2).
# Target: Depression_next (Depression in t+1) in TRAIN only.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression

# ---------- Config ----------
TRAIN_CSV = "Train2ML.csv"
VAL_CSV = "ForecastML3.csv"                  # rows to score (described by the author as 2025 rows)
OUT_CSV = "ForecastML3_with_probs_next.csv"  # scored rows plus the probability column

LABEL_COL = "Depression_next"   # created from 'Depression' in TRAIN during data preparation
SEED = 42                       # random_state passed to the RFF sampler and the classifier

# Hyper-parameters
RFF_GAMMA = 0.2    # RBF kernel coefficient gamma (larger = narrower kernel)
RFF_DIM = 3000     # number of random Fourier features
LOGIT_C = 20.0     # inverse of L2 strength (larger C = weaker L2)

# Feature columns by theme ("Cons/GDP" is deliberately not included).
CIRCULATION = ["M0/GDP", "M1/GDP", "M0/M1"]
CREATION = ["∆ Debt/GDP", "Debt Growth", "Invest/GDP"]
VALUATION = ["REER", "Inflation Rate", "ST Interest", "HPI", "Curve"]
EFFICIENCY = ["UR", "CreditSprd", "Nom GDPg"]
FEATURES = [*CIRCULATION, *CREATION, *VALUATION, *EFFICIENCY]

# ---------- Load ----------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)

# ---------- Build next-year labels in TRAIN (Depression_{t+1}) ----------
# TRAIN must provide Year, Depression and FEATURES. When a Country column is
# present, "next year" is taken within each country; otherwise the file is
# treated as a single time series ordered by Year.
if "Country" in train_df.columns and "Year" in train_df.columns:
    train_df = train_df.sort_values(["Country", "Year"])
    next_row = train_df.groupby("Country")[["Year", "Depression"]].shift(-1)
elif "Year" in train_df.columns:
    train_df = train_df.sort_values("Year")
    next_row = train_df[["Year", "Depression"]].shift(-1)
else:
    raise ValueError("TRAIN_CSV must have at least a 'Year' column to build next-year labels.")

# The following row is only "year t+1" when its Year is exactly Year + 1.
# With a gap in the years, a plain shift(-1) would label year t with a later
# year's outcome, so those rows get NaN and are dropped below.
# Assumes "Year" is numeric.
train_df[LABEL_COL] = next_row["Depression"].where(next_row["Year"].eq(train_df["Year"] + 1))

# Keep only rows with full features AND a defined next-year label
train_df = train_df.dropna(subset=FEATURES + [LABEL_COL]).copy()

# ---------- Design matrices ----------
X_tr = train_df[FEATURES].values                 # features at year t
y_tr = train_df[LABEL_COL].astype(int).values    # Depression at year t+1

# For forecast file, we just need features at year t (2025); no label needed
val_df = val_df.dropna(subset=FEATURES).copy()
X_va   = val_df[FEATURES].values

print(f"Train rows (with next-year labels): {X_tr.shape[0]}")
print(f"Forecast rows (2025 features -> 2026 prob): {X_va.shape[0]}")

# ---------- Standardize -> RFF -> Logistic ----------
# Scaling statistics come from the training rows only.
scaler = StandardScaler()
Ztr = scaler.fit_transform(X_tr)

# Random Fourier features (RBF kernel approximation) of the scaled inputs.
rff = RBFSampler(n_components=RFF_DIM, gamma=RFF_GAMMA, random_state=SEED)
Ztr_r = rff.fit_transform(Ztr)

# L2-penalised logistic regression on top of the random features.
logit_kwargs = dict(
    penalty="l2",
    C=LOGIT_C,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=SEED,
)
clf = LogisticRegression(**logit_kwargs)
clf.fit(Ztr_r, y_tr)

# ---------- Predict Forecast (as 2026 probabilities from 2025 features) ----------
# The forecast rows go through the scaler and RFF map fitted on TRAIN.
Zva = scaler.transform(X_va)
Zva_r = rff.transform(Zva)
class_probs = clf.predict_proba(Zva_r)
p_va = class_probs[:, 1]  # P(Depression_next = 1 | features at 2025)

# ---------- Save ----------
# Forecast rows with the probability of Depression in 2026 appended;
# val_df itself is not modified.
val_out = val_df.assign(Depression_prob_2026=p_va)
val_out.to_csv(OUT_CSV, index=False)

print(f"\nSaved probabilities to '{OUT_CSV}' (column: Depression_prob_2026).")



Train rows (with next-year labels): 1021
Forecast rows (2025 features -> 2026 prob): 25

Saved probabilities to 'ForecastML3_with_probs_next.csv' (column: Depression_prob_2026).
