In [9]:
# Constant-memory filter -> COMPRESSED CSV (GZIP)
# Input: outputs/labels_20pct.parquet  +  train_data.csv  (same folder as notebook)
# Output: outputs/dev_sample_raw.csv.gz

from pathlib import Path
import pandas as pd
import csv, gzip, os

TRAIN_CSV = Path("train_data.csv")
OUT_DIR   = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_GZ    = OUT_DIR / "dev_sample_raw.csv.gz"   # compressed!

# Load sampled IDs (small)
LABELS_20 = OUT_DIR / "labels_20pct.parquet"
labels_20 = pd.read_parquet(LABELS_20)
sample_ids = set(labels_20["customer_ID"].astype(str))

# Remove any old big files to free space
for p in [OUT_DIR / "dev_sample_raw.csv", OUT_GZ]:
    if p.exists():
        try: p.unlink()
        except Exception: pass

scanned = kept = 0

with open(TRAIN_CSV, "r", newline="", encoding="utf-8") as src, \
     gzip.open(OUT_GZ, "wt", newline="", encoding="utf-8") as dst:
    r = csv.reader(src)
    w = csv.writer(dst)

    header = [h.strip() for h in next(r)]
    try:
        cid_idx = header.index("customer_ID")
    except ValueError:
        raise RuntimeError("'customer_ID' not found in header.")
    w.writerow(header)

    for row in r:
        scanned += 1
        if row[cid_idx] in sample_ids:
            w.writerow(row)
            kept += 1
        if scanned % 1_000_000 == 0:
            print(f"Scanned {scanned:,} | kept {kept:,}")

print("\nDone.")
print(f"Scanned {scanned:,} | kept {kept:,} -> {OUT_GZ.resolve()}")


Scanned 1,000,000 | kept 201,694
Scanned 2,000,000 | kept 402,800
Scanned 3,000,000 | kept 601,918
Scanned 4,000,000 | kept 802,521
Scanned 5,000,000 | kept 1,001,355

Done.
Scanned 5,531,451 | kept 1,107,221 -> C:\Users\hp\Desktop\UTDALLAS\Fall 25\BUANML\GroupProject\outputs\dev_sample_raw.csv.gz


In [10]:
from pathlib import Path
import pandas as pd

OUT = Path("outputs")
dev_gz = OUT / "dev_sample_raw.csv.gz"
lbl_pq = OUT / "labels_20pct.parquet"

assert dev_gz.exists(), "dev_sample_raw.csv.gz not found"
labels_20 = pd.read_parquet(lbl_pq)
want = set(labels_20["customer_ID"].astype(str))

# check coverage using a light read of only customer_ID
seen = set()
for chunk in pd.read_csv(dev_gz, usecols=["customer_ID"], dtype={"customer_ID":"string"}, chunksize=200_000):
    seen.update(chunk["customer_ID"])

print("customers in labels_20:", len(want))
print("customers found in dev :", len(seen))
print("coverage:", len(seen & want) / len(want))


customers in labels_20: 91783
customers found in dev : 91783
coverage: 1.0


In [11]:
# Step 5: One-Hot Encoding
# First, let's examine the data structure to identify categorical variables

from pathlib import Path
import pandas as pd

OUT = Path("outputs")
dev_gz = OUT / "dev_sample_raw.csv.gz"

# Read a small sample to inspect columns
sample = pd.read_csv(dev_gz, nrows=1000)
print("Columns in dataset:", len(sample.columns))
print("\nColumn types:")
print(sample.dtypes.value_counts())
print("\nFirst few column names:")
print(list(sample.columns[:20]))


Columns in dataset: 190

Column types:
float64    185
object       4
int64        1
Name: count, dtype: int64

First few column names:
['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48']


In [1]:
# Better categorical probe (EDA only)
categorical_cols = [c for c in sample.columns
                    if sample[c].dtype == 'object' and c in ('D_63','D_64')]
print("Categoricals found:", categorical_cols)


NameError: name 'sample' is not defined

In [16]:
# Aggregation with full Kaggle categorical list (encode on last snapshot only)

import pandas as pd, numpy as np
from pathlib import Path

OUT = Path("outputs")
DEV_GZ      = OUT / "dev_sample_raw.csv.gz"
LABELS_20PQ = OUT / "labels_20pct.parquet"
OUT_AGG_GZ  = OUT / "dev_sample_agg.csv.gz"

# 1) Load
dev = pd.read_csv(
    DEV_GZ,
    dtype={"customer_ID":"string"},
    parse_dates=["S_2"],
    low_memory=True
)
labels_20 = pd.read_parquet(LABELS_20PQ)
labels_20["customer_ID"] = labels_20["customer_ID"].astype("string")
tgt = "target" if "target" in labels_20.columns else labels_20.columns[-1]
labels_20[tgt] = labels_20[tgt].astype(int)

# 2) Known categoricals from Kaggle (keep only those present)
KAGGLE_CATS = ['B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68']
cat_cols = [c for c in KAGGLE_CATS if c in dev.columns]

# 3) Sort for last-k logic and get the last row per customer
dev = dev.sort_values(["customer_ID","S_2"])
last = dev.groupby("customer_ID", sort=False).tail(1).copy()

# 4) One-hot encode categoricals on the last snapshot only
if cat_cols:
    for c in cat_cols:
        last[c] = last[c].astype("category")
    cats_last = pd.get_dummies(last[["customer_ID"] + cat_cols],
                               columns=cat_cols, dummy_na=True, dtype="uint8")
else:
    cats_last = last[["customer_ID"]].copy()

# 5) Numeric columns to aggregate (exclude ID, date, and raw cats)
ignore = {"customer_ID","S_2"} | set(cat_cols)
num_cols = [c for c in dev.columns if c not in ignore and np.issubdtype(dev[c].dtype, np.number)]
if not num_cols:
    raise RuntimeError("No numeric features to aggregate.")

# 6) Last values for numeric
last_vals = last[["customer_ID"] + num_cols].copy().set_index("customer_ID")
last_vals.columns = [f"{c}_last" for c in last_vals.columns]

# 7) Rolling means over last 3/6/12 rows for numeric only
def last_k_mean(g, cols, k): return g[cols].tail(k).mean()

rolled = []
for k in [3,6,12]:
    r = dev.groupby("customer_ID", sort=False).apply(lambda g: last_k_mean(g, num_cols, k))
    r.index.name = "customer_ID"
    r.columns = [f"{c}_mean{k}" for c in r.columns]
    for c in r.columns: r[c] = r[c].astype(np.float32)
    rolled.append(r)

# 8) Combine numeric aggs + last-snapshot categoricals, join target
agg = pd.concat([last_vals] + rolled, axis=1).reset_index()
agg = agg.merge(cats_last, on="customer_ID", how="left").merge(
    labels_20[["customer_ID", tgt]], on="customer_ID", how="inner"
)

# 9) Save
agg.to_csv(OUT_AGG_GZ, index=False, compression="gzip")
print("Saved:", OUT_AGG_GZ, "| shape:", agg.shape, "| cats encoded:", cat_cols)


  r = dev.groupby("customer_ID", sort=False).apply(lambda g: last_k_mean(g, num_cols, k))
  r = dev.groupby("customer_ID", sort=False).apply(lambda g: last_k_mean(g, num_cols, k))
  r = dev.groupby("customer_ID", sort=False).apply(lambda g: last_k_mean(g, num_cols, k))


Saved: outputs\dev_sample_agg.csv.gz | shape: (91783, 762) | cats encoded: ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']


# Step 6: Feature Engineering

Next steps for feature engineering:
1. Load the encoded dataset and labels
2. Merge labels with the data
3. Aggregate historical data (up to 13 months) per customer to create features as of April 2018
4. For numerical features: create aggregates (Average, Sum, Min, Max) over different time windows (3, 6, 9, 12 months)
5. Include the most recent value (April 2018) for each feature
6. Handle missing data (some customers may have less than 13 months of data)


In [5]:
# Split 70/15/15 by customer, run two XGBs, save feature importances, select features (≥0.5%)

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"   # created in the aggregation step
assert AGG.exists(), "Run the aggregation step first to create outputs/dev_sample_agg.csv.gz"

# 1) Load aggregated, build X/y
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target_col = "target" if "target" in df.columns else df.columns[-1]
X = df.drop(columns=["customer_ID", target_col], errors="ignore")
y = df[target_col].astype(int).values

# 2) 70/15/15 split (stratified)
X_tr, X_tmp, y_tr, y_tmp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2 = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

print("Shapes ->",
      "Train:", X_tr.shape,
      "Test1:", X_t1.shape,
      "Test2:", X_t2.shape)

# Utility to compute FI (gain-based) as % of total
def xgb_fit_and_fi(model, Xtr, ytr, Xt1, yt1, Xt2, yt2, fi_path):
    model.fit(Xtr, ytr)
    # AUCs (quick check)
    p_tr = model.predict_proba(Xtr)[:,1]
    p_t1 = model.predict_proba(Xt1)[:,1]
    p_t2 = model.predict_proba(Xt2)[:,1]
    print(f"AUC Train={roc_auc_score(ytr,p_tr):.4f}  "
          f"Test1={roc_auc_score(yt1,p_t1):.4f}  "
          f"Test2={roc_auc_score(yt2,p_t2):.4f}")

    booster = model.get_booster()
    gain = booster.get_score(importance_type="gain")
    fi = pd.DataFrame({"feature": list(gain.keys()), "gain": list(gain.values())})
    fi["pct"] = fi["gain"] / fi["gain"].sum()
    fi.sort_values("pct", ascending=False).to_csv(fi_path, index=False)
    print("Saved FI ->", fi_path)
    return fi

# 3) XGB (default-ish)
model_default = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
fi_default = xgb_fit_and_fi(model_default, X_tr, y_tr, X_t1, y_t1, X_t2, y_t2,
                            OUT / "xgb_fi_default.csv")

# 4) XGB (tuned as per spec in the brief)
model_tuned = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    n_estimators=300,
    learning_rate=0.5,
    max_depth=4,
    subsample=0.5,
    colsample_bytree=0.5,
    scale_pos_weight=5,
    random_state=42,
    n_jobs=-1
)
fi_tuned = xgb_fit_and_fi(model_tuned, X_tr, y_tr, X_t1, y_t1, X_t2, y_t2,
                          OUT / "xgb_fi_tuned.csv")

# 5) Keep features with FI ≥ 0.5% in either model
keep = set(fi_default.loc[fi_default["pct"] >= 0.005, "feature"]).union(
       set(fi_tuned.loc[fi_tuned["pct"] >= 0.005, "feature"])
)
keep = sorted(keep)

# Save the selection for the grid-search step
(OUT / "selected_features.txt").write_text("\n".join(keep), encoding="utf-8")
print(f"Selected {len(keep)} features with FI ≥ 0.5%. Saved -> outputs/selected_features.txt")


Shapes -> Train: (64248, 792) Test1: (13767, 792) Test2: (13768, 792)
AUC Train=0.9980  Test1=0.9531  Test2=0.9519
Saved FI -> outputs\xgb_fi_default.csv
AUC Train=0.9960  Test1=0.9395  Test2=0.9349
Saved FI -> outputs\xgb_fi_tuned.csv
Selected 14 features with FI ≥ 0.5%. Saved -> outputs/selected_features.txt


In [6]:
# Grid search using selected features (from outputs/selected_features.txt)

from pathlib import Path
import pandas as pd, numpy as np, json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"
SEL = OUT / "selected_features.txt"
assert AGG.exists() and SEL.exists(), "Need dev_sample_agg.csv.gz and selected_features.txt"

# Load data
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target_col = "target" if "target" in df.columns else df.columns[-1]

with open(SEL, "r", encoding="utf-8") as f:
    keep = [ln.strip() for ln in f if ln.strip()]
# keep only features that still exist (safety)
keep = [c for c in keep if c in df.columns]

X = df[keep].copy()
y = df[target_col].astype(int).values
cust = df["customer_ID"].astype("string").values

# 70/15/15 splits (keep customer ids aligned)
X_tr, X_tmp, y_tr, y_tmp, c_tr, c_tmp = train_test_split(
    X, y, cust, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2, c_t1, c_t2 = train_test_split(
    X_tmp, y_tmp, c_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

grid = {
    "n_estimators": [50, 100, 300],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 1.0],
    "scale_pos_weight": [1, 5, 10],
}

rows = []
for n in grid["n_estimators"]:
    for lr in grid["learning_rate"]:
        for ss in grid["subsample"]:
            for cs in grid["colsample_bytree"]:
                for w in grid["scale_pos_weight"]:
                    model = xgb.XGBClassifier(
                        objective="binary:logistic",
                        eval_metric="auc",
                        tree_method="hist",
                        n_estimators=n,
                        learning_rate=lr,
                        subsample=ss,
                        colsample_bytree=cs,
                        scale_pos_weight=w,
                        random_state=42,
                        n_jobs=-1,
                    )
                    model.fit(X_tr, y_tr)
                    p_tr = model.predict_proba(X_tr)[:, 1]
                    p_t1 = model.predict_proba(X_t1)[:, 1]
                    p_t2 = model.predict_proba(X_t2)[:, 1]
                    rows.append({
                        "#Trees": n, "LR": lr, "Subsample": ss, "%Features": cs, "Weight": w,
                        "AUC_Train": roc_auc_score(y_tr, p_tr),
                        "AUC_Test1": roc_auc_score(y_t1, p_t1),
                        "AUC_Test2": roc_auc_score(y_t2, p_t2),
                    })

grid_df = pd.DataFrame(rows)
grid_df["AUC_Mean"] = grid_df[["AUC_Train", "AUC_Test1", "AUC_Test2"]].mean(axis=1)
grid_df["AUC_Std"]  = grid_df[["AUC_Train", "AUC_Test1", "AUC_Test2"]].std(axis=1)
grid_df["score"]    = grid_df["AUC_Mean"] - 0.5*grid_df["AUC_Std"]

grid_path = OUT / "xgb_grid_results.csv"
grid_df.to_csv(grid_path, index=False)
best = grid_df.sort_values("score", ascending=False).iloc[0].to_dict()
print("Saved grid ->", grid_path)
print("Best:", best)

best_params = {
    "n_estimators": int(best["#Trees"]),
    "learning_rate": float(best["LR"]),
    "subsample": float(best["Subsample"]),
    "colsample_bytree": float(best["%Features"]),
    "scale_pos_weight": float(best["Weight"]),
}
(OUT / "xgb_best_params.json").write_text(json.dumps(best_params, indent=2))
print("Saved best params -> outputs/xgb_best_params.json")


Saved grid -> outputs\xgb_grid_results.csv
Best: {'#Trees': 100.0, 'LR': 0.1, 'Subsample': 0.8, '%Features': 0.5, 'Weight': 1.0, 'AUC_Train': 0.9597726204942625, 'AUC_Test1': 0.9461327880873672, 'AUC_Test2': 0.9452347912413078, 'AUC_Mean': 0.9503800666076457, 'AUC_Std': 0.008146572958528707, 'score': 0.9463067801283814}
Saved best params -> outputs/xgb_best_params.json


In [7]:
# Final fit + holdout evaluation + rank-ordering plot + save predictions

from pathlib import Path
import pandas as pd, numpy as np, json, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"
SEL = OUT / "selected_features.txt"
BEST = OUT / "xgb_best_params.json"

# Load data
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target_col = "target" if "target" in df.columns else df.columns[-1]
with open(SEL, "r", encoding="utf-8") as f:
    keep = [ln.strip() for ln in f if ln.strip() and ln.strip() in df.columns]
best_params = json.loads(BEST.read_text())

X = df[keep].copy()
y = df[target_col].astype(int).values
cust = df["customer_ID"].astype("string").values

# Splits (same as grid cell)
X_tr, X_tmp, y_tr, y_tmp, c_tr, c_tmp = train_test_split(
    X, y, cust, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2, c_t1, c_t2 = train_test_split(
    X_tmp, y_tmp, c_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

# Refit on Train+Test1, evaluate on Test2
X_fit = pd.concat([X_tr, X_t1], axis=0)
y_fit = np.concatenate([y_tr, y_t1])
model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    **best_params
)
model.fit(X_fit, y_fit)

p_fit = model.predict_proba(X_fit)[:,1]
p_hold = model.predict_proba(X_t2)[:,1]
print("AUC (fit set):   ", roc_auc_score(y_fit, p_fit))
print("AUC (HOLDOUT):   ", roc_auc_score(y_t2, p_hold))

# Rank-ordering (deciles)
def rank_order(y, scores, q=10):
    df_ = pd.DataFrame({"y": y, "s": scores})
    df_["bin"] = pd.qcut(df_["s"], q, duplicates="drop")
    ro = df_.groupby("bin")["y"].mean()
    ro.index = range(len(ro))
    return ro

ro_fit  = rank_order(y_fit, p_fit)
ro_hold = rank_order(y_t2, p_hold)

plt.figure()
plt.plot(ro_fit.index, ro_fit.values, marker="o", label="Fit (Train+Test1)")
plt.plot(ro_hold.index, ro_hold.values, marker="o", label="Holdout (Test2)")
plt.xlabel("Score decile (low → high)")
plt.ylabel("Default rate")
plt.title("Rank Ordering — XGB (best params)")
plt.legend()
plt.tight_layout()
plt.savefig(OUT / "xgb_rank_order.png", dpi=150)
plt.close()
print("Saved plot -> outputs/xgb_rank_order.png")

# Save model + holdout scores
model.save_model(str(OUT / "xgb_final.json"))
pd.DataFrame({"customer_ID": c_t2, "y_true": y_t2, "score": p_hold}).to_csv(
    OUT / "xgb_scores_test2.csv", index=False
)
print("Saved model -> outputs/xgb_final.json")
print("Saved holdout scores -> outputs/xgb_scores_test2.csv")


AUC (fit set):    0.9578198757587091
AUC (HOLDOUT):    0.9454892883442277


  ro = df_.groupby("bin")["y"].mean()
  ro = df_.groupby("bin")["y"].mean()


Saved plot -> outputs/xgb_rank_order.png
Saved model -> outputs/xgb_final.json
Saved holdout scores -> outputs/xgb_scores_test2.csv


In [8]:
# ---- Point 11: Neural Network grid search (sklearn MLP) ----
# Inputs: outputs/dev_sample_agg.csv.gz  +  outputs/selected_features.txt (from XGB FI step)
# Outputs: outputs/nn_grid_results.csv, outputs/nn_best_params.json,
#          outputs/nn_final.joblib, outputs/nn_scores_test2.csv

from pathlib import Path
import pandas as pd, numpy as np, json
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import joblib

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"
SEL = OUT / "selected_features.txt"
assert AGG.exists() and SEL.exists(), "Run aggregation + feature selection first."

# ---- Load features/labels (use the same selected features as XGB for fairness) ----
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target = "target" if "target" in df.columns else df.columns[-1]
with open(SEL, "r", encoding="utf-8") as f:
    keep = [ln.strip() for ln in f if ln.strip() and ln.strip() in df.columns]

X = df[keep].copy()
y = df[target].astype(int).values
cust = df["customer_ID"].astype("string").values

# ---- 70/15/15 split (same seed as before) ----
X_tr, X_tmp, y_tr, y_tmp, c_tr, c_tmp = train_test_split(
    X, y, cust, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2, c_t1, c_t2 = train_test_split(
    X_tmp, y_tmp, c_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

# ---- Preprocess: cap/floor by train quantiles, then impute + standardize ----
q_lo = X_tr.quantile(0.01); q_hi = X_tr.quantile(0.99)
def clip(df_): return df_.clip(q_lo, q_hi, axis=1)

X_tr_c = clip(X_tr); X_t1_c = clip(X_t1); X_t2_c = clip(X_t2)

imp = SimpleImputer(strategy="median")
X_tr_i = imp.fit_transform(X_tr_c)
X_t1_i = imp.transform(X_t1_c)
X_t2_i = imp.transform(X_t2_c)

sc = StandardScaler()
X_tr_s = sc.fit_transform(X_tr_i)
X_t1_s = sc.transform(X_t1_i)
X_t2_s = sc.transform(X_t2_i)

# ---- Grid (keep small to be fast; expand later if you like) ----
grid = [
    {"layers": (128,),        "lr": 1e-3, "alpha": 1e-4, "batch": 512},
    {"layers": (256,),        "lr": 1e-3, "alpha": 1e-4, "batch": 512},
    {"layers": (128, 64),     "lr": 1e-3, "alpha": 1e-4, "batch": 512},
    {"layers": (256, 128),    "lr": 1e-3, "alpha": 1e-4, "batch": 512},
    {"layers": (128,),        "lr": 1e-4, "alpha": 1e-4, "batch": 1024},
    {"layers": (128, 64),     "lr": 1e-4, "alpha": 1e-3, "batch": 1024},
]

rows = []
best = None
best_score = -1e9
for g in grid:
    clf = MLPClassifier(
        hidden_layer_sizes=g["layers"],
        activation="relu",
        solver="adam",
        learning_rate_init=g["lr"],
        alpha=g["alpha"],
        batch_size=g["batch"],
        max_iter=50,                 # keep short; we use early stopping
        early_stopping=True,
        n_iter_no_change=5,
        validation_fraction=0.1,
        random_state=42,
        verbose=False,
    )
    clf.fit(X_tr_s, y_tr)

    p_tr = clf.predict_proba(X_tr_s)[:,1]
    p_t1 = clf.predict_proba(X_t1_s)[:,1]
    p_t2 = clf.predict_proba(X_t2_s)[:,1]

    auc_tr  = roc_auc_score(y_tr,  p_tr)
    auc_t1  = roc_auc_score(y_t1,  p_t1)
    auc_t2  = roc_auc_score(y_t2,  p_t2)
    auc_mu  = np.mean([auc_tr, auc_t1, auc_t2])
    auc_std = np.std([auc_tr, auc_t1, auc_t2])
    score   = auc_mu - 0.5*auc_std     # prefer high mean, low variance

    rows.append({
        "Layers": g["layers"], "LR": g["lr"], "Alpha": g["alpha"], "Batch": g["batch"],
        "AUC_Train": auc_tr, "AUC_Test1": auc_t1, "AUC_Test2": auc_t2,
        "AUC_Mean": auc_mu, "AUC_Std": auc_std, "score": score
    })

    if score > best_score:
        best_score = score
        best = {"params": g, "auc_train": auc_tr, "auc_t1": auc_t1, "auc_t2": auc_t2}

grid_df = pd.DataFrame(rows)
grid_df.to_csv(OUT / "nn_grid_results.csv", index=False)
print("Saved -> outputs/nn_grid_results.csv")
print("Best (by mean-0.5*std):", best)

# ---- Refit best NN on Train+Test1, evaluate on holdout (Test2), save artifacts ----
X_fit = np.vstack([X_tr_s, X_t1_s])
y_fit = np.concatenate([y_tr, y_t1])
best_g = best["params"]
final = MLPClassifier(
    hidden_layer_sizes=best_g["layers"], activation="relu", solver="adam",
    learning_rate_init=best_g["lr"], alpha=best_g["alpha"], batch_size=best_g["batch"],
    max_iter=100, early_stopping=True, n_iter_no_change=8, validation_fraction=0.1,
    random_state=42, verbose=False
).fit(X_fit, y_fit)

p_fit  = final.predict_proba(X_fit)[:,1]
p_hold = final.predict_proba(X_t2_s)[:,1]
print("Final NN AUC (fit):", roc_auc_score(y_fit, p_fit))
print("Final NN AUC (holdout):", roc_auc_score(y_t2, p_hold))

# Save pipeline pieces + model
artifacts = {
    "q_lo": q_lo.to_dict(), "q_hi": q_hi.to_dict(),
    "imputer": imp, "scaler": sc, "model": final, "features": keep
}
joblib.dump(artifacts, OUT / "nn_final.joblib")
pd.DataFrame({"customer_ID": c_t2, "y_true": y_t2, "score": p_hold}).to_csv(
    OUT / "nn_scores_test2.csv", index=False
)
(OUT / "nn_best_params.json").write_text(json.dumps(best_g, indent=2))
print("Saved -> nn_final.joblib, nn_scores_test2.csv, nn_best_params.json")


Saved -> outputs/nn_grid_results.csv
Best (by mean-0.5*std): {'params': {'layers': (128, 64), 'lr': 0.001, 'alpha': 0.0001, 'batch': 512}, 'auc_train': np.float64(0.9465945782649551), 'auc_t1': np.float64(0.944309602412749), 'auc_t2': np.float64(0.9431662909143544)}
Final NN AUC (fit): 0.9462795521742915
Final NN AUC (holdout): 0.9434037152398183
Saved -> nn_final.joblib, nn_scores_test2.csv, nn_best_params.json


In [11]:
# Point 12 — Neural Network preprocessing + grid, per brief

from pathlib import Path
import json, math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow import keras

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"
SEL = OUT / "selected_features.txt"
assert AGG.exists() and SEL.exists(), "Run steps up to feature selection first."

# ---------- Load data & selected features ----------
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target = "target" if "target" in df.columns else df.columns[-1]
with open(SEL, "r", encoding="utf-8") as f:
    keep = [ln.strip() for ln in f if ln.strip() and ln.strip() in df.columns]

X = df[keep].copy()
y = df[target].astype(int).values
cust = df["customer_ID"].astype("string").values

# ---------- 70/15/15 split (same seed as other steps) ----------
X_tr, X_tmp, y_tr, y_tmp, c_tr, c_tmp = train_test_split(
    X, y, cust, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2, c_t1, c_t2 = train_test_split(
    X_tmp, y_tmp, c_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

# ---------- Preprocess (cap/floor at 1%/99%, missing=0, StandardScaler) ----------
q_lo = X_tr.quantile(0.01); q_hi = X_tr.quantile(0.99)
def clip(df_): return df_.clip(q_lo, q_hi, axis=1)

X_tr_c = clip(X_tr).fillna(0.0)
X_t1_c = clip(X_t1).fillna(0.0)
X_t2_c = clip(X_t2).fillna(0.0)

sc = StandardScaler()
X_tr_s = sc.fit_transform(X_tr_c)
X_t1_s = sc.transform(X_t1_c)
X_t2_s = sc.transform(X_t2_c)

n_features = X_tr_s.shape[1]

# ---------- Model factory (supports dropout%, layers, nodes, activation) ----------
def make_model(n_layers, n_nodes, activation, dropout_keep):
    inputs = keras.Input(shape=(n_features,))
    x = inputs
    # dropout_keep == 1.0 means "no dropout", 0.5 means "50% dropout"
    drop_rate = 1.0 - dropout_keep
    for _ in range(n_layers):
        x = keras.layers.Dense(n_nodes, activation=activation)(x)
        if drop_rate > 0:
            x = keras.layers.Dropout(drop_rate)(x)
    outputs = keras.layers.Dense(1, activation="sigmoid")(x)
    m = keras.Model(inputs, outputs)
    m.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[keras.metrics.AUC(name="auc")])
    return m

# ---------- Grid per brief ----------
grid = []
for n_layers in [2, 4]:
    for n_nodes in [4, 6]:
        for act in ["relu", "tanh"]:
            for keep_prob in [0.5, 1.0]:          # 50% dropout, or none
                for batch in [100, 10000]:
                    grid.append({
                        "layers": n_layers,
                        "nodes": n_nodes,
                        "activation": act,
                        "keep": keep_prob,
                        "batch": batch,
                    })

rows = []
best = None; best_score = -1e9

for g in grid:
    tf.keras.backend.clear_session()
    model = make_model(g["layers"], g["nodes"], g["activation"], g["keep"])
    # 20 epochs, as specified
    hist = model.fit(
        X_tr_s, y_tr,
        epochs=20, batch_size=g["batch"],
        verbose=0, validation_data=(X_t1_s, y_t1)
    )

    p_tr = model.predict(X_tr_s, verbose=0).ravel()
    p_t1 = model.predict(X_t1_s, verbose=0).ravel()
    p_t2 = model.predict(X_t2_s, verbose=0).ravel()

    auc_tr = roc_auc_score(y_tr, p_tr)
    auc_t1 = roc_auc_score(y_t1, p_t1)
    auc_t2 = roc_auc_score(y_t2, p_t2)
    auc_mu = np.mean([auc_tr, auc_t1, auc_t2])
    auc_sd = np.std([auc_tr, auc_t1, auc_t2])
    score  = auc_mu - 0.5*auc_sd  # high mean, low variance

    row = {
        "#HL": g["layers"], "#Node": g["nodes"], "Activation": g["activation"],
        "Dropout": f"{int((1-g['keep'])*100)}%", "Batch": g["batch"],
        "AUC_Train": auc_tr, "AUC_Test1": auc_t1, "AUC_Test2": auc_t2,
        "AUC_Mean": auc_mu, "AUC_Std": auc_sd, "score": score
    }
    rows.append(row)

    if score > best_score:
        best_score = score
        best = {"grid": g, "metrics": {"AUC_Train": auc_tr, "AUC_Test1": auc_t1, "AUC_Test2": auc_t2}}

grid_df = pd.DataFrame(rows)
grid_df.to_csv(OUT / "nn_grid_results.csv", index=False)
(Path(OUT / "nn_best_params.json")).write_text(json.dumps(best, indent=2))
print("Saved -> outputs/nn_grid_results.csv")
print("Best:", best)

# ---------- Refit best NN on Train+Test1, evaluate on Test2, and save ----------
g = best["grid"]
tf.keras.backend.clear_session()
final = make_model(g["layers"], g["nodes"], g["activation"], g["keep"])
final.fit(
    np.vstack([X_tr_s, X_t1_s]),
    np.concatenate([y_tr, y_t1]),
    epochs=20, batch_size=g["batch"], verbose=0
)

p_fit  = final.predict(np.vstack([X_tr_s, X_t1_s]), verbose=0).ravel()
p_hold = final.predict(X_t2_s, verbose=0).ravel()
print("Final NN AUC (fit):   ", roc_auc_score(np.concatenate([y_tr, y_t1]), p_fit))
print("Final NN AUC (holdout):", roc_auc_score(y_t2, p_hold))

final.save(OUT / "nn_final.keras")
pd.DataFrame({"customer_ID": c_t2, "y_true": y_t2, "score": p_hold}).to_csv(
    OUT / "nn_scores_test2.csv", index=False
)
print("Saved -> outputs/nn_final.keras, outputs/nn_scores_test2.csv, outputs/nn_best_params.json")



Saved -> outputs/nn_grid_results.csv
Best: {'grid': {'layers': 2, 'nodes': 6, 'activation': 'tanh', 'keep': 1.0, 'batch': 100}, 'metrics': {'AUC_Train': np.float64(0.9441669271055484), 'AUC_Test1': np.float64(0.9440770626885304), 'AUC_Test2': np.float64(0.9428032125963555)}}
Final NN AUC (fit):    0.9436333004616435
Final NN AUC (holdout): 0.9421228995294538
Saved -> outputs/nn_final.keras, outputs/nn_scores_test2.csv, outputs/nn_best_params.json


In [12]:
# Compare models on holdout and record the winner
from pathlib import Path
import pandas as pd
from sklearn.metrics import roc_auc_score

OUT = Path("outputs")
xgb_scores = OUT / "xgb_scores_test2.csv"
nn_scores  = OUT / "nn_scores_test2.csv"

rows = []
if xgb_scores.exists():
    df = pd.read_csv(xgb_scores)
    rows.append(("XGB", roc_auc_score(df["y_true"], df["score"])))
if nn_scores.exists():
    df = pd.read_csv(nn_scores)
    rows.append(("NN", roc_auc_score(df["y_true"], df["score"])))

if not rows:
    raise FileNotFoundError("No holdout score files found. Run the XGB/NN steps first.")

for name, auc in rows:
    print(f"{name} HOLDOUT AUC: {auc:.6f}")

winner = max(rows, key=lambda t: t[1])[0]
(OUT / "winner_model.txt").write_text(winner, encoding="utf-8")
print("\nWinner:", winner, "→ saved to outputs/winner_model.txt")


XGB HOLDOUT AUC: 0.945489
NN HOLDOUT AUC: 0.942123

Winner: XGB → saved to outputs/winner_model.txt


In [14]:
# Strategy thresholds for the winner model
# - Picks thresholds on TRAIN (≤5% and ≤10% default caps)
# - Reports metrics on Test1 and Test2
from pathlib import Path
import json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

OUT = Path("outputs")
AGG = OUT / "dev_sample_agg.csv.gz"
SEL = OUT / "selected_features.txt"
WIN = OUT / "winner_model.txt"
assert AGG.exists() and SEL.exists() and WIN.exists()

winner = WIN.read_text().strip()
print("Using winner:", winner)

# ---- common data/splits ----
df = pd.read_csv(AGG, dtype={"customer_ID":"string"})
target = "target" if "target" in df.columns else df.columns[-1]
with open(SEL, "r", encoding="utf-8") as f:
    keep = [ln.strip() for ln in f if ln.strip() and ln.strip() in df.columns]

X = df[keep].copy()
y = df[target].astype(int).values
cust = df["customer_ID"].astype("string").values

X_tr, X_tmp, y_tr, y_tmp, c_tr, c_tmp = train_test_split(
    X, y, cust, test_size=0.30, random_state=42, stratify=y
)
X_t1, X_t2, y_t1, y_t2, c_t1, c_t2 = train_test_split(
    X_tmp, y_tmp, c_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

# pick one spend and one balance proxy (for revenue calc)
def pick(prefix):
    for suff in ["_mean6","_mean3","_last"]:
        cols = [c for c in df.columns if c.startswith(prefix) and c.endswith(suff)]
        if cols: return cols[0]
    return None
S_col = pick("S_"); B_col = pick("B_")
mini = df.set_index("customer_ID")[[c for c in [S_col,B_col] if c]].astype("float32").fillna(0.0)
mini = mini.rename(columns={S_col:"S6", B_col:"B6"})

SPEND_FEE = 0.001   # 0.1% of spend
APR       = 0.24    # annual interest on balance

def eval_at(thr, y_true, pd_hat, ids):
    m = pd.DataFrame({"y":y_true, "pd":pd_hat, "id":ids}).join(mini, on="id").fillna(0.0)
    acc = m[m.pd < thr]
    if len(acc)==0: 
        return {"def_rate":np.nan, "accepted":0, "rev12m":0.0}
    monthly = acc.S6*SPEND_FEE + acc.B6*(APR/12)
    return {
        "def_rate": float(acc.y.mean()),
        "accepted": int(len(acc)),
        "rev12m": float((monthly * (1-acc.y)).sum()*12)
    }

def choose_on_train(pd_tr, ids_tr, cap):
    # sweep fixed grid of thresholds 2% → 50%
    ths = np.linspace(0.02, 0.50, 25)
    rows=[]
    m = pd.DataFrame({"pd":pd_tr, "id":ids_tr}).join(mini, on="id").fillna(0.0)
    # need y on train for default rate
    # we kept y_tr in outer scope
    m["y"] = y_tr
    for t in ths:
        acc = m[m.pd < t]
        if len(acc)==0: continue
        monthly = acc.S6*SPEND_FEE + acc.B6*(APR/12)
        rows.append({"thr":float(t),
                     "def_rate": float(acc.y.mean()),
                     "accepted": int(len(acc)),
                     "rev12m": float((monthly * (1-acc.y)).sum()*12)})
    grid = pd.DataFrame(rows)
    ok = grid[grid.def_rate <= cap]
    return None if ok.empty else ok.sort_values(["rev12m","accepted"], ascending=[False,False]).iloc[0].to_dict()

if winner == "XGB":
    import xgboost as xgb
    best = json.loads((OUT / "xgb_best_params.json").read_text())
    model = xgb.XGBClassifier(
        objective="binary:logistic", eval_metric="auc", tree_method="hist",
        random_state=42, n_jobs=-1, **best
    ).fit(pd.concat([X_tr, X_t1]), np.concatenate([y_tr, y_t1]))
    p_tr  = model.predict_proba(X_tr)[:,1]
    p_t1  = model.predict_proba(X_t1)[:,1]
    p_t2  = model.predict_proba(X_t2)[:,1]

else:  # winner == "NN"
    import tensorflow as tf
    from tensorflow import keras
    # preprocessing per brief: cap/floor 1/99, missing->0, StandardScaler
    from sklearn.preprocessing import StandardScaler
    q_lo = X_tr.quantile(0.01); q_hi = X_tr.quantile(0.99)
    def clip(df_): return df_.clip(q_lo, q_hi, axis=1)
    X_tr_c = clip(X_tr).fillna(0.0); X_t1_c = clip(X_t1).fillna(0.0); X_t2_c = clip(X_t2).fillna(0.0)
    sc = StandardScaler().fit(X_tr_c)
    X_tr_s = sc.transform(X_tr_c); X_t1_s = sc.transform(X_t1_c); X_t2_s = sc.transform(X_t2_c)
    model = keras.models.load_model(OUT / "nn_final.keras")
    p_tr  = model.predict(X_tr_s, verbose=0).ravel()
    p_t1  = model.predict(X_t1_s, verbose=0).ravel()
    p_t2  = model.predict(X_t2_s, verbose=0).ravel()

# choose thresholds on TRAIN
cons = choose_on_train(p_tr, c_tr, cap=0.05)   # ≤5% default
aggr = choose_on_train(p_tr, c_tr, cap=0.10)   # ≤10% default
print("Chosen on TRAIN:")
print("  Conservative:", cons)
print("  Aggressive :", aggr)

# --- Report both strategies and save correctly ---
rows = []

def add_rows(name, thr):
    if thr is None: 
        return
    r1 = eval_at(thr, y_t1, p_t1, c_t1)
    r2 = eval_at(thr, y_t2, p_t2, c_t2)
    print(f"\n{name} @ {thr:.3f}")
    print("  Test1:", r1)
    print("  Test2:", r2)
    rows.extend([
        {"Strategy": name, "Split": "Test1", **r1},
        {"Strategy": name, "Split": "Test2", **r2},
    ])

add_rows("Conservative", cons["thr"])
add_rows("Aggressive",   aggr["thr"])

import pandas as pd
pd.DataFrame(rows).to_csv(OUT / "strategy_summary.csv", index=False)
print("Saved -> outputs/strategy_summary.csv")


Using winner: XGB
Chosen on TRAIN:
  Conservative: {'thr': 0.4, 'def_rate': 0.049258133912927764, 'accepted': 45089.0, 'rev12m': 655.527446846479}
  Aggressive : {'thr': 0.5, 'def_rate': 0.06948099799978945, 'accepted': 47495.0, 'rev12m': 739.8265890743073}

Conservative @ 0.400
  Test1: {'def_rate': 0.04909560723514212, 'accepted': 9675, 'rev12m': 139.78147607681785}
  Test2: {'def_rate': 0.06036799669216456, 'accepted': 9674, 'rev12m': 133.17663442033518}

Aggressive @ 0.500
  Test1: {'def_rate': 0.070049965709807, 'accepted': 10207, 'rev12m': 158.77606182194177}
  Test2: {'def_rate': 0.07843522704934146, 'accepted': 10174, 'rev12m': 152.30767116880452}
Saved -> outputs/strategy_summary.csv
