In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [4]:
original_df = pd.read_csv("/teamspace/studios/this_studio/Binary-Classification-with-a-Bank-Dataset/data/bank-full.csv", sep = ";")
df = pd.read_csv("/teamspace/studios/this_studio/Binary-Classification-with-a-Bank-Dataset/data/train.csv")
df = df.drop(["id"], axis =1)
# Convert 'y' column in original_df from 'yes'/'no' to 1/0
original_df['y'] = original_df['y'].map({'yes': 1, 'no': 0})

# Concatenate original_df and df (ignore index to avoid duplicate indices)
data_train = pd.concat([original_df, df], ignore_index=True)

data_test = pd.read_csv("/teamspace/studios/this_studio/Binary-Classification-with-a-Bank-Dataset/data/test.csv")

In [5]:
# ===== Step 2: Feature Engineering + Fold-safe Encodings (LightGBM) =====
# Expects data_train and data_test in memory with target column 'y'
# Outputs a new submission file with probabilities (you can threshold later if desired)

# --------------------
# Config
# --------------------
TARGET_COL = "y"
ID_COL_CANDIDATES = ["id", "ID", "Id"]
N_SPLITS = 5
RANDOM_STATE = 42
WITH_DURATION = True            # Toggle to test with/without duration-related features
TE_COLS = ["job", "education", "contact", "month", "poutcome", "marital"]  # target-encode
FE_COLS = ["job", "education", "contact", "month", "poutcome", "marital"]  # frequency-encode
TE_SMOOTH_M = 50.0             # smoothing strength for target encoding

# --------------------
# Utilities
# --------------------
def month_to_num(s):
    m = str(s).strip().lower()
    order = dict(jan=1,feb=2,mar=3,apr=4,may=5,jun=6,jul=7,aug=8,sep=9,oct=10,nov=11,dec=12)
    return order.get(m, np.nan)

def add_domain_features(df: pd.DataFrame, with_duration: bool = True) -> pd.DataFrame:
    out = df.copy()

    # Contact history
    pdays = out["pdays"].copy()
    contacted_before = (pdays != 999).astype(int)
    out["contacted_before"] = contacted_before

    # Recency features (mask 999 as NaN)
    pdays_masked = pdays.replace(999, np.nan)
    out["days_since_last_contact"] = pdays_masked.fillna(999)  # keep 999 as "never"
    out["pdays_log1p"] = np.log1p(pdays_masked).fillna(0)

    # Contact intensity (safe divide)
    out["contact_intensity"] = (out["campaign"] / (pdays_masked + 1)).fillna(0)

    # Poutcome flags
    if "poutcome" in out.columns:
        pout = out["poutcome"].astype(str).str.lower()
        out["prev_success"] = (pout == "success").astype(int)
        out["prev_failure"] = (pout == "failure").astype(int)
        out["prev_unknown"] = (pout == "unknown").astype(int)

    # Month cyclical encoding
    if "month" in out.columns:
        mnum = out["month"].map(month_to_num)
        out["month_num"] = mnum
        out["month_sin"] = np.sin(2 * np.pi * mnum / 12.0)
        out["month_cos"] = np.cos(2 * np.pi * mnum / 12.0)

    # Day cyclical encoding (optional; keep both raw and cyc)
    if "day" in out.columns:
        day = out["day"].clip(1, 31)
        out["day_sin"] = np.sin(2 * np.pi * day / 31.0)
        out["day_cos"] = np.cos(2 * np.pi * day / 31.0)

    # Season flags (coarse)
    if "month_num" in out.columns:
        out["is_summer"] = out["month_num"].isin([6,7,8]).astype(int)
        out["is_q4"] = out["month_num"].isin([10,11,12]).astype(int)

    # Campaign bins
    out["campaign_bins"] = pd.cut(out["campaign"],
                                  bins=[-np.inf, 1, 3, 6, np.inf],
                                  labels=["1", "2-3", "4-6", "7+"],
                                  ordered=True)

    # Duration transforms
    if with_duration and "duration" in out.columns:
        out["duration_log1p"] = np.log1p(out["duration"].clip(lower=0))
    else:
        # if disabling, drop duration-derived features later
        pass

    # Balance transforms
    # shift for log1p if negatives exist
    shift = max(0, 1 - out["balance"].min())
    out["balance_log1p"] = np.log1p(out["balance"] + shift)
    out["has_positive_balance"] = (out["balance"] > 0).astype(int)

    # Interactions
    if {"housing","loan"}.issubset(out.columns):
        out["housing_loan_combo"] = (out["housing"].astype(str).str.lower() + "_" +
                                     out["loan"].astype(str).str.lower())

    if {"job","education"}.issubset(out.columns):
        out["job_x_education"] = (out["job"].astype(str).str.lower() + "__" +
                                  out["education"].astype(str).str.lower())

    # Interaction with contacted_before
    out["recency_x_campaign"] = contacted_before * out["campaign"]
    if with_duration and "duration_log1p" in out.columns:
        out["recent_and_long"] = contacted_before * out["duration_log1p"]

    return out

def _make_te_map(X_tr_col: pd.Series, y_tr: pd.Series, m: float, prior: float):
    # returns dict: category -> smoothed mean
    stats = X_tr_col.to_frame("cat").assign(y=y_tr.values).groupby("cat")["y"].agg(["sum","count"])
    te = (stats["sum"] + prior * m) / (stats["count"] + m)
    return te.to_dict()

def _apply_map(series: pd.Series, mapping: dict, default_val: float):
    return series.map(mapping).fillna(default_val).astype(float)

def add_fold_encodings(X_tr, y_tr, X_va, X_te,
                       te_cols, fe_cols,
                       te_smooth_m=50.0, strict_freq=True):
    """
    Returns encoded copies with:
      - target encoding: <col>_te
      - frequency encoding: <col>_freq
    TE and FE are fitted on X_tr only, then applied to X_va/X_te (no leakage).
    """
    X_tr_e = X_tr.copy()
    X_va_e = X_va.copy()
    X_te_e = X_te.copy()

    # Target Encoding
    prior = y_tr.mean()
    for col in te_cols:
        if col not in X_tr_e.columns:
            continue
        te_map = _make_te_map(X_tr_e[col].astype(str), y_tr, te_smooth_m, prior)
        default_val = prior
        X_tr_e[f"{col}_te"] = _apply_map(X_tr_e[col].astype(str), te_map, default_val)
        X_va_e[f"{col}_te"] = _apply_map(X_va_e[col].astype(str), te_map, default_val)
        X_te_e[f"{col}_te"] = _apply_map(X_te_e[col].astype(str), te_map, default_val)

    # Frequency Encoding
    for col in fe_cols:
        if col not in X_tr_e.columns:
            continue
        tr_counts = X_tr_e[col].astype(str).value_counts(dropna=False)
        tr_freq = (tr_counts / tr_counts.sum()).to_dict()
        default_freq = 0.0 if strict_freq else (1.0 / max(1, len(tr_counts)))
        X_tr_e[f"{col}_freq"] = X_tr_e[col].astype(str).map(tr_freq).fillna(default_freq).astype(float)
        X_va_e[f"{col}_freq"] = X_va_e[col].astype(str).map(tr_freq).fillna(default_freq).astype(float)
        X_te_e[f"{col}_freq"] = X_te_e[col].astype(str).map(tr_freq).fillna(default_freq).astype(float)

    return X_tr_e, X_va_e, X_te_e

# --------------------
# Data prep
# --------------------
train = data_train.copy()
test  = data_test.copy()

# Map y from "yes"/"no" to 1/0 if needed
if train[TARGET_COL].dtype == "O":
    train[TARGET_COL] = train[TARGET_COL].str.strip().str.lower().map({"yes": 1, "no": 0})

# Identify ID column (fallback to index)
id_col = next((c for c in ID_COL_CANDIDATES if c in test.columns), None)
if id_col is None:
    id_col = "id"
    test[id_col] = np.arange(len(test))

# Base engineered features (no target leakage)
train_f = add_domain_features(train, WITH_DURATION)
test_f  = add_domain_features(test,  WITH_DURATION)

# Optionally drop duration and its derivatives when WITH_DURATION=False
if not WITH_DURATION:
    drop_no_duration = [c for c in train_f.columns if c.startswith("duration")]
    drop_no_duration += ["recent_and_long"]
    train_f = train_f.drop(columns=[c for c in drop_no_duration if c in train_f.columns])
    test_f  = test_f.drop(columns=[c for c in drop_no_duration if c in test_f.columns])

# Cast string categoricals to category dtype so LightGBM can natively handle them
cat_cols = []
for c in train_f.columns:
    if c == TARGET_COL or c == id_col:
        continue
    if train_f[c].dtype == "O" or str(train_f[c].dtype).startswith("category"):
        # mark as categorical if it’s textual or already category (avoid explicitly numeric engineered cols)
        cat_cols.append(c)

# Ensure same categories across train/test for categorical columns
for c in cat_cols:
    train_f[c] = train_f[c].astype("category")
    test_f[c]  = test_f[c].astype("category")
    all_cats = sorted(list(set(train_f[c].cat.categories.tolist()) | set(test_f[c].cat.categories.tolist())))
    train_f[c] = train_f[c].cat.set_categories(all_cats)
    test_f[c]  = test_f[c].cat.set_categories(all_cats)

# Define features (excluding target & id)
features = [c for c in train_f.columns if c not in {TARGET_COL, id_col}]

X_full = train_f[features]
y_full = train_f[TARGET_COL].astype(int)
X_test_full = test_f[features]


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:

# --------------------
# CV + Training with fold-safe encodings
# --------------------
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 50,
    "max_depth": -1,
    "verbose": -1,
    "is_unbalance": True,
    "seed": RANDOM_STATE
}

oof = np.zeros(len(X_full))
test_pred = np.zeros(len(X_test_full))
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_full, y_full), 1):
    X_tr, y_tr = X_full.iloc[tr_idx].copy(), y_full.iloc[tr_idx].copy()
    X_va, y_va = X_full.iloc[va_idx].copy(), y_full.iloc[va_idx].copy()
    X_te = X_test_full.copy()

    # Add fold-safe target & frequency encodings
    X_tr_e, X_va_e, X_te_e = add_fold_encodings(
        X_tr, y_tr, X_va, X_te,
        te_cols=TE_COLS + ["housing_loan_combo", "job_x_education", "campaign_bins"],
        fe_cols=FE_COLS + ["housing_loan_combo", "job_x_education", "campaign_bins"],
        te_smooth_m=TE_SMOOTH_M,
        strict_freq=True
    )

    # Update feature list for this fold (original + new encodings)
    fold_features = list(X_tr_e.columns)

    # Categorical columns for LGBM (only those that are truly categories in this fold)
    fold_cat_cols = [c for c in fold_features if (c in cat_cols and c in X_tr_e.columns)]

    dtr = lgb.Dataset(X_tr_e[fold_features], label=y_tr, categorical_feature=fold_cat_cols, free_raw_data=False)
    dva = lgb.Dataset(X_va_e[fold_features], label=y_va, categorical_feature=fold_cat_cols, reference=dtr, free_raw_data=False)

    model = lgb.train(
        params,
        dtr,
        num_boost_round=5000,
        valid_sets=[dtr, dva],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(200),
            lgb.log_evaluation(200)
        ]
    )

    oof[va_idx] = model.predict(X_va_e[fold_features], num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_va, oof[va_idx])
    fold_aucs.append(fold_auc)
    print(f"[Fold {fold}] AUC: {fold_auc:.5f}")

    test_pred += model.predict(X_te_e[fold_features], num_iteration=model.best_iteration) / N_SPLITS

cv_auc = roc_auc_score(y_full, oof)
print(f"\nOOF AUC: {cv_auc:.5f} | Folds: {', '.join(f'{a:.5f}' for a in fold_aucs)}")


Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.965274	valid's auc: 0.963336
[400]	train's auc: 0.969284	valid's auc: 0.965564


KeyboardInterrupt: 

In [None]:

# --------------------
# Submission
# --------------------
sub = pd.DataFrame({
    id_col: test[id_col].values,
    TARGET_COL: test_pred  # probabilities; threshold later to get 0/1 if needed
})
sub_path = "submission_step2_feat_eng.csv"
sub.to_csv(sub_path, index=False)
print(f"Saved: {sub_path}")

# If you want to save a 0/1 version immediately as well, uncomment:
# sub_bin = sub.copy()
# sub_bin[TARGET_COL] = (sub_bin[TARGET_COL] >= 0.5).astype(int)
# sub_bin_path = "submission_step2_feat_eng_binary.csv"
# sub_bin.to_csv(sub_bin_path, index=False)
# print(f"Saved: {sub_bin_path}")


In [5]:


# --- Config ---
TARGET_COL = "y"
ID_COL_CANDIDATES = ["id", "ID", "Id"]
N_SPLITS = 5
RANDOM_STATE = 42

# --- Safety checks & light cleaning ---
train = data_train.copy()
test  = data_test.copy()

# Map y from "yes"/"no" to 1/0 if needed
if train[TARGET_COL].dtype == "O":
    train[TARGET_COL] = train[TARGET_COL].str.strip().str.lower().map({"yes": 1, "no": 0})

# Identify ID column (fallback to index)
id_col = None
for c in ID_COL_CANDIDATES:
    if c in test.columns:
        id_col = c
        break
if id_col is None:
    id_col = "id"
    test[id_col] = np.arange(len(test))

# Features: drop target + id if present
drop_cols = {TARGET_COL, id_col} & set(train.columns)
features = [c for c in train.columns if c not in drop_cols]

# Find categoricals and cast to category (LightGBM-native handling)
cat_cols = [c for c in features if train[c].dtype == "O"]
for c in cat_cols:
    train[c] = train[c].astype("category")
    if c in test.columns:
        test[c] = test[c].astype("category")

# Align categorical levels across train/test
for c in cat_cols:
    all_cats = sorted(list(set(train[c].cat.categories.tolist()) | set(test[c].cat.categories.tolist())))
    train[c] = train[c].cat.set_categories(all_cats)
    test[c]  = test[c].cat.set_categories(all_cats)

X = train[features]
y = train[TARGET_COL].astype(int)
X_test = test[features]

# --- CV & Training ---
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 50,
    "max_depth": -1,
    "verbose": -1,
    "is_unbalance": True,
    "seed": RANDOM_STATE
}

oof = np.zeros(len(X))
test_pred = np.zeros(len(X_test))
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, reference=dtr, free_raw_data=False)

    model = lgb.train(
        params,
        dtr,
        num_boost_round=5000,
        valid_sets=[dtr, dva],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(200),
            lgb.log_evaluation(200)
        ]
    )

    oof[va_idx] = model.predict(X_va, num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_va, oof[va_idx])
    fold_aucs.append(fold_auc)
    print(f"[Fold {fold}] AUC: {fold_auc:.5f}")

    test_pred += model.predict(X_test, num_iteration=model.best_iteration) / N_SPLITS

cv_auc = roc_auc_score(y, oof)
print(f"\nOOF AUC: {cv_auc:.5f} | Folds: {', '.join(f'{a:.5f}' for a in fold_aucs)}")

# --- Submission file ---
sub = pd.DataFrame({
    id_col: test[id_col].values,
    TARGET_COL: test_pred
})
sub_path = "submission_lgbm_baseline.csv"
sub.to_csv(sub_path, index=False)
print(f"Saved: {sub_path}")


Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.964724	valid's auc: 0.963383
[400]	train's auc: 0.968476	valid's auc: 0.965588
[600]	train's auc: 0.970503	valid's auc: 0.966252
[800]	train's auc: 0.972134	valid's auc: 0.966607
[1000]	train's auc: 0.973609	valid's auc: 0.966909
[1200]	train's auc: 0.974898	valid's auc: 0.967112
[1400]	train's auc: 0.976102	valid's auc: 0.967247
[1600]	train's auc: 0.977181	valid's auc: 0.967346
[1800]	train's auc: 0.978197	valid's auc: 0.967437
[2000]	train's auc: 0.979152	valid's auc: 0.967473
[2200]	train's auc: 0.980019	valid's auc: 0.967505
[2400]	train's auc: 0.980885	valid's auc: 0.967555
[2600]	train's auc: 0.981672	valid's auc: 0.967544
Early stopping, best iteration is:
[2427]	train's auc: 0.981001	valid's auc: 0.967561
[Fold 1] AUC: 0.96756
Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.964613	valid's auc: 0.96362
[400]	train's auc: 0.968346	valid's auc: 0.965789
[600]	trai

In [None]:
import pandas as pd

# Path to your existing predictions file
sub_path = "submission_lgbm_baseline.csv"

# Read the file
sub = pd.read_csv(sub_path)

# Convert predictions to 0/1
sub["y"] = (sub["y"] >= 0.50).astype(int)

# Save updated predictions
sub.to_csv("submission_lgbm_baseline_binary_50.csv", index=False)
print(f"Updated predictions in {sub_path} to binary 0/1")


Updated predictions in submission_lgbm_baseline.csv to binary 0/1
