
# Region-Based LightGBM — Sequential (Non-Parallel) with Tighter Grid & Early Stopping (GPU/Colab)

This notebook trains a **LightGBM** regressor on tabular data where metropolitan regions are **one-hot encoded** (e.g., `Regionname_Northern Metropolitan`).  
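It runs **four held-out experiments sequentially** (north/south/east/west) with **from-scratch nested CV** (10× outer, 3× inner), using a **tighter hyperparameter grid** and **early stopping** to reduce runtime while preserving accuracy. In each experiment the held-out region is excluded from training, and the nested-CV metrics are computed on the remaining three regions only; no model is scored on the held-out region itself.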

**Runtime savers**
- Focused grid: tune only **`num_leaves`**, **`learning_rate`**, **`min_child_samples`**, **`feature_fraction`**.
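- Hold less impactful params constant: `max_depth=-1`, `bagging_fraction=0.8`, `lambda_l1=0.0`, `lambda_l2=0.1` (note: LightGBM only applies `bagging_fraction` when `bagging_freq` is set to a non-zero value).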
- Set `n_estimators` high and rely on **early stopping** (no `n_estimators` grid).
- Keep features as **pandas DataFrames** and use **GPU** if available.


## Setup & Configuration

In [None]:

# !pip -q install lightgbm pandas numpy

import os, sys, math, random, json, time, itertools, datetime
import numpy as np
import pandas as pd

# LightGBM is imported defensively: `_lgb_ok` records whether the import worked
# so that later code (e.g. the GPU probe) can check it instead of crashing.
try:
    import lightgbm as lgb
    _lgb_ok = True
except Exception as e:
    _lgb_ok = False
    print("LightGBM import failed. Please `pip install lightgbm`. Error:", e)

# --- Data location and region-column conventions ---
DATA_PATH = 'dataset.csv'
# Region one-hot columns are recognised by this prefix AND by containing METRO_KEYWORD.
REGION_OHE_PREFIX = "Regionname_"
METRO_KEYWORD = "Metropolitan"
# Substring (case-insensitive) -> normalised region label; first matching key wins.
REGION_NAME_MAP = {'Northern':'north','Southern':'south','Eastern':'east','Western':'west'}
# Column names tried, in order, when locating the regression target.
TARGET_COL_CANDIDATES = ['target','Target','TARGET','price','Price','label','Label','y']

# --- Reproducibility and CV layout ---
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED); random.seed(GLOBAL_SEED)
OUTER_K = 10  # number of outer CV folds
INNER_K = 3   # number of inner CV folds used for hyperparameter selection

# --- Hyperparameters ---
# Every combination of these values is evaluated in the inner CV loop.
HYPERPARAM_GRID = {
    'num_leaves':        [31, 63, 127],
    'learning_rate':     [0.01, 0.05, 0.1],
    'min_child_samples': [20, 40],
    'feature_fraction':  [0.8, 1.0],
}
# Applied on top of every grid candidate (these override same-named grid keys).
# NOTE(review): LightGBM's docs say bagging_fraction only takes effect when
# bagging_freq is non-zero; no bagging_freq is set here -- confirm intent.
FIXED_PARAMS = {
    'max_depth':        -1,
    'bagging_fraction': 0.8,
    'lambda_l1':        0.0,
    'lambda_l2':        0.1,
}
# Upper bound on boosting rounds; early stopping decides the effective count.
N_ESTIMATORS = 2000
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EARLY_STOP = False
# When True, a one-round GPU training probe decides whether GPU is usable.
USE_GPU_IF_AVAILABLE = True

def gpu_available_lightgbm():
    """Report whether LightGBM can train on a GPU in this environment.

    Runs a single boosting round on a tiny random regression problem with
    ``device_type='gpu'``. Any exception raised by that attempt is treated
    as "GPU not usable".

    Returns:
        bool: True if the probe training succeeded, False if LightGBM is
        not importable or the probe raised.
    """
    if _lgb_ok:
        try:
            # Draw order matters only for the global NumPy RNG state.
            probe_X = np.random.rand(64, 5)
            probe_y = np.random.rand(64)
            probe_set = lgb.Dataset(probe_X, label=probe_y)
            probe_params = {'objective':'regression','metric':'rmse','verbose':-1,'device_type':'gpu'}
            lgb.train(probe_params, probe_set, num_boost_round=1)
        except Exception:
            # Deliberately broad: the probe is best-effort capability detection.
            return False
        else:
            return True
    return False

# Probe the GPU once at start-up (skipped entirely when the user opts out).
GPU_OK = gpu_available_lightgbm() if USE_GPU_IF_AVAILABLE else False
print("LightGBM available:", _lgb_ok, "| GPU usable:", GPU_OK)

def grid_to_list(grid):
    """Expand a parameter grid into the list of all concrete combinations.

    Args:
        grid: mapping of parameter name -> iterable of candidate values.

    Returns:
        list[dict]: one dict per element of the Cartesian product, in
        ``itertools.product`` order (last key varies fastest). An empty
        grid yields ``[{}]``.
    """
    names = list(grid)
    value_lists = [grid[name] for name in names]
    return [dict(zip(names, choice)) for choice in itertools.product(*value_lists)]

# Materialise the grid once; the same candidate list is reused for every outer fold.
GRID_LIST = grid_to_list(HYPERPARAM_GRID)
TOTAL_COMBOS = len(GRID_LIST)
print("Total hyperparameter combinations:", TOTAL_COMBOS)


LightGBM available: True | GPU usable: True
Total hyperparameter combinations: 36


## Load Data & Derive Metropolitan Region from One-Hot Columns

In [None]:

# Fall back to an alternate upload location when the CSV is not in the
# working directory.
if not os.path.exists(DATA_PATH):
    alt_path = '/mnt/data/dataset.csv'
    if os.path.exists(alt_path):
        DATA_PATH = alt_path

df_raw = pd.read_csv(DATA_PATH)
print("Loaded shape:", df_raw.shape)

# Target column: the first candidate name present in the data.
target_col = None
for c in TARGET_COL_CANDIDATES:
    if c in df_raw.columns:
        target_col = c
        break
if target_col is None:
    # NOTE(review): this fallback silently picks the LAST numeric column,
    # which may not be the intended target -- prefer adding the real name
    # to TARGET_COL_CANDIDATES.
    numeric_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
    if not numeric_cols:
        raise ValueError("No numeric columns to infer target from. Please set the target column name explicitly.")
    target_col = numeric_cols[-1]
print("Target column:", target_col)

# Metropolitan region indicator columns, e.g. "Regionname_Northern Metropolitan".
ohe_cols = [c for c in df_raw.columns if c.startswith(REGION_OHE_PREFIX) and METRO_KEYWORD in c]
if not ohe_cols:
    raise ValueError(f"No region OHE columns found with prefix '{REGION_OHE_PREFIX}' containing '{METRO_KEYWORD}'.")

# Map each indicator column to a normalised region via case-insensitive
# substring match; the first key of REGION_NAME_MAP that matches wins.
# NOTE(review): "South-Eastern Metropolitan" does not contain "southern" but
# does contain "eastern", so it is merged into 'east' -- confirm this is intended.
col_to_region = {}
for c in ohe_cols:
    for k,v in REGION_NAME_MAP.items():
        if k.lower() in c.lower():
            col_to_region[c] = v
            break

ohe_cols_mapped = [c for c in ohe_cols if c in col_to_region]
if not ohe_cols_mapped:
    raise ValueError("Could not map Metropolitan OHE columns to {north,south,east,west}.")

print("Mapped OHE columns -> region:")
print({c: col_to_region[c] for c in ohe_cols_mapped})

# Coerce indicators to clean integers (unparseable or missing values become 0).
df = df_raw.copy()
for c in ohe_cols_mapped:
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(int)

# Derive one region label per row, vectorised: rows with no active indicator
# get None; otherwise the first column holding the row maximum decides.
ohe_matrix = df[ohe_cols_mapped].to_numpy()
_col_regions = np.array([col_to_region[c] for c in ohe_cols_mapped], dtype=object)
_has_region = ohe_matrix.sum(axis=1) > 0
region_labels = np.where(_has_region, _col_regions[ohe_matrix.argmax(axis=1)], None).tolist()
df['__region_norm__'] = region_labels

# Keep only rows that belong to one of the four metropolitan regions.
before = len(df)
df = df[df['__region_norm__'].isin(['north','south','east','west'])].copy()
after = len(df)
print(f"Kept metro rows (north/south/east/west): {after}/{before}")
print(df['__region_norm__'].value_counts())

# Features exclude the target, the derived label, the mapped region
# indicators, and any identifier-like columns.
drop_cols = set([target_col, '__region_norm__'] + ohe_cols_mapped)
id_like = [c for c in df.columns if c.lower() in {'id','uid','uuid','index'}]
drop_cols.update(id_like)

X_df_all = df.drop(columns=list(drop_cols), errors='ignore')
# One-hot encode remaining non-numeric columns (first level dropped per column).
X_df_all = pd.get_dummies(X_df_all, drop_first=True)
y_all = df[target_col].to_numpy()

feature_cols = X_df_all.columns
regions = ['north','south','east','west']
print("Feature matrix shape:", X_df_all.shape)


Loaded shape: (27244, 109)
Target column: Price
Mapped OHE columns -> region:
{'Regionname_Eastern Metropolitan': 'east', 'Regionname_Northern Metropolitan': 'north', 'Regionname_South-Eastern Metropolitan': 'east', 'Regionname_Southern Metropolitan': 'south', 'Regionname_Western Metropolitan': 'west'}
Kept metro rows (north/south/east/west): 26816/27244
__region_norm__
south    8524
north    7864
west     5815
east     4613
Name: count, dtype: int64
Feature matrix shape: (26816, 4677)


## Utilities (Metrics, K-Fold, Logging, Training with Early Stopping)

In [None]:

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Inputs are converted with ``np.asarray`` (as ``r2`` does), so plain
    lists are accepted and pandas objects are compared positionally
    rather than by index label.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        float: sqrt(mean((y_true - y_pred) ** 2)).
    """
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Inputs are converted with ``np.asarray`` (as ``r2`` does), so plain
    lists are accepted and pandas objects are compared positionally
    rather than by index label.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        float: mean(|y_true - y_pred|).
    """
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    return float(np.mean(np.abs(y_true - y_pred)))

def r2(y_true, y_pred):
    """Return the coefficient of determination (R^2).

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        float: 1 - SS_res / SS_tot, or 0.0 when the targets have zero
        total variation (SS_tot == 0).
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    residual_ss = np.sum((actual - predicted) ** 2)
    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    if total_ss == 0:
        # Constant targets: the ratio is undefined, report 0.0 by convention.
        return 0.0
    return float(1 - residual_ss / total_ss)

def kfold_indices(n_samples, k=5, seed=42, shuffle=True):
    """Build K-fold train/validation index splits.

    Args:
        n_samples: number of rows to split.
        k: number of folds. The first ``n_samples % k`` folds get one
            extra validation row.
        seed: seed for the NumPy ``default_rng`` used when shuffling.
        shuffle: shuffle the row order before cutting folds.

    Returns:
        list[tuple[np.ndarray, np.ndarray]]: ``(train_idx, val_idx)`` per
        fold. ``train_idx`` is sorted ascending; ``val_idx`` keeps the
        (possibly shuffled) order in which rows were assigned to the fold.
    """
    order = np.arange(n_samples)
    if shuffle:
        np.random.default_rng(seed).shuffle(order)

    base_size, n_larger = divmod(n_samples, k)
    splits = []
    lo = 0
    for fold in range(k):
        hi = lo + base_size + (1 if fold < n_larger else 0)
        held_out = order[lo:hi]
        # `order` is a permutation, so everything outside [lo, hi) is exactly
        # the complement of the fold; sorting matches a set-difference result.
        kept = np.sort(np.concatenate((order[:lo], order[hi:])))
        splits.append((kept, held_out))
        lo = hi
    return splits

def make_logger():
    """Create an in-memory logger and return its ``(log, dump)`` functions.

    Returns:
        tuple:
            log(msg, print_also=True): timestamp ``msg``, append it to the
                internal buffer, and optionally print it.
            dump(path): write every buffered line to ``path``, joined by
                newlines, overwriting any existing file.
    """
    buf = []
    def log(msg, print_also=True):
        ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        line = f"[{ts}] {msg}"
        buf.append(line)
        if print_also: print(line)
    def dump(path):
        # Explicit UTF-8: logged messages can contain non-ASCII characters
        # (e.g. "±"), which the platform default encoding may not encode.
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(buf))
    return log, dump

def build_regressor(params, seed=GLOBAL_SEED):
    """Create an unfitted ``LGBMRegressor`` for one hyperparameter candidate.

    Args:
        params: dict of tuned hyperparameters (one grid candidate).
        seed: value passed as ``random_state``.

    Returns:
        lgb.LGBMRegressor: configured with ``params`` overlaid by
        ``FIXED_PARAMS``, RMSE metric, ``N_ESTIMATORS`` rounds, and
        ``device_type='gpu'`` when the start-up GPU probe succeeded.
    """
    full = dict(params)
    full.update(FIXED_PARAMS)  # fixed values win over same-named grid keys
    if GPU_OK:
        # The device must be passed as a constructor kwarg: assigning it as a
        # plain attribute after construction is not reported by get_params()
        # and therefore never reaches training. An explicit device choice in
        # `params`/FIXED_PARAMS is respected.
        full.setdefault('device_type', 'gpu')
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero; FIXED_PARAMS sets no bagging_freq -- confirm.
    return lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        random_state=seed,
        n_estimators=N_ESTIMATORS,
        **full
    )

def fit_predict_with_es(reg, Xtr_df, ytr, Xva_df, yva):
    """Fit ``reg`` with early stopping and predict the validation rows.

    The validation set is used both as the early-stopping monitor and as
    the prediction input.

    Args:
        reg: unfitted regressor exposing ``fit``/``predict``.
        Xtr_df, ytr: training features and targets.
        Xva_df, yva: validation features and targets.

    Returns:
        Predictions for ``Xva_df``, taken at ``best_iteration_`` when the
        fitted model reports a positive one, otherwise with the default
        iteration count.
    """
    stopper = lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=VERBOSE_EARLY_STOP)
    reg.fit(Xtr_df, ytr, eval_set=[(Xva_df, yva)], callbacks=[stopper])

    best_iter = getattr(reg, 'best_iteration_', None)
    predict_kwargs = {}
    if best_iter is not None and best_iter > 0:
        predict_kwargs['num_iteration'] = best_iter
    return reg.predict(Xva_df, **predict_kwargs)


## Run All Four Experiments

In [None]:

# Driver: for each region, exclude it and run nested CV (OUTER_K outer folds,
# INNER_K inner folds over GRID_LIST) on the remaining three regions.
# Writes summary_<region>.txt and log_<region>.txt per experiment, plus one
# combined experiment_summary_all_regions.txt at the end.
# NOTE(review): the held-out region's rows are never scored in this cell; all
# reported metrics come from outer-fold validation on the training regions.
all_experiment_summaries = []

for held_out in regions:
    # Fresh logger per experiment so each region gets its own log file.
    log, dump = make_logger()
    log("="*90)
    log(f"Starting experiment | HELD-OUT REGION: {held_out.upper()}")
    log("="*90)
    train_regions = sorted(list(set(regions) - {held_out}))
    log(f"Training regions: {train_regions}")

    # Boolean row mask selecting the three training regions
    mask_train = df['__region_norm__'].isin(train_regions).values

    # Keep features as DataFrames; targets are a NumPy array aligned by position
    X_train_df = X_df_all.loc[mask_train, feature_cols].copy()
    y_train    = y_all[mask_train]

    log(f"Train shape: {X_train_df.shape}")
    if X_train_df.shape[0] < OUTER_K + INNER_K:
        log("Warning: Small sample sizes may distort CV results.")

    outer_splits = kfold_indices(X_train_df.shape[0], k=OUTER_K, seed=GLOBAL_SEED, shuffle=True)
    # per_fold_best is filled below but not read again within this cell.
    outer_results, per_fold_best = [], []

    for fold_idx, (tr_idx, va_idx) in enumerate(outer_splits, start=1):
        log("-"*70)
        log(f"Outer fold {fold_idx}/{OUTER_K}")

        # Positional indexing: kfold_indices returns positions, not labels.
        X_tr_df = X_train_df.iloc[tr_idx]
        y_tr    = y_train[tr_idx]
        X_va_df = X_train_df.iloc[va_idx]
        y_va    = y_train[va_idx]

        # Inner splits are seeded per outer fold and shared by all candidates,
        # so every candidate is compared on identical inner folds.
        inner_splits = kfold_indices(X_tr_df.shape[0], k=INNER_K, seed=GLOBAL_SEED + fold_idx, shuffle=True)

        best_inner_params, best_inner_score = None, float('inf')

        # ---- Inner CV: evaluate each combo with early stopping ----
        for combo_idx, cand in enumerate(GRID_LIST, start=1):
            rmses, maes, r2s = [], [], []
            for in_tr_idx, in_va_idx in inner_splits:
                X_in_tr_df = X_tr_df.iloc[in_tr_idx]
                y_in_tr    = y_tr[in_tr_idx]
                X_in_va_df = X_tr_df.iloc[in_va_idx]
                y_in_va    = y_tr[in_va_idx]

                reg = build_regressor(cand, seed=GLOBAL_SEED+fold_idx)
                pred = fit_predict_with_es(reg, X_in_tr_df, y_in_tr, X_in_va_df, y_in_va)
                rm, ma, r = rmse(y_in_va, pred), mae(y_in_va, pred), r2(y_in_va, pred)
                rmses.append(rm); maes.append(ma); r2s.append(r)

            mean_rmse = float(np.mean(rmses))
            mean_mae  = float(np.mean(maes))
            mean_r2   = float(np.mean(r2s))

            log(f"[Outer {fold_idx}] Combo {combo_idx}/{TOTAL_COMBOS} params={cand} | "
                f"Inner means -> RMSE={mean_rmse:.4f}, MAE={mean_mae:.4f}, R^2={mean_r2:.4f}")

            # Selection criterion: lowest mean inner RMSE (strict <, so the
            # earliest candidate wins ties).
            if mean_rmse < best_inner_score:
                best_inner_score = mean_rmse
                best_inner_params = cand

        log(f"[Outer {fold_idx}] Best inner params: {best_inner_params} (mean RMSE={best_inner_score:.4f})")

        # ---- Outer validation with chosen params + early stopping ----
        # NOTE(review): fit_predict_with_es uses (X_va_df, y_va) as the
        # early-stopping eval set AND as the scored set, so the stopping
        # iteration is tuned on the data being reported; the outer metrics
        # are therefore optimistic to some degree.
        reg = build_regressor(best_inner_params, seed=GLOBAL_SEED+fold_idx)
        pred = fit_predict_with_es(reg, X_tr_df, y_tr, X_va_df, y_va)
        rm, m, r = rmse(y_va, pred), mae(y_va, pred), r2(y_va, pred)

        outer_results.append({'fold': fold_idx, 'rmse': rm, 'mae': m, 'r2': r, 'best_params': best_inner_params})
        per_fold_best.append(best_inner_params)

        log(f"[Outer {fold_idx}] Validation -> RMSE={rm:.4f}, MAE={m:.4f}, R^2={r:.4f}")

    # ---- Aggregate outer results ----
    outer_rmse = [d['rmse'] for d in outer_results]
    outer_mae  = [d['mae']  for d in outer_results]
    outer_r2   = [d['r2']   for d in outer_results]

    # np.std defaults to the population standard deviation (ddof=0).
    mean_rmse, std_rmse = float(np.mean(outer_rmse)), float(np.std(outer_rmse))
    mean_mae,  std_mae  = float(np.mean(outer_mae)),  float(np.std(outer_mae))
    mean_r2,   std_r2   = float(np.mean(outer_r2)),   float(np.std(outer_r2))

    log(f"Outer summary -> RMSE mean={mean_rmse:.4f}±{std_rmse:.4f}, "
        f"MAE mean={mean_mae:.4f}±{std_mae:.4f}, R^2 mean={mean_r2:.4f}±{std_r2:.4f}")

    # ---- Save per-region summary (PER-FOLD lines + aggregates) ----
    lines = []
    lines.append(f"=== Experiment: Held-out region = {held_out} ===\n")
    lines.append("Per-outer-fold metrics (validation on training regions):")
    for d in outer_results:
        lines.append(f"Fold {d['fold']:>2}: RMSE={d['rmse']:.6f}, MAE={d['mae']:.6f}, R2={d['r2']:.6f}, best_params={json.dumps(d['best_params'])}")
    lines.append("\nOuter-CV aggregates:")
    lines.append(f"RMSE mean={mean_rmse:.6f}, std={std_rmse:.6f}")
    lines.append(f"MAE  mean={mean_mae:.6f}, std={std_mae:.6f}")
    lines.append(f"R2   mean={mean_r2:.6f}, std={std_r2:.6f}")

    sum_name = f"summary_{held_out}.txt"
    with open(sum_name, "w") as f:
        f.write("\n".join(lines))

    # Save per-region log (everything passed to log() during this experiment)
    log_name = f"log_{held_out}.txt"
    dump(log_name)
    print(f"Saved per-region summary to {sum_name} and log to {log_name}")

    # Keep the structured results for the combined report written after the loop.
    all_experiment_summaries.append({
        'held_out': held_out,
        'outer_results': outer_results,
        'aggregates': {'rmse_mean': mean_rmse, 'rmse_std': std_rmse,
                       'mae_mean': mean_mae, 'mae_std': std_mae,
                       'r2_mean': mean_r2, 'r2_std': std_r2},
    })

# ---- Master summary: include PER-FOLD lines for each region ----
master_lines = ["=== Master Summary: All Four Held-Out Metropolitan Regions (Sequential, No Final Fit) ===\n"]
for exp in all_experiment_summaries:
    ho = exp['held_out']
    master_lines.append(f"--- Held-out: {ho} ---")
    master_lines.append("Per-outer-fold metrics:")
    for d in exp['outer_results']:
        master_lines.append(f"  Fold {d['fold']:>2}: RMSE={d['rmse']:.6f}, MAE={d['mae']:.6f}, R2={d['r2']:.6f}")
    ag = exp['aggregates']
    master_lines.append("Aggregates:")
    master_lines.append(f"  RMSE mean={ag['rmse_mean']:.6f}, std={ag['rmse_std']:.6f}")
    master_lines.append(f"  MAE  mean={ag['mae_mean']:.6f}, std={ag['mae_std']:.6f}")
    master_lines.append(f"  R2   mean={ag['r2_mean']:.6f}, std={ag['r2_std']:.6f}")
    master_lines.append("")

with open("experiment_summary_all_regions.txt", "w") as f:
    f.write("\n".join(master_lines))

print("\nWrote combined master summary (with per-fold metrics) to experiment_summary_all_regions.txt")


[2025-10-14 07:32:17] Starting experiment | HELD-OUT REGION: NORTH
[2025-10-14 07:32:17] Training regions: ['east', 'south', 'west']
[2025-10-14 07:32:18] Train shape: (18952, 4677)
[2025-10-14 07:32:18] ----------------------------------------------------------------------
[2025-10-14 07:32:18] Outer fold 1/10
[2025-10-14 07:32:30] [Outer 1] Combo 1/36 params={'num_leaves': 31, 'learning_rate': 0.01, 'min_child_samples': 20, 'feature_fraction': 0.8} | Inner means -> RMSE=299347.9864, MAE=174545.0178, R^2=0.8137
[2025-10-14 07:32:41] [Outer 1] Combo 2/36 params={'num_leaves': 31, 'learning_rate': 0.01, 'min_child_samples': 20, 'feature_fraction': 1.0} | Inner means -> RMSE=302165.7074, MAE=176812.2815, R^2=0.8102
[2025-10-14 07:32:52] [Outer 1] Combo 3/36 params={'num_leaves': 31, 'learning_rate': 0.01, 'min_child_samples': 40, 'feature_fraction': 0.8} | Inner means -> RMSE=300064.4491, MAE=174725.4365, R^2=0.8128
[2025-10-14 07:33:03] [Outer 1] Combo 4/36 params={'num_leaves': 31, 'le