In [20]:
# =========================
# =========================
# COMP3010 BLEVE — FULL NOTEBOOK PIPELINE (Dev Rewrite)
# Key decisions:
# - Train on log1p(y) (stability + better relative error behavior)
# - Predict with expm1
# - Leakage-safe GroupKFold using scenario-level grouping (exclude sensor-specific fields)
# - Robust encoding + imputation
# - Stable MAPE implementation + group sanity checks
# =========================


# -------------------------
# CELL 0 — CONFIG
# -------------------------
import os
import numpy as np
import pandas as pd

SEED = 42
N_SPLITS = 5

DATA_DIR = "."  # change if needed
TRAIN_PATH  = os.path.join(DATA_DIR, "train.csv")
TEST_PATH   = os.path.join(DATA_DIR, "test.csv")
SAMPLE_PATH = os.path.join(DATA_DIR, "sample_prediction.csv")

OUT_PATH = "prediction.csv"

np.random.seed(SEED)

In [21]:
# =========================
# CELL 2 — LOAD DATA (LOCAL: ../data folder)
# =========================

DATA_DIR = "../data"

print("Current working dir:", os.getcwd())
print("Files in DATA_DIR:", os.listdir(DATA_DIR))

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample = pd.read_csv(f"{DATA_DIR}/sample_prediction.csv")

print("Loaded shapes:")
print("train :", train.shape)
print("test  :", test.shape)
print("sample:", sample.shape)

Current working dir: c:\Users\samzt\OneDrive\Documents\COMP3010_BLEVE_Project\notebook
Files in DATA_DIR: ['sample_prediction.csv', 'test.csv', 'train.csv']
Loaded shapes:
train : (10050, 25)
test  : (3203, 24)
sample: (3203, 2)


In [22]:
# -------------------------
# CELL 2 — CLEAN + NORMALIZE COLUMN NAMES
# -------------------------
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

def drop_index_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ["Unnamed: 0", "index"]:
        if c in df.columns and df[c].nunique() == len(df):
            df = df.drop(columns=[c])
    return df

def normalize_cats(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ["Status", "Sensor Position Side"]:
        if c in df.columns:
            s = df[c].astype("string").str.strip().str.lower()
            s = s.replace({"nan": pd.NA, "none": pd.NA, "": pd.NA})
            df[c] = s
    return df

train = normalize_cols(train)
test  = normalize_cols(test)

train = drop_index_cols(train)
test  = drop_index_cols(test)

train = normalize_cats(train)
test  = normalize_cats(test)

train = train.drop_duplicates()
print("After clean:", train.shape, test.shape)

After clean: (10000, 25) (3203, 23)


In [23]:
# -------------------------
# CELL 3 — FEATURE ENGINEERING (safe + consistent)
# Also renames columns to avoid whitespace warnings in LightGBM
# -------------------------
def safe_div(a, b):
    b = np.where(b == 0, np.nan, b)
    return a / b

def fe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # rename spaces -> underscores once for the whole pipeline
    df.columns = df.columns.str.replace(r"\s+", "_", regex=True)

    def has(cols): return all(c in df.columns for c in cols)

    # tank volume + shape
    if has(["Tank_Width_(m)", "Tank_Length_(m)", "Tank_Height_(m)"]):
        W = df["Tank_Width_(m)"].astype(float)
        L = df["Tank_Length_(m)"].astype(float)
        H = df["Tank_Height_(m)"].astype(float)
        df["tank_vol"] = W * L * H
        df["tank_w_over_l"] = safe_div(W, L)

    # vapour fraction
    if has(["Vapour_Height_(m)", "Tank_Height_(m)"]):
        df["vapour_height_frac"] = safe_div(df["Vapour_Height_(m)"].astype(float),
                                            df["Tank_Height_(m)"].astype(float))

    # obstacle area
    if has(["Obstacle_Width_(m)", "Obstacle_Height_(m)"]):
        df["obs_area"] = df["Obstacle_Width_(m)"].astype(float) * df["Obstacle_Height_(m)"].astype(float)

    # sensor radius
    if has(["Sensor_Position_x_(m)", "Sensor_Position_y_(m)", "Sensor_Position_z_(m)"]):
        x = df["Sensor_Position_x_(m)"].astype(float)
        y = df["Sensor_Position_y_(m)"].astype(float)
        z = df["Sensor_Position_z_(m)"].astype(float)
        df["sensor_r"] = np.sqrt(x*x + y*y + z*z)

    # sensor_r / obstacle distance
    if has(["sensor_r", "Obstacle_Distance_to_BLEVE_(m)"]):
        df["sensor_r_over_obsdist"] = safe_div(df["sensor_r"].astype(float),
                                               df["Obstacle_Distance_to_BLEVE_(m)"].astype(float))

    # hard safety
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    return df

train_fe = fe(train)
test_fe  = fe(test)

# target auto-detect (since we renamed spaces->underscores)
if "Target_Pressure_(bar)" in train_fe.columns:
    TARGET = "Target_Pressure_(bar)"
elif "Target Pressure (bar)" in train_fe.columns:
    TARGET = "Target Pressure (bar)"
else:
    raise KeyError("Target column not found after FE.")

print("Using TARGET:", TARGET)
print("FE shapes:", train_fe.shape, test_fe.shape)

Using TARGET: Target_Pressure_(bar)
FE shapes: (10000, 29) (3203, 27)


In [24]:
# -------------------------
# CELL 4 — SCENARIO GROUPS (leakage-safe)
# Group key must use ONLY scenario/config variables (exclude sensor fields + target)
# -------------------------
SENSOR_FIELDS = [
    "Sensor_ID",
    "Sensor_Position_Side",
    "Sensor_Position_x_(m)",
    "Sensor_Position_y_(m)",
    "Sensor_Position_z_(m)",
    "sensor_r",
    "sensor_r_over_obsdist",
]

exclude_for_group = set(SENSOR_FIELDS + [TARGET])

scenario_cols = [c for c in train_fe.columns if c not in exclude_for_group]

gdf = train_fe[scenario_cols].copy()

# stabilize numeric representation
for c in gdf.columns:
    if pd.api.types.is_numeric_dtype(gdf[c]):
        gdf[c] = gdf[c].round(6)
    gdf[c] = gdf[c].astype(str)

groups = pd.util.hash_pandas_object(gdf, index=False).astype("int64").to_numpy()

u, cnt = np.unique(groups, return_counts=True)
print("unique groups:", len(u), "rows:", len(groups))
print("group size min/median/max:", int(cnt.min()), float(np.median(cnt)), int(cnt.max()))

unique groups: 10000 rows: 10000
group size min/median/max: 1 1.0 1


In [25]:
# -------------------------
# CELL 5 — BUILD X/y (align + encode + constants)
# Train on log1p(y)
# -------------------------
y = train_fe[TARGET].astype(float).to_numpy()
y = np.clip(y, 1e-6, None)
y_log = np.log1p(y)

X_train = train_fe.drop(columns=[TARGET]).copy()
X_test  = test_fe.copy()

# align columns
common = X_train.columns.intersection(X_test.columns)
X_train = X_train[common].copy()
X_test  = X_test[common].copy()

# encode any object/string cols
obj_cols = X_train.select_dtypes(include=["object", "string"]).columns.tolist()
for c in obj_cols:
    X_train[c] = X_train[c].fillna("NA").astype(str).str.strip().str.lower()
    X_test[c]  = X_test[c].fillna("NA").astype(str).str.strip().str.lower()

    cats = pd.Index(pd.concat([X_train[c], X_test[c]], axis=0).unique())
    mapper = {k: i for i, k in enumerate(cats)}
    X_train[c] = X_train[c].map(mapper).astype("int32")
    X_test[c]  = X_test[c].map(mapper).astype("int32")

# final safety
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test  = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# drop constant columns
const_cols = [c for c in X_train.columns if X_train[c].nunique(dropna=False) <= 1]
if const_cols:
    X_train = X_train.drop(columns=const_cols)
    X_test  = X_test.drop(columns=const_cols)

print("X shapes:", X_train.shape, X_test.shape)
print("encoded obj cols:", len(obj_cols), "dropped constants:", len(const_cols))

X shapes: (10000, 27) (3203, 27)
encoded obj cols: 2 dropped constants: 0


In [26]:
# -------------------------
# CELL 6 — CV (GroupKFold) + LightGBM (stronger splits) + stable MAPE
# -------------------------
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold
import numpy as np

def stable_mape(y_true, y_pred, eps=1e-6):
    y_true = np.clip(np.asarray(y_true, dtype=float), eps, None)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, None)
    return float(np.mean(np.abs((y_true - y_pred) / y_true)))

params = dict(
    objective="regression",
    n_estimators=30000,
    learning_rate=0.02,
    num_leaves=256,
    max_depth=-1,
    min_data_in_leaf=5,
    subsample=0.85,
    subsample_freq=1,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1,
    force_col_wise=True,
)

gkf = GroupKFold(n_splits=N_SPLITS)
fold_scores = []
oof = np.zeros(len(X_train), dtype=float)

for fold, (tr, va) in enumerate(gkf.split(X_train, y_log, groups), 1):
    Xtr, Xva = X_train.iloc[tr], X_train.iloc[va]
    ytr, yva = y_log[tr], y_log[va]

    model = LGBMRegressor(**params)
    model.fit(
        Xtr, ytr,
        eval_set=[(Xva, yva)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(500), lgb.log_evaluation(0)]
    )

    pred_log = model.predict(Xva)
    pred = np.expm1(pred_log)
    pred = np.clip(pred, 1e-6, None)

    m = stable_mape(y[va], pred, eps=1e-6)
    fold_scores.append(m)
    oof[va] = pred

    print(f"Fold {fold} | stable MAPE: {m:.5f} | best_iter: {model.best_iteration_}")

print("CV stable MAPE mean:", float(np.mean(fold_scores)), "std:", float(np.std(fold_scores)))

[LightGBM] [Info] Total Bins 3065
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 27
[LightGBM] [Info] Start training from score 0.267985
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[1944]	valid_0's l1: 0.0223656	valid_0's l2: 0.00147028
Fold 1 | stable MAPE: 99.92645 | best_iter: 1944
[LightGBM] [Info] Total Bins 3058
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 27
[LightGBM] [Info] Start training from score 0.264346
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[19394]	valid_0's l1: 0.0241156	valid_0's l2: 0.00187626
Fold 2 | stable MAPE: 67.30093 | best_iter: 19394
[LightGBM] [Info] Total Bins 3063
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 27
[LightGBM] [Info] Start training from score 0.268117
Training until validation scores don't improve for 500 ro

KeyboardInterrupt: 

In [None]:
# -------------------------
# CELL 7 — TRAIN FULL + PREDICT TEST + SAVE
# -------------------------
final = LGBMRegressor(**params)
final.fit(X_train, y_log, eval_metric="l1")

test_pred_log = final.predict(X_test)
test_pred = np.expm1(test_pred_log)
test_pred = np.clip(test_pred, 1e-6, None)

sub = sample.copy()
sub.iloc[:, 1] = test_pred
sub.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
sub.head()

[LightGBM] [Info] Total Bins 3066
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 27
[LightGBM] [Info] Start training from score 0.267983
Saved: prediction.csv


Unnamed: 0,ID,Target Pressure (bar)
0,0,0.093965
1,1,0.09443
2,2,0.095009
3,3,0.07992
4,4,0.072371
