<a href="https://colab.research.google.com/github/phfrebelo/aiml-portfolio/blob/main/Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ensemble: LightGBM + XGBoost + CatBoost
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# --- CONFIG ---
# Input and output CSV locations, relative to the working directory.
TRAIN_PATH, TEST_PATH, SUB_PATH = "train.csv", "test.csv", "submission.csv"

# Random seed shared by the fold splitter and all three models, and the
# number of cross-validation folds.
SEED, NFOLDS = 42, 5

# --- UTIL ---
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error`` and its square root is taken with NumPy.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# --- LOAD ---
# Read the raw training and test tables.
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# Resolve the target column: keep the canonical name when it exists, otherwise
# take the first known alternative spelling found in the training columns.
TARGET = "Annual Turnover"
if TARGET not in train.columns:
    TARGET = next(
        (
            name
            for name in ("Turnover", "turnover", "AnnualTurnover", "Annual_Turnover")
            if name in train.columns
        ),
        TARGET,
    )

# Raw target values plus their log1p transform.
y = train[TARGET].values
y_log = np.log1p(y)

# Identifier column for the submission file; when the named column is absent
# the first column of the test table is used instead.
if "Registration Number" in test.columns:
    test_ids = test["Registration Number"]
else:
    test_ids = test.iloc[:, 0]

# --- PREPROCESS ---
# Train (without the target) and test are stacked so that every transformation
# below is applied to both with identical columns and encodings.
ID_COL = "Registration Number"
all_df = pd.concat([train.drop(columns=[TARGET]), test], ignore_index=True)

# Remove the row identifier BEFORE feature engineering. Dropping it only after
# the split would leave its derived columns (e.g. "<id>_freq", "<id>_log",
# "<id>_nan") in the feature matrix.
all_df = all_df.drop(columns=[ID_COL], errors="ignore")

# Date parsing
# A text column is a date candidate when any value contains a 4-digit run or a
# dd/mm/yyyy pattern. The candidate is converted only when most of its
# non-missing values actually parse; otherwise a categorical column that merely
# contains a number somewhere would be turned into NaT and lost.
DATE_HINT = r"\d{4}|\d{2}/\d{2}/\d{4}"
MIN_DATE_PARSE_RATE = 0.8

date_cols_to_drop = []
for c in all_df.select_dtypes(include=["object"]).columns:
    present = all_df[c].notna()
    if not all_df.loc[present, c].astype(str).str.contains(DATE_HINT).any():
        continue
    try:
        parsed = pd.to_datetime(all_df[c], dayfirst=True, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        # Not interpretable as dates at all: keep it as a categorical column.
        continue
    if parsed.notna().sum() >= MIN_DATE_PARSE_RATE * present.sum():
        all_df[c] = parsed

# Expand each datetime column into integer year/month/day parts (0 marks a
# missing date), then drop the original datetime column.
for c in all_df.select_dtypes(include=["datetime64"]).columns:
    all_df[c + "_year"] = all_df[c].dt.year.fillna(0).astype(int)
    all_df[c + "_month"] = all_df[c].dt.month.fillna(0).astype(int)
    all_df[c + "_day"] = all_df[c].dt.day.fillna(0).astype(int)
    date_cols_to_drop.append(c)

all_df.drop(columns=date_cols_to_drop, inplace=True)

# Categorical encodings
# Each remaining text column gets a relative-frequency feature (computed over
# train + test together) and is then replaced by integer label codes.
cat_cols = all_df.select_dtypes(include=["object"]).columns.tolist()
for c in cat_cols:
    all_df[c] = all_df[c].fillna("<<MISSING>>")
    freq = all_df[c].value_counts(normalize=True)
    all_df[c + "_freq"] = all_df[c].map(freq).astype(float)
    le = LabelEncoder()
    all_df[c] = le.fit_transform(all_df[c].astype(str))

# Numeric missing + log transforms
# The column list is captured once, so the indicator and log columns created
# below are not themselves re-processed.
num_cols = all_df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    if all_df[c].isna().any():
        # Keep a missingness flag, then impute with the column median.
        all_df[c + "_nan"] = all_df[c].isna().astype(int)
        all_df[c] = all_df[c].fillna(all_df[c].median())

for c in num_cols:
    # Add a log1p copy of non-negative, strongly right-skewed columns.
    if all_df[c].min() >= 0 and (all_df[c].skew() > 1):
        all_df[c + "_log"] = np.log1p(all_df[c])

# Split back into the original train/test row blocks.
n_train = len(train)
train_X = all_df.iloc[:n_train].copy()
test_X = all_df.iloc[n_train:].copy()

# --- PARAMS ---
# LightGBM parameters (passed to lgb.train).
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    "num_leaves": 128,
    "subsample": 0.9,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; without this entry "subsample" above is silently ignored.
    "subsample_freq": 1,
    "colsample_bytree": 0.8,
    "min_child_weight": 50,
    "reg_alpha": 0.1,
    "reg_lambda": 10.0,
    "n_jobs": -1,
    "seed": SEED,
    "verbose": -1,
}

# XGBoost parameters (passed to the XGBRegressor constructor).
xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.01,
    "max_depth": 10,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "lambda": 10.0,
    "alpha": 0.1,
    # Upper bound on the number of trees; early stopping below normally ends
    # training long before this is reached.
    "n_estimators": 20000,
    # Stop once the eval_set RMSE has not improved for 200 rounds, matching
    # the LightGBM and CatBoost settings. Without it XGBoost trains towards
    # all 20000 trees and predicts with an over-fitted model.
    "early_stopping_rounds": 200,
    "random_state": SEED,
    "tree_method": "hist",  # fast
}

# CatBoost parameters (passed to the CatBoostRegressor constructor).
cat_params = {
    "loss_function": "RMSE",
    "learning_rate": 0.01,
    "depth": 10,
    "l2_leaf_reg": 10,
    "iterations": 20000,
    "random_seed": SEED,
    "verbose": 500,  # log every 500 iterations
    "early_stopping_rounds": 200,
}

# --- CV ENSEMBLE ---
# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows, both on the log1p scale of the target.
oof_preds = np.zeros(len(train_X))
test_preds = np.zeros(len(test_X))

# The continuous target is cut into quantile bins so that StratifiedKFold can
# stratify the folds on it.
y_bins = pd.qcut(y_log, q=NFOLDS, labels=False, duplicates="drop")
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

for fold, (tr_idx, val_idx) in enumerate(kf.split(train_X, y_bins)):
    print(f"\n=== Fold {fold+1} ===")
    X_tr, X_val = train_X.iloc[tr_idx], train_X.iloc[val_idx]
    y_tr, y_val = y_log[tr_idx], y_log[val_idx]

    # Ensure column order and dtypes match for CatBoost
    X_val = X_val[X_tr.columns].astype(X_tr.dtypes)

    # LightGBM
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val)
    lgb_model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=20000,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        callbacks=[
            # `verbose` of early_stopping is a boolean switch for its own
            # messages, not a logging period.
            lgb.early_stopping(stopping_rounds=200, verbose=True),
            # Periodic metric logging is a separate callback.
            lgb.log_evaluation(period=500),
        ],
    )
    # Predict with the best iteration found by early stopping.
    lgb_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    lgb_test = lgb_model.predict(test_X, num_iteration=lgb_model.best_iteration)

    # XGBoost: the validation fold is passed as eval_set for monitoring (and
    # for early stopping when xgb_params enables it).
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=500,
    )
    xgb_val = xgb_model.predict(X_val)
    xgb_test = xgb_model.predict(test_X)

    # CatBoost: early stopping and logging period come from cat_params.
    cat_model = CatBoostRegressor(**cat_params)
    cat_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
    cat_val = cat_model.predict(X_val)
    cat_test = cat_model.predict(test_X)

    # Blend (equal weights)
    val_pred = (lgb_val + xgb_val + cat_val) / 3
    test_pred = (lgb_test + xgb_test + cat_test) / 3

    # Each training row is predicted exactly once, by the fold that held it
    # out; test predictions are averaged across folds.
    oof_preds[val_idx] = val_pred
    test_preds += test_pred / NFOLDS

# --- EVALUATION ---
# Score the blended out-of-fold predictions twice: on the log1p scale the
# models were fit on, and on the original scale after inverting with expm1.
oof_preds_orig = np.expm1(oof_preds)
oof_rmse = rmse(y_log, oof_preds)
oof_rmse_orig = rmse(y, oof_preds_orig)
print("OOF RMSE on log1p target:", oof_rmse)
print("OOF RMSE (original scale):", oof_rmse_orig)

# --- SUBMISSION ---
# Convert the averaged test predictions back to the original scale and floor
# them at zero before writing the submission file.
turnover_pred = np.clip(np.expm1(test_preds), 0, None)
submission = pd.DataFrame({"Registration Number": test_ids.values})
submission["Annual Turnover"] = turnover_pred
submission.to_csv(SUB_PATH, index=False)
print(f"Saved submission to {SUB_PATH}")


=== Fold 1 ===
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[687]	train's rmse: 0.321315	val's rmse: 0.496957
[0]	validation_0-rmse:0.56857
[500]	validation_0-rmse:0.50107
[1000]	validation_0-rmse:0.49996
[1500]	validation_0-rmse:0.50159
[2000]	validation_0-rmse:0.50291
[2500]	validation_0-rmse:0.50359
[3000]	validation_0-rmse:0.50392
[3500]	validation_0-rmse:0.50411
[4000]	validation_0-rmse:0.50419
[4500]	validation_0-rmse:0.50426
[5000]	validation_0-rmse:0.50430
[5500]	validation_0-rmse:0.50431
[6000]	validation_0-rmse:0.50431
[6500]	validation_0-rmse:0.50432
[7000]	validation_0-rmse:0.50432
[7500]	validation_0-rmse:0.50432
[8000]	validation_0-rmse:0.50433
[8500]	validation_0-rmse:0.50433
[9000]	validation_0-rmse:0.50432
[9500]	validation_0-rmse:0.50432
[10000]	validation_0-rmse:0.50432
[10500]	validation_0-rmse:0.50433
[11000]	validation_0-rmse:0.50433
[11500]	validation_0-rmse:0.50433
[12000]	validation_0-rmse:0.50433
[12500]	val

In [None]:
# Install CatBoost; the captured output shows it was downloaded, i.e. it was
# not already present. Run this cell before the ensemble cell that imports it.
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
