In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input files, resolved relative to the notebook's working directory.
OOF_PATH = Path("oof.csv")
TEST_PRED_PATH = Path("test_pred.csv")
SAMPLE_SUB_PATH = Path("sample_submission.csv")

# Column names looked up in the CSV files by the cells below.
ID_COL = "id"
TARGET_COL = "target"
PRED_COL = "pred"

# Selects the metric set and whether Platt scaling / [0, 1] clipping are used.
VALID_TARGET_TYPES = ("regression", "binary")
TARGET_TYPE = "regression"  # "regression" or "binary"

# Later cells only test `TARGET_TYPE == "binary"`, so a typo such as "Binary"
# would silently run the regression path; reject anything unexpected here.
if TARGET_TYPE not in VALID_TARGET_TYPES:
    raise ValueError(
        f"TARGET_TYPE must be one of {VALID_TARGET_TYPES}, got {TARGET_TYPE!r}"
    )

# One CSV per calibration method is written into this directory.
OUTPUT_DIR = Path("./calibrated_submissions")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Config OK")

In [None]:
def _require_columns(frame, frame_name, required):
    """Assert that every name in ``required`` is a column of ``frame``.

    Raises ``AssertionError`` naming the frame and the missing columns.
    """
    missing = [col for col in required if col not in frame.columns]
    assert not missing, f"{frame_name} is missing required column(s): {missing}"


oof = pd.read_csv(OOF_PATH)
test_pred = pd.read_csv(TEST_PRED_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

print("oof shape:", oof.shape)
print("test_pred shape:", test_pred.shape)
print("sample_submission shape:", sample_sub.shape)

# Fail fast with a message that says which frame lacks which column.
_require_columns(oof, "oof", (ID_COL, TARGET_COL, PRED_COL))
_require_columns(test_pred, "test_pred", (ID_COL, PRED_COL))

# The submission cell assigns test predictions into a copy of sample_sub
# positionally, so the two frames must have the same number of rows.
assert len(test_pred) == len(sample_sub), (
    f"test_pred has {len(test_pred)} rows but sample_submission has "
    f"{len(sample_sub)}; they must match"
)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, log_loss, roc_auc_score

def regression_metrics(y_true, y_pred):
    """Return RMSE and MAE of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth values and predictions, passed straight to the sklearn
        metric functions.

    Returns
    -------
    dict
        ``{"rmse": float, "mae": float}``.
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = mean_absolute_error(y_true, y_pred)
    return {"rmse": rmse, "mae": mae}

def binary_metrics(y_true, y_pred):
    """Score probabilistic predictions for a binary target.

    Predictions are clipped into ``[1e-6, 1 - 1e-6]`` before scoring.

    Returns
    -------
    dict
        ``{"logloss": ..., "auc": ...}``; ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    eps = 1e-6
    clipped = np.clip(y_pred, eps, 1 - eps)

    scores = {"logloss": log_loss(y_true, clipped)}
    try:
        scores["auc"] = roc_auc_score(y_true, clipped)
    except ValueError:
        # AUC could not be computed for these labels; report it as missing.
        scores["auc"] = np.nan
    return scores

In [None]:
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression, LogisticRegression

class IdentityCalibrator:
    """No-op calibrator: hands predictions back exactly as given."""

    def fit(self, x, y):
        """Ignore both arguments (nothing to learn) and return ``self``."""
        return self

    def transform(self, x):
        """Return ``x`` itself, unmodified."""
        return x

class LinearCalibrator:
    """Calibrate predictions with a one-feature ``LinearRegression`` fit.

    ``fit`` and ``transform`` accept any 1-D array-like (ndarray, list,
    pandas Series); the input is turned into a single-column 2-D array
    before it is handed to the model.
    """

    def __init__(self):
        self.model = LinearRegression()

    @staticmethod
    def _as_column(x):
        """Return ``x`` as an ``(n, 1)`` array."""
        # np.asarray first: plain lists and pandas Series have no `.reshape`.
        return np.asarray(x).reshape(-1, 1)

    def fit(self, x, y):
        """Fit the model mapping raw predictions ``x`` to targets ``y``; return ``self``."""
        self.model.fit(self._as_column(x), y)
        return self

    def transform(self, x):
        """Return the model's predictions for raw predictions ``x``."""
        return self.model.predict(self._as_column(x))

class IsotonicCalibrator:
    """Calibrator that delegates to an ``IsotonicRegression`` model."""

    def __init__(self):
        # out_of_bounds="clip" is forwarded to IsotonicRegression; it governs
        # how inputs outside the range seen during fit are handled.
        self.model = IsotonicRegression(out_of_bounds="clip")

    def fit(self, x, y):
        """Fit the underlying model on ``(x, y)`` and return this calibrator."""
        self.model.fit(x, y)
        return self

    def transform(self, x):
        """Return the fitted model's transform of ``x``."""
        calibrated = self.model.transform(x)
        return calibrated

class PlattCalibrator:
    """Calibrate binary scores with a one-feature ``LogisticRegression``.

    ``fit`` and ``transform`` accept any 1-D array-like (ndarray, list,
    pandas Series); the input is turned into a single-column 2-D array
    before it is handed to the model.

    NOTE(review): the model is built with sklearn's default settings apart
    from ``max_iter`` and is fed the raw score as-is; confirm that this is the
    intended variant of Platt scaling.
    """

    def __init__(self):
        self.model = LogisticRegression(max_iter=1000)

    @staticmethod
    def _as_column(x):
        """Return ``x`` as an ``(n, 1)`` array."""
        # np.asarray first: plain lists and pandas Series have no `.reshape`.
        return np.asarray(x).reshape(-1, 1)

    def fit(self, x, y):
        """Fit the model mapping raw scores ``x`` to labels ``y``; return ``self``."""
        self.model.fit(self._as_column(x), y)
        return self

    def transform(self, x):
        """Return column 1 of ``predict_proba`` for the raw scores ``x``."""
        return self.model.predict_proba(self._as_column(x))[:, 1]

In [None]:
# Folds used to score each calibrator on rows it was NOT fitted on.
N_CALIBRATION_FOLDS = 5
CALIBRATION_SEED = 42

y_true = oof[TARGET_COL].values.astype(float)
y_raw = oof[PRED_COL].values.astype(float)
assert len(y_raw) >= 2, "need at least two OOF rows to cross-fit calibrators"

calibrators = {
    "raw": IdentityCalibrator(),
    "linear": LinearCalibrator(),
    "isotonic": IsotonicCalibrator(),
}

if TARGET_TYPE == "binary":
    calibrators["platt"] = PlattCalibrator()

metric_fn = binary_metrics if TARGET_TYPE == "binary" else regression_metrics

# Seeded shuffle split into disjoint held-out index sets; never more folds
# than rows, so every fold is non-empty.
n_folds = min(N_CALIBRATION_FOLDS, len(y_raw))
shuffled = np.random.default_rng(CALIBRATION_SEED).permutation(len(y_raw))
fold_indices = np.array_split(shuffled, n_folds)

results = []

for name, cal in calibrators.items():
    # Cross-fitted predictions: fitting and scoring on the same rows would
    # reward the most flexible calibrator (isotonic) for memorising them.
    y_cal = np.empty_like(y_raw)
    for held_out in fold_indices:
        train_mask = np.ones(len(y_raw), dtype=bool)
        train_mask[held_out] = False
        # Every calibrator class takes no constructor arguments, so a fresh
        # unfitted instance of the same kind can be built per fold.
        fold_cal = type(cal)()
        fold_cal.fit(y_raw[train_mask], y_true[train_mask])
        y_cal[held_out] = fold_cal.transform(y_raw[held_out])

    m = metric_fn(y_true, y_cal)
    m["calibration"] = name
    results.append(m)

    # Refit on all OOF rows; this fitted instance is what the submission
    # cell applies to the test predictions.
    cal.fit(y_raw, y_true)

metrics_df = pd.DataFrame(results).set_index("calibration")
metrics_df

In [None]:
# Apply each fitted calibrator to the test predictions and write one
# submission CSV per calibrator into OUTPUT_DIR.
test_raw = test_pred[PRED_COL].values.astype(float)

is_binary = TARGET_TYPE == "binary"
# Ids are copied from test_pred only when both frames have the id column.
copy_ids = ID_COL in sample_sub.columns and ID_COL in test_pred.columns

subs = {}

for name, cal in calibrators.items():
    y_test_cal = cal.transform(test_raw)
    if is_binary:
        # Keep binary outputs inside the closed unit interval.
        y_test_cal = np.clip(y_test_cal, 0.0, 1.0)

    # Columns to set on a copy of the sample submission: ids first, then target.
    new_columns = {}
    if copy_ids:
        new_columns[ID_COL] = test_pred[ID_COL].values
    new_columns[TARGET_COL] = y_test_cal
    sub = sample_sub.assign(**new_columns)

    out_path = OUTPUT_DIR / f"submission_{name}.csv"
    sub.to_csv(out_path, index=False)
    subs[name] = out_path

subs