In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer, cohen_kappa_score
from sklearn.model_selection import cross_val_predict

from xgboost import XGBRegressor

In [2]:
def apply_thresholds(scores, thresholds, min_class):
    """
    Convert continuous scores into integer class labels using cut points.

    Each score is binned with ``np.digitize`` against `thresholds` (which
    np.digitize expects to be monotonic); the bin index is then shifted by
    `min_class`, so bin 0 maps to `min_class`, bin 1 to `min_class + 1`, etc.
    The result is cast to int. No clipping is done here.
    """
    cut_points = np.asarray(thresholds, dtype=float)
    bin_index = np.digitize(scores, cut_points)
    shifted = bin_index + min_class
    return shifted.astype(int)

def optimize_thresholds(scores, y_true, classes, n_iter=40):
    """
    Coordinate-wise search for the thresholds that maximize QWK when mapping
    continuous scores -> ordinal classes.

    Parameters
    ----------
    scores : array-like
        Continuous predictions, one per sample.
    y_true : array-like
        True ordinal labels, aligned with `scores`.
    classes : iterable
        Ordinal class labels (sorted internally). Predicted labels are built
        as ``min(classes) + bin index``, so the classes are assumed to be
        consecutive integers.
    n_iter : int, default 40
        Maximum number of full passes over the thresholds. The search stops
        early as soon as one pass fails to improve the score.

    Returns
    -------
    thr : np.ndarray
        Thresholds, length ``len(classes) - 1``.
    best : float
        Quadratic weighted kappa obtained on `scores` with `thr`.
    """
    # Normalise inputs once so every evaluation works on plain arrays and the
    # min/max class do not depend on the order the caller passed them in.
    scores = np.asarray(scores)
    y_true = np.asarray(y_true)
    classes = sorted(classes)
    min_c, max_c = classes[0], classes[-1]
    k = len(classes)

    # initialize thresholds as score quantiles
    qs = np.linspace(0, 1, k + 1)[1:-1]
    thr = np.quantile(scores, qs)

    # The candidate cut points depend only on `scores`, so build the grid once
    # instead of recomputing 61 quantiles for every threshold on every pass.
    cand_grid = np.quantile(scores, np.linspace(0.05, 0.95, 61))

    def qwk_for(thr_):
        pred = apply_thresholds(scores, thr_, min_c)
        pred = np.clip(pred, min_c, max_c)
        return cohen_kappa_score(y_true, pred, weights="quadratic")

    best = qwk_for(thr)

    for _ in range(n_iter):
        improved = False
        for i in range(len(thr)):
            # Keep threshold i strictly between its neighbours so the
            # threshold vector stays ordered.
            lo = thr[i - 1] if i > 0 else -np.inf
            hi = thr[i + 1] if i < len(thr) - 1 else np.inf
            cand = cand_grid[(cand_grid > lo) & (cand_grid < hi)]
            if len(cand) == 0:
                continue

            local_best_thr = thr[i]
            local_best = best

            for v in cand:
                thr_try = thr.copy()
                thr_try[i] = v
                val = qwk_for(thr_try)
                if val > local_best:
                    local_best = val
                    local_best_thr = v

            # Only accept a move that strictly improves the global best.
            if local_best > best:
                thr[i] = local_best_thr
                best = local_best
                improved = True

        if not improved:
            break

    return thr, best

In [3]:
# Location of the training CSV. Set the WINE_TRAIN_CSV environment variable to
# run the notebook on another machine; when it is unset the original local
# path is used, so existing behaviour is unchanged.
TRAIN_CSV_PATH = os.environ.get(
    "WINE_TRAIN_CSV",
    r"/Users/loganheydt/Documents/GitHub/Kaggle_competitions/Wine_Quality/data/train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Work on a copy so the loaded frame `df` stays untouched.
df_model = df.copy()

# drop ID-like columns (keep adding here if you have others)
drop_cols = [col for col in ("id",) if col in df_model.columns]

# Target: wine quality as an integer label.
y = df_model["quality"].astype(int)

# Features: everything except the target and ID-like columns.
X = (
    df_model.drop(columns=["quality"] + drop_cols, errors="ignore")
            .replace([np.inf, -np.inf], np.nan)   # important if you made ratio features earlier
)

y_min, y_max = int(y.min()), int(y.max())

# Use the full contiguous label range rather than only the labels observed in
# the data: thresholded predictions are built as `y_min + bin index`, so a
# missing intermediate level would otherwise leave too few thresholds to ever
# reach `y_max`. Identical to sorted(y.unique()) when labels are consecutive.
classes = list(range(y_min, y_max + 1))

In [5]:
# Hold out 20% of the rows for the final evaluation, stratified on the target
# and seeded so the split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
def qwk_rounded(y_true, y_pred):
    """
    Quadratic weighted kappa after snapping continuous predictions to labels.

    Predictions are rounded to the nearest integer, clipped to the notebook's
    global label range [y_min, y_max], cast to int, and compared to `y_true`.
    """
    nearest = np.rint(y_pred)
    bounded = np.clip(nearest, y_min, y_max)
    labels = bounded.astype(int)
    return cohen_kappa_score(y_true, labels, weights="quadratic")


# Scorer wrapper so the metric can be passed as a `scoring` entry; higher is better.
qwk_scorer = make_scorer(qwk_rounded, greater_is_better=True)

In [7]:
# Step 1: fill missing feature values with the column median.
median_imputer = SimpleImputer(strategy="median")

# Step 2: gradient-boosted regressor on squared error, seeded, using all cores.
xgb_regressor = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

# Step names ("imputer", "model") are what the `model__*` grid keys refer to.
pipe = Pipeline(steps=[("imputer", median_imputer), ("model", xgb_regressor)])

In [8]:
# Hyper-parameter grid for the pipeline's "model" step (keys are prefixed
# with `model__`). 2 * 3 * 3 * 2 * 2 * 2 = 144 combinations.
param_grid = dict(
    model__n_estimators=[400, 800],
    model__learning_rate=[0.03, 0.05, 0.1],
    model__max_depth=[3, 4, 5],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.8, 1.0],
    model__reg_lambda=[1.0, 5.0],
)

In [9]:
# Shuffled, seeded 5-fold splitter; reused later for the out-of-fold predictions.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Two metrics are recorded per candidate; only "QWK_round" drives the refit.
scoring_metrics = {
    "QWK_round": qwk_scorer,
    "neg_RMSE": "neg_root_mean_squared_error",
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit="QWK_round",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(X_train, y_train)

print("Best CV QWK (rounded):", grid.best_score_)
print("Best params:", grid.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best CV QWK (rounded): 0.3324647844237837
Best params: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.03, 'model__max_depth': 3, 'model__n_estimators': 400, 'model__reg_lambda': 1.0, 'model__subsample': 0.8}


In [10]:
# Pipeline with the winning hyper-parameters from the grid search.
best_pipe = grid.best_estimator_

# Out-of-fold continuous predictions on the training split, produced with the
# same `cv` splitter that the grid search used.
oof_pred_train = cross_val_predict(
    best_pipe,
    X_train,
    y_train,
    cv=cv,
    method="predict",
    n_jobs=-1,
)

# Fit the class cut points on those out-of-fold predictions only.
thr, oof_qwk = optimize_thresholds(
    oof_pred_train,
    y_train.values,
    classes,
    n_iter=50,
)

print("\nOptimized thresholds (from TRAIN OOF):", thr.tolist())
print("TRAIN OOF QWK (optimized thresholds):", oof_qwk)


Optimized thresholds (from TRAIN OOF): [5.2870283126831055, 5.296219725608825, 5.611339356899261, 6.0112797212600695, 6.333338165283203]
TRAIN OOF QWK (optimized thresholds): 0.3786838284872458


In [11]:
# Fit on the full training split, then score the held-out test split.
best_pipe.fit(X_train, y_train)
pred_test = best_pipe.predict(X_test)

# Continuous RMSE (raw reg preds)
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

# QWK with rounding: nearest integer, clipped to the observed label range
pred_test_round = np.rint(pred_test)
pred_test_round = np.clip(pred_test_round, y_min, y_max).astype(int)
qwk_test_round = cohen_kappa_score(y_test, pred_test_round, weights="quadratic")

# QWK with optimized thresholds (cut points stored in `thr`)
pred_test_thr = np.clip(
    apply_thresholds(pred_test, thr, y_min), y_min, y_max
).astype(int)
qwk_test_thr = cohen_kappa_score(y_test, pred_test_thr, weights="quadratic")

print("\n--- TEST RESULTS ---")
for metric_label, metric_value in (
    ("RMSE (continuous preds):", rmse_test),
    ("QWK (rounded preds):     ", qwk_test_round),
    ("QWK (opt thresholds):    ", qwk_test_thr),
):
    print(metric_label, metric_value)


--- TEST RESULTS ---
RMSE (continuous preds): 0.7362465244075169
QWK (rounded preds):      0.3252330185976233
QWK (opt thresholds):     0.3907146481839521
