In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer, cohen_kappa_score

from xgboost import XGBRegressor

In [None]:
# Location of the training CSV. The absolute path below is specific to one
# machine, so it can be overridden through the WINE_TRAIN_CSV environment
# variable; when the variable is unset the original location is used.
TRAIN_CSV_PATH = os.environ.get(
    "WINE_TRAIN_CSV",
    r"/Users/loganheydt/Documents/GitHub/Kaggle_competitions/Wine_Quality/data/train.csv",
)

df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Work on a copy so the loaded frame itself is never modified.
df_model = df.copy()

# ID-like columns are excluded from the features; extend the candidate tuple
# if other identifier columns show up.
drop_cols = [col for col in ("id",) if col in df_model.columns]

# Target: the "quality" column cast to integers.
y = df_model["quality"].astype(int)

# Features: everything except the target and the ID-like columns, with
# +/-inf mapped to NaN (relevant if ratio features were engineered earlier).
X = df_model.drop(columns=["quality", *drop_cols], errors="ignore")
X = X.replace([np.inf, -np.inf], np.nan)

# Smallest and largest observed target values.
y_min = int(y.min())
y_max = int(y.max())

In [None]:
# Hold out 20% of the rows for the final evaluation. The split is seeded for
# repeatability and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
def qwk_rounded(y_true, y_pred):
    """Quadratic weighted kappa for continuous predictions of integer grades.

    Predictions are rounded to the nearest integer and clipped to the target
    range ``[y_min, y_max]`` (module-level values) before scoring.

    The full ordered label range is passed to ``cohen_kappa_score``
    explicitly. Without it, scikit-learn infers the labels from the values
    present in the inputs and derives the quadratic weights from label
    *positions*, so a fold that happens to miss an intermediate grade would
    under-weight disagreements spanning that gap.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth grades; expected to lie within ``[y_min, y_max]``.
    y_pred : array-like of float
        Raw (continuous) model outputs.

    Returns
    -------
    float
        Quadratic weighted Cohen's kappa between ``y_true`` and the rounded,
        clipped predictions.
    """
    y_pred_round = np.clip(np.rint(y_pred), y_min, y_max).astype(int)
    # Every grade in the observed range, in order, so that index distance in
    # the weight matrix equals grade distance.
    grade_labels = np.arange(y_min, y_max + 1)
    return cohen_kappa_score(
        y_true, y_pred_round, labels=grade_labels, weights="quadratic"
    )


# Higher kappa is better, so the scorer is declared as a score, not a loss.
qwk_scorer = make_scorer(qwk_rounded, greater_is_better=True)

In [None]:
# Two-step pipeline: median imputation followed by an XGBoost regressor with
# squared-error loss. The step name "model" is the prefix used by the
# hyper-parameter grid keys ("model__...").
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        (
            "model",
            XGBRegressor(objective="reg:squarederror", n_jobs=-1, random_state=42),
        ),
    ]
)

In [None]:
# Hyper-parameter search space for the pipeline's "model" step.
# 2 * 3 * 3 * 2 * 2 * 2 = 144 parameter combinations in total.
param_grid = dict(
    model__n_estimators=[400, 800],
    model__learning_rate=[0.03, 0.05, 0.1],
    model__max_depth=[3, 4, 5],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.8, 1.0],
    model__reg_lambda=[1.0, 5.0],
)

In [None]:
# Five shuffled folds with a fixed seed so fold membership is repeatable.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# Exhaustive search over param_grid. Both metrics are recorded for every
# candidate, but the winning configuration (and the refit model) is chosen
# by QWK.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring={"QWK": qwk_scorer, "neg_RMSE": "neg_root_mean_squared_error"},
    refit="QWK",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(X_train, y_train)

# With refit="QWK", best_score_ / best_params_ refer to the QWK metric.
print("Best CV QWK:", grid.best_score_)
print("Best params:", grid.best_params_)

In [None]:
# Estimator refit by the grid search with the best-QWK parameters.
best_model = grid.best_estimator_

# Continuous predictions on the held-out split, plus their integer form
# (rounded to the nearest grade and clipped to the observed target range).
pred_test = best_model.predict(X_test)
pred_test_round = np.clip(np.rint(pred_test), y_min, y_max).astype(int)

# RMSE is measured on the raw, un-rounded predictions.
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

# Score through the same helper that served as the CV selection metric, so
# the held-out QWK is always computed exactly like the cross-validated one
# instead of via a duplicated copy of the round/clip/kappa logic.
qwk_test = qwk_rounded(y_test, pred_test)

print("\n--- TEST RESULTS ---")
print("RMSE (continuous preds):", rmse_test)
print("QWK (rounded preds):", qwk_test)

In [None]:
# Tabulate the search: mean/std of both metrics, the QWK rank, and every
# tuned hyper-parameter column of the "model" step.
results = pd.DataFrame(grid.cv_results_)

cols = [
    "mean_test_QWK",
    "std_test_QWK",
    "mean_test_neg_RMSE",
    "std_test_neg_RMSE",
    "rank_test_QWK",
]
cols.extend(
    name for name in results.columns if name.startswith("param_model__")
)

# Keep only the 15 best-ranked candidates (rank 1 = highest mean QWK).
results = results.loc[:, cols].sort_values(by="rank_test_QWK").head(15)
display(results)