In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates this session with Kaggle before any download is attempted.
# NOTE(review): presumably prompts interactively for credentials — confirm against the kagglehub docs.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data via kagglehub and keep the returned value.
# NOTE(review): presumably this is the local directory holding the downloaded files — confirm.
chydv_hackathon_2025_path = kagglehub.competition_download('chydv-hackathon-2025')

print('Data source import complete.')


In [None]:
!pip install -qq pytabkit

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from scipy.optimize import minimize

from sklearn.ensemble import VotingRegressor


# Let pandas render up to 500 columns / 500 rows before truncating the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the model libraries.
warnings.filterwarnings("ignore")

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between two label vectors using quadratic weights.

    Thin wrapper around ``cohen_kappa_score`` with ``weights='quadratic'``.
    """
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the integer classes 3..8 using five cut points.

    A value receives class ``k + 3`` for the first index ``k`` (0..4) at which
    ``value < thresholds[k]``; values not below any cut point receive class 8.
    Only ``thresholds[0]`` .. ``thresholds[4]`` are read.
    """
    # Assumes target classes: 3, 4, 5, 6, 7, 8.
    labels = 8
    # Build the selection from the last cut point back to the first so that
    # earlier thresholds take precedence over later ones.
    for k in (4, 3, 2, 1, 0):
        labels = np.where(oof_non_rounded < thresholds[k], k + 3, labels)
    return labels

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negative QWK of the thresholded predictions.

    The sign is flipped so that a minimiser maximises the kappa score.
    """
    discretised = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, discretised)
    return -score

In [None]:
from pathlib import Path

# Prefer the standard Kaggle input mount. When it is absent (e.g. the notebook runs
# outside Kaggle), fall back to the location returned by
# kagglehub.competition_download in the data-import cell.
# NOTE(review): assumes that returned path is the directory containing the CSVs — confirm.
_KAGGLE_INPUT_DIR = Path('/kaggle/input/chydv-hackathon-2025')
DATA_DIR = _KAGGLE_INPUT_DIR if _KAGGLE_INPUT_DIR.is_dir() else Path(chydv_hackathon_2025_path)

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')

In [None]:
# Row/column counts of the training, test and sample-submission frames.
train.shape, test.shape, sub.shape

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Frequency of each value in the `quality` column of the training data.
train.quality.value_counts()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Name of the label column; defined first so the feature filter below can reuse it.
TARGET = 'quality'

# Every training column except the row identifier and the label is a model input.
FEATURES = [col for col in train.columns if col not in ('id', TARGET)]

# Kept as the final expression of the cell so the notebook actually renders the list.
FEATURES

In [None]:
# K-fold splitter plus the XGBoost estimators; the installed XGBoost version is
# printed so the run can be reproduced.
from sklearn.model_selection import KFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
print("Using XGBoost version",xgb.__version__)

In [None]:
# CatBoost estimators; the installed CatBoost version is printed so the run can be reproduced.
from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb
print("Using CatBoost version",cb.__version__)

In [None]:
# LightGBM regressor; the installed LightGBM version is printed so the run can be reproduced.
from lightgbm import LGBMRegressor
import lightgbm as lgb
print("Using LightGBM version",lgb.__version__)

In [None]:
from pytabkit import RealMLP_TD_Regressor
from pytabkit import TabM_D_Regressor

In [None]:
# Keyword arguments forwarded unchanged to RealMLP_TD_Regressor.
# NOTE(review): the meaning of each option is defined by pytabkit — confirm against its docs.
realmlp_params = dict(
    n_cv=5,
    n_epochs=50,
    train_metric_name='rmse',
    p_drop=0.3,
    batch_size=4 * 1024,
    verbosity=2,
    lr=0.01,
    lr_sched='cos_sched',
    max_one_hot_cat_size=64,
    embedding_size=8,
    tfms=[
        "one_hot",
        "median_center",
        "robust_scale",
        "smooth_clip",
        "embedding",
        "l1_normalize",
    ],
)

# Fixed seed for repeatable runs.
realmlp_model = RealMLP_TD_Regressor(random_state=42, **realmlp_params)

In [None]:
# Hyper-parameters for the three gradient-boosting regressors. Each dict is
# unpacked into the matching estimator constructor below.
xgb_params = dict(
    max_depth=3,
    learning_rate=0.025,
    n_estimators=2000,
    min_child_weight=80,
    verbosity=0,
)

cat_params = dict(
    depth=3,
    learning_rate=0.1,
    grow_policy='Lossguide',
    verbose=False,
)

lgb_params = dict(
    max_depth=3,
    n_estimators=2000,
    learning_rate=0.02,
    objective='regression',
    verbose=-1,
)

# All three estimators share the same fixed seed.
model_xgb = XGBRegressor(random_state=42, **xgb_params)
model_cat = CatBoostRegressor(random_state=42, **cat_params)
model_lgb = LGBMRegressor(random_state=42, **lgb_params)

In [None]:
from sklearn.model_selection import KFold

# ---------------------------------------------------------------------------
# K-fold training of the CatBoost regressor, out-of-fold (OOF) prediction,
# QWK threshold optimisation on the OOF predictions, and averaged test
# predictions converted to class labels with the tuned thresholds.
# ---------------------------------------------------------------------------

FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Class midpoints for the labels 3..8; used as a starting point when the
# data-driven initial thresholds cannot be built (see below).
DEFAULT_THRESHOLDS = [3.5, 4.5, 5.5, 6.5, 7.5]

train_features = train[FEATURES]
y = train[TARGET]
# The test matrix does not change between folds, so build it once.
x_test = test[FEATURES].copy()

y_test_pred = np.zeros(len(test))

fold_qwks = []
oof_non_rounded = np.zeros(len(train), dtype=float)
oof_rounded = np.zeros(len(train), dtype=int)

train_scores = []
val_scores = []


for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields *positional* indices, so select rows with .iloc;
    # label-based selection would only be correct for a default RangeIndex.
    x_train = train_features.iloc[train_index].copy()
    y_train = y.iloc[train_index].copy()
    x_valid = train_features.iloc[test_index].copy()
    y_valid = y.iloc[test_index].copy()

    # NOTE(review): the validation fold is passed as eval_set. If CatBoost uses it
    # to pick the best iteration, the OOF scores below are optimistic — confirm.
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        verbose=500
    )

    y_train_pred = model_cat.predict(x_train)
    y_val_pred = model_cat.predict(x_valid)
    # Accumulate test predictions; divided by FOLDS after the loop.
    y_test_pred += model_cat.predict(x_test)

    y_val_pred_rounded = y_val_pred.round(0).astype(int)

    # INFER OOF
    oof_non_rounded[test_index] = y_val_pred
    oof_rounded[test_index] = y_val_pred_rounded

    # Per-fold QWK using plain rounding (before threshold tuning).
    train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
    val_kappa = quadratic_weighted_kappa(y_valid, y_val_pred_rounded)

    train_scores.append(train_kappa)
    val_scores.append(val_kappa)

    print(f"Fold {i+1} - Train QWK: {train_kappa:.4f}, Validation QWK:{val_kappa:.4f}")


mean_train_qwk = np.mean(train_scores)
mean_val_qwk = np.mean(val_scores)

# Initial thresholds: mean OOF prediction per true class, dropping the lowest
# class so that one cut point remains per class boundary.
# NOTE(review): oof_non_rounded is zero-initialised, so this NaN mask selects every row.
oof_mask = ~np.isnan(oof_non_rounded)
oof_initial_thresholds = (
            pd.DataFrame({'target': y[oof_mask], 'prediction': oof_non_rounded[oof_mask]})
            .groupby('target')['prediction']
            .mean()
            .iloc[1:]
            .values
            .tolist()
        )

# threshold_Rounder reads exactly five cut points. If the training labels do not
# contain exactly six distinct classes, the list above has the wrong length, so
# start from the class midpoints instead.
if len(oof_initial_thresholds) != len(DEFAULT_THRESHOLDS):
    oof_initial_thresholds = list(DEFAULT_THRESHOLDS)

print(f"Initial oof thresholds {oof_initial_thresholds}")

# Nelder-Mead is derivative-free, which suits the piecewise-constant QWK objective.
KappaOptimizer = minimize(
    evaluate_predictions,
    x0 = oof_initial_thresholds,
    args = (y, oof_non_rounded),
    method='Nelder-Mead'
)


assert KappaOptimizer.success, "Optimization did not converge."

oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)
optimized_qwk = quadratic_weighted_kappa(y, oof_tuned)
optimized_thresholds = KappaOptimizer.x


print(f"Optimized QWK SCORE: {optimized_qwk:.3f}")

print(f"Mean Train QWK: {mean_train_qwk:.4f}")
print(f"Mean Validation QWK: {mean_val_qwk:.4f}")
print(f"Optimized QWK: {optimized_qwk}")
print(f"Optimized thresholds: {optimized_thresholds}")


# COMPUTE AVERAGE TEST PREDS
y_test_pred /= FOLDS

# Convert the averaged continuous test predictions to class labels with the tuned cut points.
optimized_y_pred = threshold_Rounder(y_test_pred, optimized_thresholds)

In [None]:
# Assemble the submission: ids from the sample submission, labels from the
# thresholded test predictions.
submission = pd.DataFrame({'id': sub['id']})
submission['quality'] = optimized_y_pred

# Write without the index column.
submission.to_csv('submission.csv', index=False)

submission.head()