In [1]:
# === base ===
from pathlib import Path
import os
import gc
import warnings

# === data ===
import numpy as np
import pandas as pd

# === metrics & splits ===
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

# === models (tabular/meta) ===
from catboost import CatBoostClassifier, Pool
# import lightgbm as lgb

# Silence every warning category for the rest of the session (no category/module filter is given).
warnings.filterwarnings("ignore")

In [2]:
# Show the resolved working directory; the relative paths used below are resolved against it.
current_dir = Path().resolve()
print("CWD:", current_dir)

CWD: /Users/oleksandrnovokhatskyi/Documents/Neovercity_Woolf/Deep_Learning/Deep_learning/petfinder_project/kaggle_notebooks


In [3]:
# Locations of the first-level model artifacts (OOF + test predictions) and the raw training table.
comp_path = Path("../data/artifacts")
trained_path = Path("../data/raw/train.csv")

oof_cnn, test_cnn = comp_path / "cnn_oof.csv", comp_path / "cnn_test.csv"
oof_nlp, test_nlp = comp_path / "nlp_oof.csv", comp_path / "nlp_test.csv"
oof_tabular, test_tabular = comp_path / "tabular_oof.csv", comp_path / "tabular_test.csv"

# Report, one line per input, whether the file is present on disk.
for label, candidate in (
    ("oof_cnn:", oof_cnn),
    ("test_cnn:", test_cnn),
    ("oof_nlp:", oof_nlp),
    ("test_nlp:", test_nlp),
    ("trained_path:", trained_path),
    ("oof_tabular:", oof_tabular),
    ("test_tabular:", test_tabular),
):
    print(label, candidate.exists())

oof_cnn: True
test_cnn: True
oof_nlp: True
test_nlp: True
trained_path: True
oof_tabular: True
test_tabular: True


In [4]:
# Load each first-level model's out-of-fold and test predictions (OOF file first, then test file).
df_oof_cnn, df_test_cnn = (pd.read_csv(csv_path) for csv_path in (oof_cnn, test_cnn))
df_oof_nlp, df_test_nlp = (pd.read_csv(csv_path) for csv_path in (oof_nlp, test_nlp))
df_oof_tabular, df_test_tabular = (pd.read_csv(csv_path) for csv_path in (oof_tabular, test_tabular))

In [5]:
# Preview the first rows of the test predictions of every base model and of the tabular OOF frame.
# The boolean says whether the preview is printed together with a trailing "\n" argument.
previews = (
    ("TEST_CNN", df_test_cnn, True),
    ("TEST_NLP", df_test_nlp, False),
    ("TEST_TABULAR", df_test_tabular, True),
    ("OOF_TABULAR", df_oof_tabular, False),
)
for title, frame, newline_after in previews:
    print(title)
    if newline_after:
        print(frame.head(), "\n")
    else:
        print(frame.head())

TEST_CNN
       PetID    pred_0    pred_1    pred_2    pred_3
0  6697a7f62  0.078886  0.134657  0.144137  0.642320
1  23b64fe21  0.230519  0.412668  0.234642  0.122172
2  41e824cbe  0.057262  0.097586  0.103254  0.741899
3  6c3d7237b  0.154190  0.250104  0.244672  0.351034
4  97b0b5d92  0.178609  0.249129  0.134938  0.437325 

TEST_NLP
       PetID  nlp_proba_1  nlp_proba_2  nlp_proba_3  nlp_proba_4  nlp_pred
0  6697a7f62     0.186414     0.523980     0.277891     0.011716         2
1  23b64fe21     0.192035     0.273034     0.241004     0.293928         4
2  41e824cbe     0.079121     0.136374     0.418947     0.365558         3
3  6c3d7237b     0.118374     0.193876     0.083084     0.604667         4
4  97b0b5d92     0.244353     0.526338     0.110799     0.118510         2
TEST_TABULAR
       PetID    pred_0    pred_1    pred_2    pred_3
0  6697a7f62  0.211756  0.356579  0.178026  0.253638
1  23b64fe21  0.317650  0.234815  0.184570  0.262964
2  41e824cbe  0.122476  0.260343  0.2525

In [6]:
# Raw training table; its PetID / AdoptionSpeed columns are joined onto the OOF frames below.
train_df = pd.read_csv(filepath_or_buffer=trained_path)
train_df.head(n=5)

Unnamed: 0,PetID,Description,AdoptionSpeed
0,d3b4f29f8,Mayleen and Flo are two lovely adorable sister...,2
1,e9dc82251,A total of 5 beautiful Tabbys available for ad...,2
2,8111f6d4a,Two-and-a-half month old girl. Very manja and ...,2
3,693a90fda,Neil is a healthy and active ~2-month-old fema...,2
4,9d08c85ef,Gray kitten available for adoption in sungai p...,2


In [7]:
# Prepare the CNN out-of-fold probabilities for stacking:
#   * drop the "fold" bookkeeping column,
#   * rename pred_0..pred_3 to cnn_1..cnn_4,
#   * attach the AdoptionSpeed target from train_df by PetID.
# The cell reassigns df_oof_cnn, so it must tolerate being executed twice: errors="ignore"
# avoids a KeyError once "fold" is already gone, and dropping a previously merged
# "AdoptionSpeed" prevents the second merge from producing AdoptionSpeed_x / AdoptionSpeed_y.
df_oof_cnn = (
    df_oof_cnn
    .drop(columns=["fold", "AdoptionSpeed"], errors="ignore")
    .rename(columns={"pred_0": "cnn_1", "pred_1": "cnn_2", "pred_2": "cnn_3", "pred_3": "cnn_4"})
    .merge(train_df[["PetID", "AdoptionSpeed"]], on="PetID", how="left")
)
df_oof_cnn.head()


Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4,AdoptionSpeed
0,d3b4f29f8,0.128506,0.142094,0.249124,0.480276,2
1,e9dc82251,0.469104,0.367073,0.076473,0.08735,2
2,8111f6d4a,0.0311,0.185989,0.384119,0.398792,2
3,693a90fda,0.243184,0.437397,0.172976,0.146443,2
4,9d08c85ef,0.177146,0.409154,0.10083,0.312869,2


In [8]:
# Prepare the NLP out-of-fold probabilities for stacking:
#   * drop the hard-label column "nlp_pred",
#   * rename nlp_proba_1..4 to nlp_1..4,
#   * attach the AdoptionSpeed target from train_df by PetID.
# The cell reassigns df_oof_nlp, so it must tolerate being executed twice: errors="ignore"
# avoids a KeyError once "nlp_pred" is already gone, and dropping a previously merged
# "AdoptionSpeed" prevents the second merge from producing AdoptionSpeed_x / AdoptionSpeed_y.
df_oof_nlp = (
    df_oof_nlp
    .drop(columns=["nlp_pred", "AdoptionSpeed"], errors="ignore")
    .rename(columns={"nlp_proba_1": "nlp_1", "nlp_proba_2": "nlp_2", "nlp_proba_3": "nlp_3", "nlp_proba_4": "nlp_4"})
    .merge(train_df[["PetID", "AdoptionSpeed"]], on="PetID", how="left")
)
df_oof_nlp.head()


Unnamed: 0,PetID,nlp_1,nlp_2,nlp_3,nlp_4,AdoptionSpeed
0,d3b4f29f8,0.046295,0.255541,0.57276,0.125404,2
1,e9dc82251,0.031043,0.137292,0.757052,0.074613,2
2,8111f6d4a,0.267836,0.447975,0.159992,0.124198,2
3,693a90fda,0.004819,0.990378,0.004433,0.000371,2
4,9d08c85ef,0.409939,0.294647,0.139724,0.155689,2


In [9]:
# Rename the CNN test probabilities pred_0..pred_3 to cnn_1..cnn_4.
cnn_test_renames = {f"pred_{k}": f"cnn_{k + 1}" for k in range(4)}
df_test_cnn = df_test_cnn.rename(columns=cnn_test_renames)
df_test_cnn.head()

Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4
0,6697a7f62,0.078886,0.134657,0.144137,0.64232
1,23b64fe21,0.230519,0.412668,0.234642,0.122172
2,41e824cbe,0.057262,0.097586,0.103254,0.741899
3,6c3d7237b,0.15419,0.250104,0.244672,0.351034
4,97b0b5d92,0.178609,0.249129,0.134938,0.437325


In [10]:
# Keep only the NLP class probabilities for the test set and rename nlp_proba_1..4 to nlp_1..4.
# errors="ignore" makes the cell safe to re-run: once "nlp_pred" has been dropped, a plain
# drop would raise KeyError on the second execution.
df_test_nlp = (
    df_test_nlp
    .drop(columns=["nlp_pred"], errors="ignore")
    .rename(columns={"nlp_proba_1": "nlp_1",
                     "nlp_proba_2": "nlp_2",
                     "nlp_proba_3": "nlp_3",
                     "nlp_proba_4": "nlp_4"})
)
df_test_nlp.head()

Unnamed: 0,PetID,nlp_1,nlp_2,nlp_3,nlp_4
0,6697a7f62,0.186414,0.52398,0.277891,0.011716
1,23b64fe21,0.192035,0.273034,0.241004,0.293928
2,41e824cbe,0.079121,0.136374,0.418947,0.365558
3,6c3d7237b,0.118374,0.193876,0.083084,0.604667
4,97b0b5d92,0.244353,0.526338,0.110799,0.11851


In [11]:
# Prepare the tabular out-of-fold probabilities for stacking:
#   * rename pred_0..pred_3 to tab_1..tab_4,
#   * attach the AdoptionSpeed target from train_df by PetID.
# The cell reassigns df_oof_tabular; dropping a previously merged "AdoptionSpeed" first keeps a
# second execution from producing AdoptionSpeed_x / AdoptionSpeed_y, which would break the later
# merge on ["PetID", "AdoptionSpeed"].
df_oof_tabular = (
    df_oof_tabular
    .drop(columns=["AdoptionSpeed"], errors="ignore")
    .rename(columns={"pred_0": "tab_1",
                     "pred_1": "tab_2",
                     "pred_2": "tab_3",
                     "pred_3": "tab_4"})
    .merge(train_df[["PetID", "AdoptionSpeed"]], on="PetID", how="left")
)
df_oof_tabular.head()

Unnamed: 0,PetID,tab_1,tab_2,tab_3,tab_4,AdoptionSpeed
0,d3b4f29f8,0.142903,0.372078,0.303965,0.181055,2
1,e9dc82251,0.237147,0.320429,0.236537,0.205887,2
2,8111f6d4a,0.214448,0.240812,0.174104,0.370636,2
3,693a90fda,0.185045,0.303221,0.178406,0.333328,2
4,9d08c85ef,0.177462,0.299493,0.22522,0.297825,2


In [12]:
# Rename the tabular test probabilities pred_0..pred_3 to tab_1..tab_4.
tab_test_renames = {f"pred_{k}": f"tab_{k + 1}" for k in range(4)}
df_test_tabular = df_test_tabular.rename(columns=tab_test_renames)
df_test_tabular.head()

Unnamed: 0,PetID,tab_1,tab_2,tab_3,tab_4
0,6697a7f62,0.211756,0.356579,0.178026,0.253638
1,23b64fe21,0.31765,0.234815,0.18457,0.262964
2,41e824cbe,0.122476,0.260343,0.252516,0.364666
3,6c3d7237b,0.13464,0.136507,0.146309,0.582543
4,97b0b5d92,0.174015,0.349569,0.127205,0.349212


In [13]:
# Start the stacked training frame: CNN OOF rows, left-joined with the NLP OOF columns.
df_train = pd.merge(
    df_oof_cnn,
    df_oof_nlp,
    how="left",
    on=["PetID", "AdoptionSpeed"],
)
df_train.head()

Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4,AdoptionSpeed,nlp_1,nlp_2,nlp_3,nlp_4
0,d3b4f29f8,0.128506,0.142094,0.249124,0.480276,2,0.046295,0.255541,0.57276,0.125404
1,e9dc82251,0.469104,0.367073,0.076473,0.08735,2,0.031043,0.137292,0.757052,0.074613
2,8111f6d4a,0.0311,0.185989,0.384119,0.398792,2,0.267836,0.447975,0.159992,0.124198
3,693a90fda,0.243184,0.437397,0.172976,0.146443,2,0.004819,0.990378,0.004433,0.000371
4,9d08c85ef,0.177146,0.409154,0.10083,0.312869,2,0.409939,0.294647,0.139724,0.155689


In [14]:
# Left-join the tabular OOF columns onto the stacked training frame.
df_train = pd.merge(
    df_train,
    df_oof_tabular,
    how="left",
    on=["PetID", "AdoptionSpeed"],
)
df_train.head()

Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4,AdoptionSpeed,nlp_1,nlp_2,nlp_3,nlp_4,tab_1,tab_2,tab_3,tab_4
0,d3b4f29f8,0.128506,0.142094,0.249124,0.480276,2,0.046295,0.255541,0.57276,0.125404,0.142903,0.372078,0.303965,0.181055
1,e9dc82251,0.469104,0.367073,0.076473,0.08735,2,0.031043,0.137292,0.757052,0.074613,0.237147,0.320429,0.236537,0.205887
2,8111f6d4a,0.0311,0.185989,0.384119,0.398792,2,0.267836,0.447975,0.159992,0.124198,0.214448,0.240812,0.174104,0.370636
3,693a90fda,0.243184,0.437397,0.172976,0.146443,2,0.004819,0.990378,0.004433,0.000371,0.185045,0.303221,0.178406,0.333328
4,9d08c85ef,0.177146,0.409154,0.10083,0.312869,2,0.409939,0.294647,0.139724,0.155689,0.177462,0.299493,0.22522,0.297825


In [15]:
# Start the stacked test frame: CNN test rows, left-joined with the NLP test columns by PetID.
df_test = pd.merge(
    df_test_cnn,
    df_test_nlp,
    how="left",
    on=["PetID"],
)
df_test.head()

Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4,nlp_1,nlp_2,nlp_3,nlp_4
0,6697a7f62,0.078886,0.134657,0.144137,0.64232,0.186414,0.52398,0.277891,0.011716
1,23b64fe21,0.230519,0.412668,0.234642,0.122172,0.192035,0.273034,0.241004,0.293928
2,41e824cbe,0.057262,0.097586,0.103254,0.741899,0.079121,0.136374,0.418947,0.365558
3,6c3d7237b,0.15419,0.250104,0.244672,0.351034,0.118374,0.193876,0.083084,0.604667
4,97b0b5d92,0.178609,0.249129,0.134938,0.437325,0.244353,0.526338,0.110799,0.11851


In [16]:
# Left-join the tabular test columns onto the stacked test frame by PetID.
df_test = pd.merge(
    df_test,
    df_test_tabular,
    how="left",
    on=["PetID"],
)
df_test.head()

Unnamed: 0,PetID,cnn_1,cnn_2,cnn_3,cnn_4,nlp_1,nlp_2,nlp_3,nlp_4,tab_1,tab_2,tab_3,tab_4
0,6697a7f62,0.078886,0.134657,0.144137,0.64232,0.186414,0.52398,0.277891,0.011716,0.211756,0.356579,0.178026,0.253638
1,23b64fe21,0.230519,0.412668,0.234642,0.122172,0.192035,0.273034,0.241004,0.293928,0.31765,0.234815,0.18457,0.262964
2,41e824cbe,0.057262,0.097586,0.103254,0.741899,0.079121,0.136374,0.418947,0.365558,0.122476,0.260343,0.252516,0.364666
3,6c3d7237b,0.15419,0.250104,0.244672,0.351034,0.118374,0.193876,0.083084,0.604667,0.13464,0.136507,0.146309,0.582543
4,97b0b5d92,0.178609,0.249129,0.134938,0.437325,0.244353,0.526338,0.110799,0.11851,0.174015,0.349569,0.127205,0.349212


In [17]:
# Sanity checks on the stacked training frame: row count, one row per PetID, no missing targets.
expected_rows = 6431
assert len(df_train) == expected_rows
assert df_train["PetID"].nunique() == expected_rows
assert not df_train["AdoptionSpeed"].isna().any()

In [18]:
# Print column dtypes and non-null counts of the merged test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1887 entries, 0 to 1886
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PetID   1887 non-null   object 
 1   cnn_1   1887 non-null   float64
 2   cnn_2   1887 non-null   float64
 3   cnn_3   1887 non-null   float64
 4   cnn_4   1887 non-null   float64
 5   nlp_1   1887 non-null   float64
 6   nlp_2   1887 non-null   float64
 7   nlp_3   1887 non-null   float64
 8   nlp_4   1887 non-null   float64
 9   tab_1   1887 non-null   float64
 10  tab_2   1887 non-null   float64
 11  tab_3   1887 non-null   float64
 12  tab_4   1887 non-null   float64
dtypes: float64(12), object(1)
memory usage: 191.8+ KB


In [19]:
# Inspect the submission file currently present in the working directory.
# NOTE: this notebook only writes submission.csv in the stacking cell further down, so whatever
# is shown here was produced by an earlier run and may be stale; on a fresh checkout the file
# does not exist yet, which previously made this cell fail with FileNotFoundError.
submission_path = Path("submission.csv")
if submission_path.exists():
    sub = pd.read_csv(submission_path)
    print("submission rows:", len(sub))
    print(sub.head())
    print(sub.tail())
else:
    # Keep `sub` defined (empty, with the submission columns) so later cells that use it still run.
    sub = pd.DataFrame(columns=["PetID", "AdoptionSpeed"])
    print(f"{submission_path} not found - run the stacking cell below to create it.")

submission rows: 1887
       PetID  AdoptionSpeed
0  6697a7f62              1
1  23b64fe21              2
2  41e824cbe              4
3  6c3d7237b              4
4  97b0b5d92              1
          PetID  AdoptionSpeed
1882  986e26eeb              4
1883  9b2316d19              4
1884  c60193e34              2
1885  4f7a70728              4
1886  9e758c0b0              1


In [20]:
# Count repeated PetIDs in the merged test frame and in the loaded submission.
for label, frame in (("df_test duplicates:", df_test), ("sub duplicates:", sub)):
    print(label, frame["PetID"].duplicated().sum())

df_test duplicates: 0
sub duplicates: 0


In [21]:
# Count repeated PetIDs in the individual CNN and NLP test frames.
for label, frame in (("df_test_cnn dup:", df_test_cnn), ("df_test_nlp dup:", df_test_nlp)):
    print(label, frame["PetID"].duplicated().sum())

df_test_cnn dup: 0
df_test_nlp dup: 0


In [22]:
# Feature matrices built from every merged meta-feature column, with the target shifted by -1.
# Note: X_test is reassigned from FEATURES in the stacking cell that follows.
X_train = df_train.drop(["PetID", "AdoptionSpeed"], axis="columns")
y_train = df_train["AdoptionSpeed"].sub(1)

X_test = df_test.drop("PetID", axis="columns")


In [23]:
# -------------------------
# 1) Meta-features and target
# -------------------------
# The stacker is fed the per-class probabilities of the base models selected by FEATURE_PREFIXES.
# NOTE(review): the tab_* columns are merged into df_train / df_test above but are not selected
# here; add "tab_" to FEATURE_PREFIXES if the tabular model should be stacked too - confirm intent.
N_CLASSES = 4
FEATURE_PREFIXES = ("cnn_", "nlp_")

FEATURES = [c for c in df_train.columns if c.startswith(FEATURE_PREFIXES)]
expected_n_features = N_CLASSES * len(FEATURE_PREFIXES)
assert len(FEATURES) == expected_n_features, (
    f"Expected {expected_n_features} features, got {len(FEATURES)}: {FEATURES}"
)

X = df_train[FEATURES].copy()
y = df_train["AdoptionSpeed"].astype(int).values  # 1..4
y0 = y - 1  # -> 0..3 (for CatBoost)

X_test = df_test[FEATURES].copy()

# -------------------------
# 2) CV OOF for sanity + final fit
# -------------------------
SEED = 42
N_SPLITS = 5

oof_pred0 = np.zeros(len(df_train), dtype=int)
oof_proba = np.zeros((len(df_train), N_CLASSES), dtype=np.float32)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

params = dict(
    loss_function="MultiClass",
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3.0,          # (weight decay analogue)
    random_strength=1.0,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    eval_metric="WKappa",
    auto_class_weights="Balanced",
    task_type="CPU",
    random_seed=SEED,
    verbose=False,
)

fold_qwks = []
fold_tree_counts = []  # trees kept per fold after early stopping; used to size the final model
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y0), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y0[tr_idx], y0[va_idx]

    train_pool = Pool(X_tr, y_tr)
    valid_pool = Pool(X_va, y_va)

    # The validation fold both drives early stopping and is scored below, so the per-fold
    # QWK is somewhat optimistic.
    model = CatBoostClassifier(**params)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True,
        early_stopping_rounds=200,
        verbose=200,
    )
    # With use_best_model=True the model is shrunk to the best iteration, so tree_count_ is
    # the number of boosting rounds that actually produced this fold's predictions.
    fold_tree_counts.append(model.tree_count_)

    proba_va = model.predict_proba(X_va)
    pred_va0 = np.argmax(proba_va, axis=1)

    oof_pred0[va_idx] = pred_va0
    oof_proba[va_idx] = proba_va

    qwk_fold = cohen_kappa_score((y_va + 1), (pred_va0 + 1), weights="quadratic")
    fold_qwks.append(qwk_fold)
    print(f"Fold {fold}: QWK = {qwk_fold:.4f}")

print("\n===== QWK SUMMARY =====")
print("fold qwks:", [round(x, 4) for x in fold_qwks])
print(f"mean QWK: {np.mean(fold_qwks):.4f}")
print(f"best QWK: {np.max(fold_qwks):.4f} (fold {int(np.argmax(fold_qwks)) + 1})")
print(f"worst QWK: {np.min(fold_qwks):.4f} (fold {int(np.argmin(fold_qwks)) + 1})")

# OOF QWK over all rows (computed on the 1..4 label scale); computed once and reported
# under both labels that were printed before.
oof_pred_1to4 = oof_pred0 + 1
y_1to4 = y0 + 1
overall_qwk = cohen_kappa_score(y_1to4, oof_pred_1to4, weights="quadratic")
print(f"OVERALL OOF QWK: {overall_qwk:.4f}")

qwk_oof = overall_qwk
print(f"\nOOF QWK (CatBoost stacking): {qwk_oof:.4f}")


# Save OOF meta features (optional, but useful)
df_cb_oof = pd.DataFrame({
    "PetID": df_train["PetID"].values,
    **{f"cb_{k + 1}": oof_proba[:, k] for k in range(N_CLASSES)},
})
df_cb_oof.to_csv("cb_oof.csv", index=False)
print("saved: cb_oof.csv")

# -------------------------
# 3) Train final model on ALL data and predict TEST
# -------------------------
# There is no held-out set here, so early stopping cannot run. Training the full
# params["iterations"] rounds would yield a model far larger than the early-stopped fold models
# whose QWK is reported above, so the final model is sized to the median fold tree count.
final_iterations = max(1, int(np.median(fold_tree_counts)))
print(f"final model iterations (median of CV folds {fold_tree_counts}): {final_iterations}")

final_params = {**params, "iterations": final_iterations}
final_model = CatBoostClassifier(**final_params)
final_model.fit(Pool(X, y0), verbose=200)

test_proba = final_model.predict_proba(X_test)
test_pred = np.argmax(test_proba, axis=1) + 1  # back to 1..4

# Save test meta features (optional)
df_cb_test = pd.DataFrame({
    "PetID": df_test["PetID"].values,
    **{f"cb_{k + 1}": test_proba[:, k] for k in range(N_CLASSES)},
})
df_cb_test.to_csv("cb_test.csv", index=False)
print("saved: cb_test.csv")

# Final submission
submission = pd.DataFrame({
    "PetID": df_test["PetID"].values,
    "AdoptionSpeed": test_pred.astype(int),
})
submission.to_csv("submission.csv", index=False)
print("saved: submission.csv")

0:	learn: 0.3817957	test: 0.3703606	best: 0.3703606 (0)	total: 58.9ms	remaining: 1m 57s
200:	learn: 0.4438566	test: 0.3979354	best: 0.4279337 (4)	total: 584ms	remaining: 5.22s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.4279337001
bestIteration = 4

Shrink model to first 5 iterations.
Fold 1: QWK = 0.4404
0:	learn: 0.3576851	test: 0.3999070	best: 0.3999070 (0)	total: 3.45ms	remaining: 6.89s
200:	learn: 0.4343551	test: 0.4218788	best: 0.4428680 (18)	total: 493ms	remaining: 4.41s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.4428680228
bestIteration = 18

Shrink model to first 19 iterations.
Fold 2: QWK = 0.4577
0:	learn: 0.3993418	test: 0.3370415	best: 0.3370415 (0)	total: 2.37ms	remaining: 4.75s
200:	learn: 0.4399655	test: 0.3750826	best: 0.3805041 (10)	total: 422ms	remaining: 3.78s
400:	learn: 0.4735441	test: 0.3678912	best: 0.3880141 (230)	total: 841ms	remaining: 3.35s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0

In [24]:
# Recap of the per-fold quadratic weighted kappa scores collected during cross-validation.
rounded_qwks = [round(score, 4) for score in fold_qwks]
best_fold = int(np.argmax(fold_qwks)) + 1
worst_fold = int(np.argmin(fold_qwks)) + 1

print("fold qwks:", rounded_qwks)
print(f"mean QWK: {np.mean(fold_qwks):.4f}")
print(f"best QWK: {np.max(fold_qwks):.4f} (fold {best_fold})")
print(f"worst QWK: {np.min(fold_qwks):.4f} (fold {worst_fold})")

fold qwks: [0.4404, 0.4577, 0.4073, 0.3965, 0.3813]
mean QWK: 0.4166
best QWK: 0.4577 (fold 2)
worst QWK: 0.3813 (fold 5)
