In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

from catboost import CatBoostClassifier, Pool

In [2]:
SEED = 42
N_SPLITS = 5

DATA_DIR = Path("../data")   
TRAIN_CSV = DATA_DIR / "raw/train.csv"
TEST_CSV  = DATA_DIR / "raw/test.csv"
FOLDS_CSV = DATA_DIR / "artifacts/train_folds.csv"  

ART_DIR = DATA_DIR / "artifacts"
ART_DIR.mkdir(parents=True, exist_ok=True)

OOF_OUT  = ART_DIR / "tabular_oof.csv"
TEST_OUT = ART_DIR / "tabular_test.csv"

print("train dataset:", TRAIN_CSV.exists())
print("test dataset:", TEST_CSV.exists())
print("folds dataset:", FOLDS_CSV.exists())


train dataset: True
test dataset: True
folds dataset: True


In [3]:
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)
folds_df = pd.read_csv(FOLDS_CSV)

# merge fold info (має дати 6431 рядок)
train_df = train_df.merge(folds_df[["PetID", "AdoptionSpeed", "fold"]], on=["PetID", "AdoptionSpeed"], how="inner")

# target 0..3
train_df["y"] = train_df["AdoptionSpeed"].astype(int) - 1

# safety
train_df["Description"] = train_df["Description"].fillna("").astype(str)
test_df["Description"]  = test_df["Description"].fillna("").astype(str)

In [4]:
print("train dataset:")
print(train_df.head(), "\n")
print("test dataset:")
print(test_df.head(), "\n")
print("folds dataset:")
print(folds_df.head(), "\n")

train dataset:
       PetID                                        Description  \
0  d3b4f29f8  Mayleen and Flo are two lovely adorable sister...   
1  e9dc82251  A total of 5 beautiful Tabbys available for ad...   
2  8111f6d4a  Two-and-a-half month old girl. Very manja and ...   
3  693a90fda  Neil is a healthy and active ~2-month-old fema...   
4  9d08c85ef  Gray kitten available for adoption in sungai p...   

   AdoptionSpeed  fold  y  
0              2     3  1  
1              2     3  1  
2              2     2  1  
3              2     4  1  
4              2     3  1   

test dataset:
       PetID                                        Description
0  6697a7f62  This cute little puppy is looking for a loving...
1  23b64fe21  These 3 puppies was rescued from a mechanic sh...
2  41e824cbe  Ara needs a forever home! Believe me, he's a r...
3  6c3d7237b  i rescue this homeless dog 2 years ago but my ...
4  97b0b5d92  We found him at a shopping mall at a very clea... 

folds datase

In [5]:
_re_money = re.compile(r"(\$|rm|usd|eur|pln|£|€)", re.IGNORECASE)

In [6]:
def extract_text_features(series: pd.Series) -> pd.DataFrame:
    s = series.fillna("").astype(str)

    lower = s.str.lower()

    # basic lengths
    n_chars = s.str.len()
    n_words = lower.str.split().str.len()
    
    n_hearts = lower.str.count(r"(❤️|heart|hearts)")

    # punctuation / style
    n_excl = s.str.count(r"!")
    n_dot  = s.str.count(r"\.")
    upper_ratio = (s.str.count(r"[A-Z]") / (n_chars.replace(0, 1))).astype(float)

    # patterns
    has_money = lower.str.contains(_re_money).astype(int)

    # “urgency / adoption”
    kw_urgent = lower.str.contains(r"\b(urgent|asap|please help|rescue|save)\b").astype(int)
    kw_free   = lower.str.contains(r"\b(free|no fee)\b").astype(int)
    kw_fee    = lower.str.contains(r"\b(fee|adoption fee)\b").astype(int)
    kw_spay   = lower.str.contains(r"\b(spay|neuter|steril)\b").astype(int)
    kw_vax    = lower.str.contains(r"\b(vaccin|vax|vaccinated)\b").astype(int)
    kw_train  = lower.str.contains(r"\b(trained|litter|house[- ]trained|toilet)\b").astype(int)
    is_first_person = lower.str.contains(r"\b(i|i'm|im|me|my|mine|we|our|us)\b").astype(int)

    # approximate age mentions
    has_weeks  = lower.str.contains(r"\b\d+\s*(week|weeks|wk|wks)\b").astype(int)
    has_months = lower.str.contains(r"\b\d+\s*(month|months|mo|mos)\b").astype(int)
    has_years  = lower.str.contains(r"\b\d+\s*(year|years|yr|yrs)\b").astype(int)
    kw_positive = lower.str.contains(
    r"\b(friendly|playful|gentle|sweet|loving|cuddly|affectionate|smart|good with kids|good with children)\b"
).astype(int)
    kw_negative = lower.str.contains(
    r"\b(shy|fearful|nervous|aggressive|anxious|not good with kids|not good with children|bites|nippy)\b"
).astype(int)
    kw_baby = lower.str.contains(
    r"\b(puppy|pup|puppies|kitten|kitty|kitties|baby|babies|youngster)\b"
).astype(int)

    kw_adult = lower.str.contains(
        r"\b(adult|grown|mature|senior|older)\b"
    ).astype(int)

    kw_small_cute = lower.str.contains(
        r"\b(tiny|little|small|cutie|sweetie)\b"
    ).astype(int)

    good_with_kids = lower.str.contains(
    r"\b(good|great|ok|okay|fine|friendly|gentle|likes|loves)\b.*\b(kids|kid|children|child|baby|babies)\b"
    ).astype(int)

    not_good_with_kids = lower.str.contains(
        r"\b(not good|no|avoid|cannot|can't|dont|don't)\b.*\b(kids|children|baby|babies)\b"
    ).astype(int)

    good_with_cats = lower.str.contains(
        r"\b(good|great|ok|okay|fine|likes|loves)\b.*\b(cats|cat|kitty|kittens|kitten)\b"
    ).astype(int)

    not_good_with_cats = lower.str.contains(
        r"\b(not good|no|avoid|cannot|can't|dont|don't|prey drive)\b.*\b(cats|cat|kitty|kittens|kitten)\b"
    ).astype(int)

    good_with_dogs = lower.str.contains(
        r"\b(good|great|ok|okay|fine|likes|loves)\b.*\b(dogs|dog|puppies|puppy)\b"
    ).astype(int)

    not_good_with_dogs = lower.str.contains(
        r"\b(not good|no|avoid|cannot|can't|dont|don't)\b.*\b(dogs|dog|puppies|puppy)\b"
    ).astype(int)
    strict_requirements = lower.str.contains(
    r"\b(home check|reference|references|application|application form|fenced yard|indoor only|interview|contract|adoption application)\b"
    ).astype(int)

    reason_moving = lower.str.contains(
    r"\b(moving|relocat|overseas|eviction|landlord|no longer allowed)\b"
    ).astype(int)

    reason_death = lower.str.contains(
        r"\b(owner died|passed away|deceased|owner passed)\b"
    ).astype(int)

    stray_found = lower.str.contains(
        r"\b(found|stray|street|abandoned|dumped|rescued from)\b"
    ).astype(int)


    # small numeric density
    n_digits = s.str.count(r"\d")
    digit_ratio = (n_digits / (n_chars.replace(0, 1))).astype(float)

    feats = pd.DataFrame({
        "n_chars": n_chars,
        "n_words": n_words,
        "kw_baby": kw_baby,
        "n_hearts": n_hearts,
        "n_excl": n_excl,
        "is_first_person": is_first_person,
        "n_dot": n_dot,
        "upper_ratio": upper_ratio,
        "kw_positive": kw_positive,
        "kw_negative": kw_negative,
        "kw_adult": kw_adult,
        "has_money": has_money,
        "kw_urgent": kw_urgent,
        "kw_free": kw_free,
        "kw_fee": kw_fee,
        "kw_spay": kw_spay,
        "kw_vax": kw_vax,
        "kw_train": kw_train,
        "has_weeks": has_weeks,
        "has_months": has_months,
        "has_years": has_years,
        "n_digits": n_digits,
        "digit_ratio": digit_ratio,
        "kw_small_cute": kw_small_cute,
        "good_with_kids": good_with_kids,
        "not_good_with_kids": not_good_with_kids,
        "good_with_cats": good_with_cats,
        "not_good_with_cats": not_good_with_cats,
        "good_with_dogs": good_with_dogs,
        "not_good_with_dogs": not_good_with_dogs,
        "strict_requirements": strict_requirements,
        "reason_moving": reason_moving,
        "reason_death": reason_death,
        "stray_found": stray_found,
    })

    # make sure types are numeric
    feats = feats.apply(pd.to_numeric, errors="coerce").fillna(0.0)
    return feats


In [7]:
X_train = extract_text_features(train_df["Description"])
y_train = train_df["y"].values.astype(int)

X_test  = extract_text_features(test_df["Description"])

print("X_train:", X_train.shape, "X_test:", X_test.shape)

  has_money = lower.str.contains(_re_money).astype(int)
  kw_urgent = lower.str.contains(r"\b(urgent|asap|please help|rescue|save)\b").astype(int)
  kw_free   = lower.str.contains(r"\b(free|no fee)\b").astype(int)
  kw_fee    = lower.str.contains(r"\b(fee|adoption fee)\b").astype(int)
  kw_spay   = lower.str.contains(r"\b(spay|neuter|steril)\b").astype(int)
  kw_vax    = lower.str.contains(r"\b(vaccin|vax|vaccinated)\b").astype(int)
  kw_train  = lower.str.contains(r"\b(trained|litter|house[- ]trained|toilet)\b").astype(int)
  is_first_person = lower.str.contains(r"\b(i|i'm|im|me|my|mine|we|our|us)\b").astype(int)
  has_weeks  = lower.str.contains(r"\b\d+\s*(week|weeks|wk|wks)\b").astype(int)
  has_months = lower.str.contains(r"\b\d+\s*(month|months|mo|mos)\b").astype(int)
  has_years  = lower.str.contains(r"\b\d+\s*(year|years|yr|yrs)\b").astype(int)
  kw_positive = lower.str.contains(
  kw_negative = lower.str.contains(
  kw_baby = lower.str.contains(
  kw_adult = lower.str.contains(

X_train: (6431, 34) X_test: (1891, 34)


  has_months = lower.str.contains(r"\b\d+\s*(month|months|mo|mos)\b").astype(int)
  has_years  = lower.str.contains(r"\b\d+\s*(year|years|yr|yrs)\b").astype(int)
  kw_positive = lower.str.contains(
  kw_negative = lower.str.contains(
  kw_baby = lower.str.contains(
  kw_adult = lower.str.contains(
  kw_small_cute = lower.str.contains(
  good_with_kids = lower.str.contains(
  not_good_with_kids = lower.str.contains(
  good_with_cats = lower.str.contains(
  not_good_with_cats = lower.str.contains(
  good_with_dogs = lower.str.contains(
  not_good_with_dogs = lower.str.contains(
  strict_requirements = lower.str.contains(
  reason_moving = lower.str.contains(
  reason_death = lower.str.contains(
  stray_found = lower.str.contains(


In [8]:
print("df_train rows:", len(train_df), "unique PetID:", train_df["PetID"].nunique())
print("df_test  rows:", len(test_df),  "unique PetID:", test_df["PetID"].nunique())

df_train rows: 6431 unique PetID: 6431
df_test  rows: 1891 unique PetID: 1891


In [9]:
feats = extract_text_features(train_df["Description"])
df_view = pd.concat(
    [train_df[["PetID", "AdoptionSpeed"]].reset_index(drop=True), feats.reset_index(drop=True)],
    axis=1
)
display(df_view.head(10))

  has_money = lower.str.contains(_re_money).astype(int)
  kw_urgent = lower.str.contains(r"\b(urgent|asap|please help|rescue|save)\b").astype(int)
  kw_free   = lower.str.contains(r"\b(free|no fee)\b").astype(int)
  kw_fee    = lower.str.contains(r"\b(fee|adoption fee)\b").astype(int)
  kw_spay   = lower.str.contains(r"\b(spay|neuter|steril)\b").astype(int)
  kw_vax    = lower.str.contains(r"\b(vaccin|vax|vaccinated)\b").astype(int)
  kw_train  = lower.str.contains(r"\b(trained|litter|house[- ]trained|toilet)\b").astype(int)
  is_first_person = lower.str.contains(r"\b(i|i'm|im|me|my|mine|we|our|us)\b").astype(int)
  has_weeks  = lower.str.contains(r"\b\d+\s*(week|weeks|wk|wks)\b").astype(int)
  has_months = lower.str.contains(r"\b\d+\s*(month|months|mo|mos)\b").astype(int)
  has_years  = lower.str.contains(r"\b\d+\s*(year|years|yr|yrs)\b").astype(int)
  kw_positive = lower.str.contains(
  kw_negative = lower.str.contains(
  kw_baby = lower.str.contains(
  kw_adult = lower.str.contains(

Unnamed: 0,PetID,AdoptionSpeed,n_chars,n_words,kw_baby,n_hearts,n_excl,is_first_person,n_dot,upper_ratio,...,good_with_kids,not_good_with_kids,good_with_cats,not_good_with_cats,good_with_dogs,not_good_with_dogs,strict_requirements,reason_moving,reason_death,stray_found
0,d3b4f29f8,2,355,62,0,0,0,0,6,0.019718,...,0,0,0,0,0,0,0,0,0,0
1,e9dc82251,2,304,54,1,0,0,0,5,0.032895,...,0,0,0,0,0,0,0,0,0,0
2,8111f6d4a,2,51,8,0,0,0,0,2,0.039216,...,0,0,0,0,0,0,0,0,0,0
3,693a90fda,2,202,36,1,0,0,1,4,0.019802,...,0,0,0,0,0,0,0,0,0,1
4,9d08c85ef,2,229,41,1,0,0,0,4,0.017467,...,0,0,0,1,0,0,0,0,0,0
5,45af03266,4,444,94,0,0,0,1,4,0.002252,...,0,0,0,0,0,0,0,0,0,1
6,01a07b591,1,580,111,0,0,0,1,7,0.017241,...,0,0,1,1,0,0,0,0,0,0
7,10e723583,3,690,131,1,1,0,1,8,0.027536,...,0,0,0,0,0,0,0,0,0,1
8,324266c95,2,478,97,0,0,7,1,9,0.07113,...,0,0,0,0,0,0,0,0,0,0
9,b6fbfa344,2,4500,839,1,3,0,1,54,0.026667,...,1,0,1,1,0,0,0,0,0,1


In [10]:
oof_probs = np.zeros((len(train_df), 4), dtype=np.float32)
test_probs = np.zeros((len(test_df), 4), dtype=np.float32)

fold_qwks = []
fi_parts = []
for fold in range(N_SPLITS):
    tr_idx = train_df.index[train_df["fold"] != fold].to_numpy()
    va_idx = train_df.index[train_df["fold"] == fold].to_numpy()

    X_tr, y_tr = X_train.iloc[tr_idx], y_train[tr_idx]
    X_va, y_va = X_train.iloc[va_idx], y_train[va_idx]

    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        iterations=2000,
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=3.0,
        random_seed=SEED,
        verbose=200,
        task_type="CPU",
        od_type="Iter",
        od_wait=200,
    )

    model.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        use_best_model=True
    )
    train_pool = Pool(X_tr, y_tr, feature_names=X_tr.columns.tolist())
    imp = model.get_feature_importance(train_pool, type="PredictionValuesChange")

    fi_parts.append(pd.DataFrame({
        "feature": X_tr.columns,
        "importance": imp,
        "fold": fold
    }))
    # probs
    oof_probs[va_idx] = model.predict_proba(X_va).astype(np.float32)
    test_probs += model.predict_proba(X_test).astype(np.float32) / N_SPLITS

    # QWK on fold
    va_pred = oof_probs[va_idx].argmax(axis=1)
    qwk = cohen_kappa_score(y_va + 1, va_pred + 1, weights="quadratic")
    fold_qwks.append(qwk)
    print(f"FOLD {fold} QWK: {qwk:.4f}")

fi_all = pd.concat(fi_parts, ignore_index=True)

fi_mean = (fi_all.groupby("feature", as_index=False)["importance"]
        .mean()
        .sort_values("importance", ascending=False))

print(fi_mean.head(40))

print("\n===== QWK SUMMARY =====")
print("fold qwks:", [round(x, 4) for x in fold_qwks])
print(f"mean QWK: {np.mean(fold_qwks):.4f}")
print(f"best QWK: {np.max(fold_qwks):.4f} (fold {int(np.argmax(fold_qwks))})")
print(f"worst QWK: {np.min(fold_qwks):.4f} (fold {int(np.argmin(fold_qwks))})")

# overall oof QWK по всіх 6431 рядках (це найважливіше)
oof_pred = oof_probs.argmax(axis=1)
overall_qwk = cohen_kappa_score(y_train + 1, oof_pred + 1, weights="quadratic")
print(f"OVERALL OOF QWK: {overall_qwk:.4f}")


0:	learn: 1.3809000	test: 1.3811891	best: 1.3811891 (0)	total: 58.3ms	remaining: 1m 56s
200:	learn: 1.2063402	test: 1.3124378	best: 1.3124349 (199)	total: 790ms	remaining: 7.07s
400:	learn: 1.1125344	test: 1.3087863	best: 1.3087639 (399)	total: 1.59s	remaining: 6.36s
600:	learn: 1.0394958	test: 1.3121013	best: 1.3084941 (405)	total: 2.39s	remaining: 5.56s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1.30849412
bestIteration = 405

Shrink model to first 406 iterations.
FOLD 0 QWK: 0.1375
0:	learn: 1.3816684	test: 1.3822474	best: 1.3822474 (0)	total: 4.32ms	remaining: 8.64s
200:	learn: 1.2050931	test: 1.3149536	best: 1.3149536 (200)	total: 791ms	remaining: 7.08s
400:	learn: 1.1069146	test: 1.3128328	best: 1.3113010 (348)	total: 1.53s	remaining: 6.09s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1.31130101
bestIteration = 348

Shrink model to first 349 iterations.
FOLD 1 QWK: 0.1547
0:	learn: 1.3814999	test: 1.3820764	best: 1.3820764 (0)	total: 

In [11]:
df_oof = pd.DataFrame({
    "PetID": train_df["PetID"].astype(str).values,
    "pred_0": oof_probs[:, 0],
    "pred_1": oof_probs[:, 1],
    "pred_2": oof_probs[:, 2],
    "pred_3": oof_probs[:, 3],
})
df_oof.to_csv(OOF_OUT, index=False)
print("saved:", OOF_OUT)

df_test = pd.DataFrame({
    "PetID": test_df["PetID"].astype(str).values,
    "pred_0": test_probs[:, 0],
    "pred_1": test_probs[:, 1],
    "pred_2": test_probs[:, 2],
    "pred_3": test_probs[:, 3],
})
df_test.to_csv(TEST_OUT, index=False)
print("saved:", TEST_OUT)

saved: ../data/artifacts/tabular_oof.csv
saved: ../data/artifacts/tabular_test.csv


In [12]:
print(fi_mean.head(40))

                feature  importance
33          upper_ratio   13.304679
0           digit_ratio    9.413044
22                n_dot    8.900240
20              n_chars    8.503147
10              kw_baby    7.082462
25              n_words    6.941275
31          stray_found    4.124012
16              kw_spay    4.119512
8       is_first_person    3.950867
21             n_digits    3.787898
14          kw_positive    3.173892
17             kw_train    2.993406
23               n_excl    2.955239
15        kw_small_cute    2.422638
2        good_with_dogs    2.325298
4             has_money    1.846924
5            has_months    1.739601
18            kw_urgent    1.386506
19               kw_vax    1.316993
6             has_weeks    1.218372
26   not_good_with_cats    1.170325
12              kw_free    1.047369
1        good_with_cats    1.030994
7             has_years    0.981681
24             n_hearts    0.813746
13          kw_negative    0.655876
27   not_good_with_dogs    0