In [None]:
# === CONFIG ===
import os
from pathlib import Path

# Root folder that holds the CSV files and the image directories.
DATA_DIR = Path(".")

# Tabular inputs.
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
TEST_CSV = DATA_DIR.joinpath("test.csv")
SAMPLE_SUB_CSV = DATA_DIR.joinpath("sample_submission.csv")

# ID column (its value matches the image file name without the extension).
ID_COL = "Id"
# Target column in the train table.
TARGET_COL = "target"

# Folder with the train images.
IMAGE_TRAIN_DIR = DATA_DIR.joinpath("train_images")
# Folder with the test images.
IMAGE_TEST_DIR = DATA_DIR.joinpath("test_images")
# File extension of the images.
IMAGE_EXT = ".jpg"

# Either "regression" or "binary".
TASK_TYPE = "regression"

# DataLoader settings used during embedding extraction.
BATCH_SIZE = 64
NUM_WORKERS = 2

# Artifacts written by the notebook.
OUTPUT_EMB_TRAIN = DATA_DIR.joinpath("train_image_embeddings.csv")
OUTPUT_EMB_TEST = DATA_DIR.joinpath("test_image_embeddings.csv")
OUTPUT_SUBMISSION = DATA_DIR.joinpath("submission_catboost_image.csv")

print("Config OK")

In [None]:
# === IMPORTS & INSTALLS ===
import sys
import numpy as np
import pandas as pd

# timm для удобного доступа к сверточным/vision моделям
try:
    import timm
except ImportError:
    !{sys.executable} -m pip install -q timm
    import timm

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
except ImportError:
    raise RuntimeError("PyTorch is required for this notebook")

try:
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool
except ImportError:
    !{sys.executable} -m pip install -q catboost
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool

from PIL import Image
from tqdm.auto import tqdm

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
# `device` stays a plain string because later cells compare it to "cuda".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

In [None]:
# === LOAD DATA ===
# Read the train and test tables from the paths set in the config cell.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# The sample submission is optional: it stays None when the file is absent.
sample_sub = None
if SAMPLE_SUB_CSV.exists():
    sample_sub = pd.read_csv(SAMPLE_SUB_CSV)

print("train shape:", train.shape)
print("test shape:", test.shape)
print("columns:", list(train.columns))

In [None]:
# === IMAGE DATASET ===
import torchvision.transforms as T

# Simple STEPA-style preprocessing: resize to a square, convert to a tensor,
# then normalise every channel with mean 0.5 and std 0.5.
img_size = 224

img_transform = T.Compose(
    [
        T.Resize(size=(img_size, img_size)),
        T.ToTensor(),
        T.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

class ImageDataset(Dataset):
    """Dataset that yields one transformed RGB image per row of a table.

    Args:
        df: Table with one row per image. Its index is reset so that positional
            indices 0..len-1 address the rows.
        image_dir: Folder that contains the image files.
        id_col: Name of the column holding the image ID. The file name is
            ``str(id) + image_ext``.
        transform: Optional callable applied to the loaded PIL image.
        image_ext: File extension appended to the ID. Defaults to the
            notebook-level ``IMAGE_EXT`` when left as ``None``.

    Each item is a dict with keys ``"image"`` (the transformed image) and
    ``"id"`` (the ID value taken from ``id_col``).
    """

    def __init__(self, df, image_dir, id_col, transform=None, image_ext=None):
        self.df = df.reset_index(drop=True)
        self.image_dir = str(image_dir)
        self.id_col = id_col
        self.transform = transform
        self.image_ext = IMAGE_EXT if image_ext is None else image_ext

    def __len__(self):
        return len(self.df)

    def _image_id(self, idx):
        """Return the ID at position ``idx`` with the column's own dtype."""
        # Select the column first. `df.iloc[idx]` builds a row Series with one
        # common dtype, so an integer ID in a frame that is otherwise numeric
        # with float columns would come back as 1.0 and yield "1.0.jpg".
        return self.df[self.id_col].iloc[idx]

    def _image_path(self, idx):
        """Return the path of the image file that belongs to row ``idx``."""
        fname = str(self._image_id(idx)) + self.image_ext
        return os.path.join(self.image_dir, fname)

    def __getitem__(self, idx):
        image_id = self._image_id(idx)
        # The context manager closes the file handle once the pixels have been
        # copied into the converted image.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return {
            "image": image,
            "id": image_id,
        }

def build_loader(df, image_dir):
    """Wrap ``df`` and its image folder in a non-shuffling DataLoader.

    Rows are read in table order (``shuffle=False``). Batch size and worker
    count come from the notebook-level ``BATCH_SIZE`` and ``NUM_WORKERS``.
    """
    return DataLoader(
        ImageDataset(df, image_dir=image_dir, id_col=ID_COL, transform=img_transform),
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
    )

# Build one loader per split, train first and then test.
train_loader, test_loader = (
    build_loader(frame, folder)
    for frame, folder in ((train, IMAGE_TRAIN_DIR), (test, IMAGE_TEST_DIR))
)

# Number of batches per split, shown as the cell output.
(len(train_loader), len(test_loader))

In [None]:
# === BACKBONE MODEL (EMBEDDINGS) ===
# Inspired by STEPA: take a pretrained vision model from timm, drop its
# classification head and use the penultimate features as embeddings.

# Swap this for the backbone your STEPA variant uses.
backbone_name = "tf_efficientnet_b0_ns"
# num_classes=0 makes the model return features instead of class scores.
model = timm.create_model(backbone_name, pretrained=True, num_classes=0)
model = model.to(device).eval()

# Check the embedding size with a single random image-shaped tensor.
with torch.no_grad():
    dummy = torch.randn(1, 3, img_size, img_size).to(device)
    dummy_emb = model(dummy)
emb_dim = dummy_emb.shape[1]
print("Embedding dim:", emb_dim)

In [None]:
# === EXTRACT EMBEDDINGS ===
def extract_embeddings(dataloader):
    """Run the backbone over ``dataloader`` and collect one embedding per item.

    Args:
        dataloader: Iterable of batches shaped as dicts with an ``"image"``
            tensor and an ``"id"`` entry (a tensor or a sequence of values).

    Returns:
        A DataFrame whose first column is ``ID_COL`` and whose remaining
        columns ``img_emb_0 .. img_emb_{d-1}`` hold the embedding values.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    all_ids = []
    all_embs = []
    for batch in tqdm(dataloader, desc="Extracting embeddings"):
        images = batch["image"].to(device)
        with torch.no_grad():
            embs = model(images)
        all_embs.append(embs.cpu().numpy())

        batch_ids = batch["id"]
        # Numeric IDs arrive collated into a tensor. Extending the list with it
        # would store 0-d tensors, which end up as "tensor(…)" text in the CSV
        # and break the later merge on ID_COL. Convert to plain Python values.
        if torch.is_tensor(batch_ids):
            batch_ids = batch_ids.cpu().tolist()
        all_ids.extend(batch_ids)

    if not all_embs:
        raise ValueError("dataloader produced no batches; there is nothing to embed")

    all_embs = np.concatenate(all_embs, axis=0)
    cols = [f"img_emb_{i}" for i in range(all_embs.shape[1])]
    emb_df = pd.DataFrame(all_embs, columns=cols)
    emb_df.insert(0, ID_COL, all_ids)
    return emb_df

# Compute both embedding tables first, then write them out.
train_emb_df = extract_embeddings(train_loader)
test_emb_df = extract_embeddings(test_loader)

for emb_frame, out_path in (
    (train_emb_df, OUTPUT_EMB_TRAIN),
    (test_emb_df, OUTPUT_EMB_TEST),
):
    emb_frame.to_csv(out_path, index=False)

# One print call with a newline separator gives the same three output lines.
print("Saved embeddings to:", OUTPUT_EMB_TRAIN, OUTPUT_EMB_TEST, sep="\n")

In [None]:
# === MERGE EMBEDDINGS WITH TABULAR DATA ===
# Reload the embeddings from disk so this cell also works from cached CSVs.
train_emb_df = pd.read_csv(OUTPUT_EMB_TRAIN)
test_emb_df = pd.read_csv(OUTPUT_EMB_TEST)

# validate="one_to_one" fails fast on duplicate IDs. Without it, duplicates
# would silently multiply rows in the merged tables.
train_merged = train.merge(train_emb_df, on=ID_COL, how="inner", validate="one_to_one")
test_merged = test.merge(test_emb_df, on=ID_COL, how="inner", validate="one_to_one")

# An inner merge drops rows without a matching embedding. For train that only
# shrinks the training set, so report it and continue.
n_train_dropped = len(train) - len(train_merged)
if n_train_dropped:
    print(f"WARNING: {n_train_dropped} train rows have no embedding and were dropped")

# For test, a dropped row means the predictions no longer line up with the
# test table (and with a sample submission of the same length), so stop here.
if len(test_merged) != len(test):
    raise ValueError(
        f"{len(test) - len(test_merged)} test rows have no embedding; "
        f"re-extract embeddings or check the {ID_COL!r} values in {OUTPUT_EMB_TEST}"
    )

print("train_merged shape:", train_merged.shape)
print("test_merged shape:", test_merged.shape)

In [None]:
# === PREPARE DATA FOR CATBOOST ===
# Candidate features: every merged train column except the ID and the target.
excluded_cols = {ID_COL, TARGET_COL}
candidate_cols = [c for c in train_merged.columns if c not in excluded_cols]

# A column that exists only in train cannot be used at prediction time, and
# selecting it from the test table would raise a KeyError. Drop it and say so.
train_only_cols = [c for c in candidate_cols if c not in test_merged.columns]
if train_only_cols:
    print("Dropping train-only columns (absent from test):", train_only_cols)
feature_cols = [c for c in candidate_cols if c in test_merged.columns]

X_train = train_merged[feature_cols].copy()
y_train = train_merged[TARGET_COL].copy()
X_test = test_merged[feature_cols].copy()

# Treat text columns as categorical. Checking only `dtype == "object"` misses
# columns stored with a pandas string dtype, so test for both kinds.
cat_cols = [
    c
    for c in X_train.columns
    if pd.api.types.is_object_dtype(X_train[c].dtype)
    or pd.api.types.is_string_dtype(X_train[c].dtype)
]

# Convert every value (missing ones included) to its string form, so the
# categorical columns contain strings only.
for c in cat_cols:
    X_train[c] = X_train[c].astype(object).map(str)
    X_test[c] = X_test[c].astype(object).map(str)

# Positional indices of the categorical columns, passed to CatBoost later.
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

print("n_features:", X_train.shape[1])
print("n_cat_features:", len(cat_cols))

In [None]:
# === TRAIN CATBOOST ===
# Fail fast on an unknown task type. Without this check, any value other than
# "regression" (including a typo) would silently train a classifier.
if TASK_TYPE not in ("regression", "binary"):
    raise ValueError(
        f'Unsupported TASK_TYPE {TASK_TYPE!r}; expected "regression" or "binary"'
    )

# Hyperparameters shared by both model types, kept in one place so the two
# branches cannot drift apart.
common_cb_params = dict(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    random_seed=42,
    verbose=200,
    task_type="GPU" if device == "cuda" else "CPU",
)

if TASK_TYPE == "regression":
    model_cb = CatBoostRegressor(loss_function="RMSE", **common_cb_params)
else:
    model_cb = CatBoostClassifier(loss_function="Logloss", **common_cb_params)

# An empty index list is passed as None, as in the original cell.
cat_features = cat_idx if cat_idx else None
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, cat_features=cat_features)

model_cb.fit(train_pool)
# NOTE(review): for the "binary" task this uses the classifier's default
# predict() output; confirm whether the competition metric expects
# probabilities instead.
test_pred = model_cb.predict(test_pool)


In [None]:
# === BUILD SUBMISSION ===
# Use the sample submission as a template only when it was loaded and already
# has the target column; otherwise build the frame from scratch.
has_test_ids = ID_COL in test_merged.columns
use_template = sample_sub is not None and TARGET_COL in sample_sub.columns

if use_template:
    sub = sample_sub.copy()
    # Overwrite the template IDs with the IDs the predictions were made for.
    if ID_COL in sub.columns and has_test_ids:
        sub[ID_COL] = test_merged[ID_COL].values
    sub[TARGET_COL] = test_pred
else:
    # The ID column (when available) comes first, then the target.
    sub_columns = {ID_COL: test_merged[ID_COL].values} if has_test_ids else {}
    sub_columns[TARGET_COL] = test_pred
    sub = pd.DataFrame(sub_columns)

sub.to_csv(OUTPUT_SUBMISSION, index=False)
print("Saved submission to:", OUTPUT_SUBMISSION)
sub.head()