In [2]:
import os
import shutil
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set (to any
# value, including an empty string); used below to choose the data paths.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ


def download_from_kaggle(competition):
    """Download and unpack a Kaggle competition archive into ``data/``.

    Nothing happens when ``data/<competition>.zip`` already exists. Otherwise
    the ``kaggle`` command-line tool is invoked to download the archive, the
    archive is extracted into ``data/`` and the zip itself is moved there so
    that the next call is a no-op.

    Parameters
    ----------
    competition : str
        Competition slug passed to ``kaggle competitions download -c``.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` command exits with a non-zero status.
    FileNotFoundError
        If the ``kaggle`` executable cannot be found.
    """
    data_dir = Path("data")
    archive_name = competition + ".zip"

    if (data_dir / archive_name).is_file():
        return

    # Pass the arguments as a list so the slug is never re-split on
    # whitespace, and fail right here (check=True) instead of surfacing later
    # as a confusing "not a zip file" error from unpack_archive.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition],
        check=True,
    )

    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, str(data_dir))
    # Keep the archive next to the extracted files; its presence is what the
    # guard above checks for.
    shutil.move(archive_name, str(data_dir))


In [3]:
competition = "playground-series-s3e12"

# Choose where the competition CSVs live: the Kaggle input directory when
# running on Kaggle, otherwise the local "data" folder, which
# download_from_kaggle fills on first use.
if ON_KAGGLE:
    train_path = f"/kaggle/input/{competition}/train.csv"
    test_path = f"/kaggle/input/{competition}/test.csv"
else:
    download_from_kaggle(competition)
    train_path = "data/train.csv"
    test_path = "data/test.csv"

# Both competition files are indexed by "id"; column names are title-cased.
train, test = (
    pd.read_csv(csv_path, index_col="id").rename(columns=str.title)
    for csv_path in (train_path, test_path)
)

# NOTE(review): this file is read from a relative path on both branches and
# nothing visible in this notebook creates it - confirm where it comes from,
# in particular when running on Kaggle.
orig = pd.read_csv("data/original.csv").rename(columns=str.title)

# Stack the competition rows on top of the original rows (dropping the old
# index), then shuffle every row with a fixed seed.
train_extended = pd.concat([train, orig], ignore_index=True).sample(
    n=len(train) + len(orig), random_state=42
)

In [6]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer


# "Calc" is square-rooted (np.square is registered as the inverse) and then
# standardised; "Gravity" is standardised only.
sqrt_step = FunctionTransformer(
    func=np.sqrt,
    inverse_func=np.square,
    feature_names_out="one-to-one",
)
calc_pipeline = make_pipeline(sqrt_step, StandardScaler())
gravity_scaler = StandardScaler()

preprocess = make_column_transformer(
    (calc_pipeline, ["Calc"]),
    (gravity_scaler, ["Gravity"]),
    # Keep the plain column names instead of "<transformer>__<column>".
    verbose_feature_names_out=False,
)

# Separate the label column from the feature columns.
X = train_extended.drop(columns="Target")
y = train_extended["Target"]

# Quick look at the transformed features. The transformer is re-fitted in the
# later cells, so this fit only serves the preview.
X_preview = preprocess.fit_transform(X)
pd.DataFrame(X_preview, columns=preprocess.get_feature_names_out()).head()


Unnamed: 0,Calc,Gravity
0,-0.322173,0.454656
1,1.195081,0.898802
2,-0.407707,0.454656
3,-1.124181,-1.618025
4,-0.590571,1.046851


In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from pygam import LogisticGAM, s


# Ensemble members. The GAM's spline terms address the preprocessed matrix by
# position: column 0 is "Calc" and column 1 is "Gravity", the order in which
# they are listed in `preprocess`.
calc_term = s(0, n_splines=21, lam=0.3, spline_order=2)
grav_term = s(1, n_splines=24, lam=1.2, penalties="l2", basis="cp")
gam = LogisticGAM(calc_term + grav_term)  # type: ignore

# probability=True is required for SVC.predict_proba.
svc = SVC(random_state=42, gamma=0.3, C=0.9, probability=True)
lgbm = LGBMClassifier(random_state=42, max_depth=1, n_estimators=280, learning_rate=0.1)

rskfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
# Size the score buffer from the splitter itself (n_splits * n_repeats) so it
# cannot drift out of sync if either setting above is changed.
scores = np.empty(rskfold.get_n_splits())

for k, (train_ids, valid_ids) in enumerate(rskfold.split(X, y)):

    y_train, y_valid = y.iloc[train_ids], y.iloc[valid_ids]

    # Fit the preprocessing on the training fold only and reuse the fitted
    # statistics for the validation fold.
    X_train = preprocess.fit_transform(X.iloc[train_ids])
    X_valid = preprocess.transform(X.iloc[valid_ids])

    gam.fit(X_train, y_train)
    svc.fit(X_train, y_train)
    lgbm.fit(X_train, y_train)

    # Unweighted average of the three models' scores. The GAM output is used
    # as returned; for the other two, column 1 of predict_proba is taken.
    y_probas = (
        gam.predict_proba(X_valid)
        + svc.predict_proba(X_valid)[:, 1]
        + lgbm.predict_proba(X_valid)[:, 1]
    ) / 3

    scores[k] = roc_auc_score(y_valid, y_probas)

# Mean ROC AUC over all folds and repeats.
scores.mean()


0.8277419174740602

In [11]:
# Re-fit the preprocessing and all three models on every labelled row, then
# score the test set with the same averaged ensemble.
X_train_full = preprocess.fit_transform(train_extended.drop(columns="Target"))
y_train_full = train_extended["Target"]
X_test_full = preprocess.transform(test)

for model in (gam, svc, lgbm):
    model.fit(X_train_full, y_train_full)

gam_probas = gam.predict_proba(X_test_full)
svc_probas = svc.predict_proba(X_test_full)[:, 1]
lgbm_probas = lgbm.predict_proba(X_test_full)[:, 1]
y_probas = (gam_probas + svc_probas + lgbm_probas) / 3

# One "target" column keyed by the test set's index, written under the
# "id" header.
submission = pd.DataFrame(
    {"target": y_probas},
    index=pd.Index(test.index, name="id"),
)

submission.to_csv("submission.csv")
submission.head()


Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
414,0.10829
415,0.378661
416,0.774
417,0.378661
418,0.326533
