In [4]:
import os
import json
import joblib
import argparse
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class Preprocessor:
    def __init__(self, numeric_estimator=None):
        self.num_cols = []
        self.cat_cols = []
        self.cat_categories = {}
        self.imputer = None
        self.scaler = None
        self.numeric_estimator = numeric_estimator

    def fit(self, X: pd.DataFrame):
        self.num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

        # store categories per categorical column (train categories only)
        for c in self.cat_cols:
            vals = X[c].dropna().astype(str).unique().tolist()
            self.cat_categories[c] = vals

        # Iterative imputer with tree estimator (ExtraTreesRegressor) for numeric features
        estimator = self.numeric_estimator or ExtraTreesRegressor(n_estimators=30, n_jobs=-1, random_state=42)
        self.imputer = IterativeImputer(estimator=estimator, max_iter=10, random_state=42)
        if self.num_cols:
            self.imputer.fit(X[self.num_cols])
            self.scaler = StandardScaler()
            self.scaler.fit(self.imputer.transform(X[self.num_cols]))
        else:
            self.imputer = None
            self.scaler = None

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        Xc = X.copy()
        # categorical -> codes using training categories, unseen -> -1
        for c in self.cat_cols:
            Xc[c] = Xc[c].astype(str).where(Xc[c].notnull(), np.nan)
            Xc[c] = pd.Categorical(Xc[c], categories=self.cat_categories[c]).codes
            # pandas codes: -1 for missing/unseen
            Xc[c] = Xc[c].astype(int)

        # numeric imputation + scaling
        if self.num_cols:
            X_num = self.imputer.transform(Xc[self.num_cols])
            X_num = self.scaler.transform(X_num)
            Xc[self.num_cols] = X_num

        # return with stable column order
        return Xc[self.num_cols + self.cat_cols]

In [6]:
def save_json(obj, path):
    with open(path, 'w') as f:
        json.dump(obj, f, indent=2)

# -------------------------
# Training + Optuna
# -------------------------
def main(args):
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # load
    train = pd.read_csv(args.train)
    test = pd.read_csv(args.test)
    sample_sub = pd.read_csv(args.sample_submission)

    id_col = args.id_col
    target_col = args.target_col
    features = [c for c in train.columns if c not in [id_col, target_col]]

    X_df = train[features].copy()
    y = train[target_col].copy().astype(str)
    X_test_df = test[features].copy()

    # encode class names (save mapping)
    classes = sorted(y.unique().tolist())
    class_to_idx = {c: i for i, c in enumerate(classes)}
    y_idx = y.map(class_to_idx).values
    joblib.dump(classes, out_dir / "class_names.joblib")

    # Optuna objective
    def objective(trial):
        # hyperparams
        params = {
            "objective": "multiclass",
            "num_class": len(classes),
            "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "goss"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 20, 256),
            "max_depth": trial.suggest_int("max_depth", 4, 16),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "random_state": 42,
            "n_jobs": -1,
            "verbose": -1
        }
        if params["boosting_type"] == "gbdt":
            params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.5, 1.0)
            params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)

        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=42)
        fold_scores = []
        for fold, (tr_idx, val_idx) in enumerate(skf.split(X_df, y_idx)):
            X_tr_df = X_df.iloc[tr_idx].reset_index(drop=True)
            X_val_df = X_df.iloc[val_idx].reset_index(drop=True)
            y_tr = y_idx[tr_idx]
            y_val = y_idx[val_idx]

            # fit preprocessor on train fold only (no leakage)
            pre = Preprocessor()
            pre.fit(X_tr_df)
            X_tr = pre.transform(X_tr_df)
            X_val = pre.transform(X_val_df)

            # oversample train fold (SMOTE) to help rare classes
            try:
                sm = SMOTE(random_state=42, n_jobs=-1)
                X_tr_res, y_tr_res = sm.fit_resample(X_tr, y_tr)
            except Exception:
                # fallback if SMOTE fails
                X_tr_res, y_tr_res = X_tr, y_tr

            # model
            model = lgb.LGBMClassifier(**params, n_estimators=args.n_estimators)
            callbacks = []
            if fold == 0:
                callbacks.append(LightGBMPruningCallback(trial, "multi_logloss"))
            model.fit(
                X_tr_res, y_tr_res,
                eval_set=[(X_val, y_val)],
                eval_metric="multi_logloss",
                callbacks=callbacks,
                verbose=False
            )

            y_val_pred = model.predict(X_val)
            f1 = f1_score(y_val, y_val_pred, average="macro")
            fold_scores.append(f1)

        mean_f1 = float(np.mean(fold_scores))
        # we minimize (1 - F1)
        return 1.0 - mean_f1

    study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), study_name=args.study_name, storage=None)
    study.optimize(objective, n_trials=args.n_trials, show_progress_bar=True)

    print("Best value (1 - F1):", study.best_value)
    print("Best params:", study.best_params)
    joblib.dump(study, out_dir / "optuna_study.joblib")

    # Fit final preprocessor on full train and save
    pre_full = Preprocessor()
    pre_full.fit(X_df)
    X_full = pre_full.transform(X_df)
    X_test = pre_full.transform(X_test_df)
    joblib.dump(pre_full, out_dir / "preprocessor.joblib")

    # Retrain top-K trials on full data and save models
    top_k = args.top_k
    trials_sorted = sorted([t for t in study.trials], key=lambda t: t.value)[:top_k]
    model_paths = []
    for i, t in enumerate(trials_sorted):
        params = t.params.copy()
        params.update({"objective": "multiclass", "num_class": len(classes), "random_state": 42, "n_jobs": -1, "verbose": -1})
        # ensure bagging params exist for gbdt
        model = lgb.LGBMClassifier(**params, n_estimators=args.n_estimators)
        # try SMOTE on full (may be beneficial)
        try:
            sm = SMOTE(random_state=42, n_jobs=-1)
            X_res, y_res = sm.fit_resample(X_full, y_idx)
        except Exception:
            X_res, y_res = X_full, y_idx

        model.fit(X_res, y_res)
        path = out_dir / f"model_top_{i}.joblib"
        joblib.dump(model, path)
        model_paths.append(str(path))
        print(f"Saved model {path}")

    joblib.dump(model_paths, out_dir / "model_paths.joblib")
    print("All done. Artifacts in:", out_dir)

    # optional: build averaged predictions and save submission
    probs = None
    for p in model_paths:
        m = joblib.load(p)
        pr = m.predict_proba(X_test)
        probs = pr if probs is None else probs + pr
    probs = probs / len(model_paths)
    pred_idx = np.argmax(probs, axis=1)
    preds = [classes[i] for i in pred_idx]

    submission = pd.DataFrame({id_col: test[id_col]})
    for cls in classes:
        submission[cls] = 0
    for i, cls in enumerate(preds):
        submission.loc[i, cls] = 1
    out_sub = out_dir / "submission_ensemble.csv"
    submission.to_csv(out_sub, index=False)
    print("Saved ensemble submission:", out_sub)

In [7]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default="train.csv")
    parser.add_argument("--test", default="test.csv")
    parser.add_argument("--sample_submission", default="sample_submission.csv")
    parser.add_argument("--output_dir", default="artifacts")
    parser.add_argument("--id_col", default="idx")
    parser.add_argument("--target_col", default="type")
    parser.add_argument("--n_trials", type=int, default=60)
    parser.add_argument("--n_splits", type=int, default=5)
    parser.add_argument("--n_estimators", type=int, default=1500)
    parser.add_argument("--top_k", type=int, default=3)
    parser.add_argument("--study_name", type=str, default="astro_study")
    args = parser.parse_args()
    main(args)

usage: ipykernel_launcher.py [-h] [--train TRAIN] [--test TEST]
                             [--sample_submission SAMPLE_SUBMISSION]
                             [--output_dir OUTPUT_DIR] [--id_col ID_COL]
                             [--target_col TARGET_COL] [--n_trials N_TRIALS]
                             [--n_splits N_SPLITS]
                             [--n_estimators N_ESTIMATORS] [--top_k TOP_K]
                             [--study_name STUDY_NAME]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\akaze\AppData\Roaming\jupyter\runtime\kernel-v388c161f59f508fa1ff27a0ce1de1c2985dde2a14.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
