# step

- problem type belirlenmesi: başlangıç için binary classification ve normal regresyon; diğerleri not implemented
- model seçilebilir olmalı. başlangıç için üç seçenek: catboost, lightgbm, xgboost
- parametre arama yöntemi seçilmeli. başlangıç için sadece optuna
- validasyon stratejisi başlangıç için classificationsa stratified kfold, değilse kfold olmalı


In [1]:
import pandas as pd
from sklearn.utils.multiclass import type_of_target
import catboost
import xgboost
import lightgbm
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import f1_score
import numpy as np
import optuna
from functools import partial
from tqdm import tqdm

%load_ext lab_black

In [2]:
# Load the two sample datasets and rename each target column to "label".
df_reg = pd.read_csv("single_column_regression.csv")
df_reg = df_reg.rename(columns={"target": "label"})
df_cf = pd.read_csv("binary_classification.csv")
df_cf = df_cf.rename(columns={"income": "label"})

In [3]:
# Encode the income brackets as integers: "<=50K" -> 1, ">50K" -> 0.
# todo: derive this encoding automatically
df_cf["label"] = df_cf["label"].map({"<=50K": 1, ">50K": 0})

In [4]:
# inputs
class CFG:
    """Run configuration, read as plain class attributes."""

    # --- required settings ---
    label = "label"  # name of the target column
    model = "catboost"  # key used to pick the boosting library
    fold = 5  # number of cross-validation splits
    trial = 1  # number of Optuna trials to run

    # --- optional settings ---
    task = None  # "regression" or "classification"; None -> auto-detect
    tune = True
    features_to_drop = list()  # column names excluded from the model inputs
    random_state = 42  # seed handed to the cross-validation splitter

In [5]:
class TaskId:
    """Integer identifiers for the supported problem types."""

    regression, classification = 0, 1

In [6]:
def detect_problem_type(train_df):
    """Return the problem type for ``train_df`` as a ``(name, id)`` pair.

    An explicit ``CFG.task`` of ``"classification"`` or ``"regression"`` is
    honoured as-is. Any other value (including ``None``) falls back to
    auto-detection with scikit-learn's ``type_of_target`` on the
    ``CFG.label`` column.

    Args:
        train_df: DataFrame containing the ``CFG.label`` column.

    Returns:
        ``("regression", TaskId.regression)`` or
        ``("classification", TaskId.classification)``.

    Raises:
        NotImplementedError: If the detected target type is neither
            ``"continuous"`` nor ``"binary"``.
    """
    if CFG.task == "classification":
        return "classification", TaskId.classification
    if CFG.task == "regression":
        return "regression", TaskId.regression

    # Inspect the label column a single time and branch on the result.
    target_type = type_of_target(train_df[CFG.label].values)
    if target_type == "continuous":
        return "regression", TaskId.regression
    if target_type == "binary":
        return "classification", TaskId.classification

    raise NotImplementedError(
        f"unsupported target type {target_type!r} for label column "
        f"{CFG.label!r}; only 'continuous' (regression) and 'binary' "
        "(classification) are implemented"
    )

In [8]:
def get_optuna_parameter_space(trial):
    """Sample one set of model hyper-parameters from ``trial``.

    The values are requested in a fixed order (learning rate, depth, number
    of estimators, early-stopping rounds) and returned as a dict keyed by
    parameter name.
    """
    # todo: expand
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 9)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 20, 50)

    return dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping_rounds,
    )

In [9]:
def map_to_model_class(task):
    """Return the estimator class for ``CFG.model`` and the given task.

    Args:
        task: ``"regression"`` selects the library's regressor; any other
            value selects its classifier.

    Returns:
        The estimator class (not an instance), e.g.
        ``catboost.CatBoostClassifier``.

    Raises:
        KeyError: If ``CFG.model`` is not one of the supported library names.
    """
    # Each library uses its own prefix for its scikit-learn style estimators:
    # xgboost.XGB*, catboost.CatBoost*, lightgbm.LGBM*.
    class_map_dict = {
        xgboost.__name__: (xgboost, "XGB"),
        catboost.__name__: (catboost, "CatBoost"),
        lightgbm.__name__: (lightgbm, "LGBM"),
    }
    try:
        model, model_name = class_map_dict[CFG.model]
    except KeyError:
        raise KeyError(
            f"unsupported model {CFG.model!r}; "
            f"expected one of {sorted(class_map_dict)}"
        ) from None

    model_type = "Regressor" if task == "regression" else "Classifier"
    return getattr(model, model_name + model_type)

In [10]:
def map_to_validation_strategy(task):
    """Build the cross-validation splitter for ``task``.

    Classification gets a ``StratifiedKFold``; everything else gets a plain
    ``KFold``. Both are shuffled and configured from ``CFG.fold`` and
    ``CFG.random_state``.
    """
    splitter_cls = StratifiedKFold if task == "classification" else KFold
    return splitter_cls(
        n_splits=CFG.fold,
        shuffle=True,
        random_state=CFG.random_state,
    )

In [11]:
def get_cat_and_num_features(df_):
    """Split the columns of ``df_`` by dtype.

    Returns a pair of column indexes: first the columns selected by the
    ``"object"``/``"category"`` dtypes, then the columns selected by the
    ``"int"``/``"float"`` dtypes.
    """
    categorical_frame = df_.select_dtypes(include=["object", "category"])
    numeric_frame = df_.select_dtypes(include=["int", "float"])
    return categorical_frame.columns, numeric_frame.columns

In [12]:
def optimize(trial, train_df):
    """Optuna objective: cross-validate one sampled parameter set.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        train_df: DataFrame holding the feature columns and the
            ``CFG.label`` column.

    Returns:
        The mean of the per-fold F1 scores on the validation splits.
    """
    task, _task_id = detect_problem_type(train_df)

    cv = map_to_validation_strategy(task)
    params = get_optuna_parameter_space(trial)
    cat_features, _num_features = get_cat_and_num_features(train_df)
    label = CFG.label

    # Model inputs: every column except the label and the dropped features.
    input_features = train_df.columns.difference(
        pd.Index(CFG.features_to_drop).union(pd.Index([label]))
    )
    # Keep only categorical columns that are actually model inputs, so an
    # object/category-typed label is never reported as a categorical feature.
    categorical = set(cat_features)
    model_cat_features = [col for col in input_features if col in categorical]

    # Loop invariants: resolve the estimator class and select the data once.
    model_class = map_to_model_class(task)
    features = train_df[input_features]
    target = train_df[label]

    # The label is handed to the splitter as strings, as before.
    cv_splits = list(cv.split(train_df.index, target.astype(str)))

    scores = []
    for split_train, split_val in tqdm(cv_splits, total=len(cv_splits)):
        model = model_class(**params)
        val_features = features.iloc[split_val]
        val_target = target.iloc[split_val]

        # NOTE(review): `cat_features` is a CatBoost `fit` argument; the other
        # libraries' estimators likely need a different keyword - confirm
        # before selecting them through CFG.model.
        model.fit(
            features.iloc[split_train],
            target.iloc[split_train],
            cat_features=model_cat_features,
            eval_set=[(val_features, val_target)],
            verbose=100,
        )

        preds = model.predict(val_features)

        # todo: eval metric - F1 is a classification metric, so regression
        # tasks still need their own metric here.
        scores.append(f1_score(val_target, preds))

    return np.mean(scores)

In [13]:
# train_df = df_cf.copy()

In [14]:
# Create an in-memory study that maximises the objective and run the
# configured number of trials on the classification dataset.
# todo: direction based on metric
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: optimize(trial, train_df=df_cf), n_trials=CFG.trial)

[32m[I 2023-03-12 17:40:09,393][0m A new study created in memory with name: no-name-ca0bc197-6ce7-4a1e-9d7e-b524a2d462b6[0m
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

0:	learn: 0.6788544	test: 0.6789308	best: 0.6789308 (0)	total: 180ms	remaining: 50.4s
100:	learn: 0.3100246	test: 0.3162796	best: 0.3162796 (100)	total: 7.38s	remaining: 13.2s
200:	learn: 0.2862236	test: 0.2967349	best: 0.2967349 (200)	total: 15.5s	remaining: 6.17s


 20%|████████████████▊                                                                   | 1/5 [00:22<01:30, 22.50s/it]

280:	learn: 0.2766304	test: 0.2908243	best: 0.2908243 (280)	total: 22.1s	remaining: 0us

bestTest = 0.2908243156
bestIteration = 280

0:	learn: 0.6755048	test: 0.6756854	best: 0.6756854 (0)	total: 89.6ms	remaining: 25.1s
100:	learn: 0.3105019	test: 0.3147526	best: 0.3147526 (100)	total: 7.51s	remaining: 13.4s
200:	learn: 0.2850666	test: 0.2963384	best: 0.2963384 (200)	total: 15.5s	remaining: 6.17s


 40%|█████████████████████████████████▌                                                  | 2/5 [00:44<01:06, 22.31s/it]

280:	learn: 0.2757713	test: 0.2916908	best: 0.2916908 (280)	total: 21.8s	remaining: 0us

bestTest = 0.2916907569
bestIteration = 280

0:	learn: 0.6755773	test: 0.6755015	best: 0.6755015 (0)	total: 81.3ms	remaining: 22.8s
100:	learn: 0.3082984	test: 0.3197663	best: 0.3197663 (100)	total: 8.29s	remaining: 14.8s
200:	learn: 0.2849245	test: 0.3031567	best: 0.3031567 (200)	total: 17.1s	remaining: 6.8s


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [01:09<00:46, 23.44s/it]

280:	learn: 0.2758013	test: 0.2983738	best: 0.2983738 (280)	total: 24.4s	remaining: 0us

bestTest = 0.2983738154
bestIteration = 280

0:	learn: 0.6775185	test: 0.6774771	best: 0.6774771 (0)	total: 73.2ms	remaining: 20.5s
100:	learn: 0.3094787	test: 0.3163580	best: 0.3163580 (100)	total: 8.31s	remaining: 14.8s
200:	learn: 0.2843521	test: 0.2978785	best: 0.2978785 (200)	total: 19.8s	remaining: 7.87s


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [01:37<00:25, 25.43s/it]

280:	learn: 0.2746539	test: 0.2921537	best: 0.2921537 (280)	total: 28s	remaining: 0us

bestTest = 0.2921537467
bestIteration = 280

0:	learn: 0.6777509	test: 0.6777178	best: 0.6777178 (0)	total: 98.7ms	remaining: 27.6s
100:	learn: 0.3113395	test: 0.3138785	best: 0.3138785 (100)	total: 10.4s	remaining: 18.5s
200:	learn: 0.2865757	test: 0.2944082	best: 0.2944082 (200)	total: 21s	remaining: 8.35s


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:06<00:00, 25.34s/it]
[32m[I 2023-03-12 17:42:16,141][0m Trial 0 finished with value: 0.9148121720807205 and parameters: {'learning_rate': 0.014893944705178191, 'max_depth': 9, 'n_estimators': 281, 'early_stopping_rounds': 42}. Best is trial 0 with value: 0.9148121720807205.[0m


280:	learn: 0.2768674	test: 0.2892037	best: 0.2892037 (280)	total: 28.3s	remaining: 0us

bestTest = 0.2892037412
bestIteration = 280



In [15]:
# Display the hyper-parameters of the best trial recorded by the study.
study.best_params

{'learning_rate': 0.014893944705178191,
 'max_depth': 9,
 'n_estimators': 281,
 'early_stopping_rounds': 42}

# schemas
from pydantic import BaseModel
from typing import List, Optional


class CFG(BaseModel):
    """Input configuration for a run.

    Every field carries an explicit type annotation so pydantic can build
    and validate the model; all fields have defaults.
    """

    # input / output locations
    train_file_path: str = "data_samples/binary_classification.csv"
    output_path: str = "output"

    # optional parameters
    test_file_path: Optional[str] = None
    task: Optional[str] = None
    idx: Optional[str] = None
    label: str = "income"
    features: Optional[List[str]] = None
    # NOTE(review): assumed to be a list of column names like `features` - confirm.
    categorical_features: Optional[List[str]] = None
    use_gpu: bool = True
    num_folds: int = 5
    seed: int = 42
    num_trials: int = 100