## This example contains a sample script for launching models.

In [1]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np

In [2]:
# Degree of parallelism; forwarded to AutoML as `n_jobs`.
N_JOBS = 5
# Single seed reused for data generation and the train/test split.
RANDOM_STATE = 42

### Data

In [3]:
# Synthetic binary-classification problem: 2,000 rows x 20 features,
# of which 5 are informative and 8 are redundant.
X, y = make_classification(
    n_samples=2_000,
    n_features=20,
    n_informative=5,
    n_redundant=8,
    n_classes=2,
    random_state=RANDOM_STATE,
)
# Wrap the feature matrix in a DataFrame with columns some_col_0 ... some_col_19.
X = pd.DataFrame(X).add_prefix("some_col_")

In [4]:
# Turn two of the numeric columns into integer-coded "categorical" features.
cat_columns = ["some_col_0", "some_col_1"]
# OrdinalEncoder replaces each distinct value with an int32 code; assigning the
# result back overwrites the two columns of X.
# NOTE(review): the inputs are continuous floats, so nearly every row gets its
# own code (the describe() output below shows codes spanning 0..1999) — bin the
# values first if low-cardinality categories are intended.
X[cat_columns] = OrdinalEncoder(dtype=np.int32).fit_transform(X[cat_columns])

In [5]:
# Summary statistics for every column, rendered as a rich table.
display(X.describe())
# info() prints its report to stdout and returns None, so call it directly
# instead of wrapping it in display(), which would also render a stray "None".
X.info()

Unnamed: 0,some_col_0,some_col_1,some_col_2,some_col_3,some_col_4,some_col_5,some_col_6,some_col_7,some_col_8,some_col_9,some_col_10,some_col_11,some_col_12,some_col_13,some_col_14,some_col_15,some_col_16,some_col_17,some_col_18,some_col_19
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,999.5,999.5,0.466763,-0.003347,-0.32133,0.045434,-0.4984,0.481665,0.500616,-0.332416,-0.318511,-0.007366,0.010894,0.508725,-0.046793,0.029604,-0.110633,0.345648,-0.244391,0.005858
std,577.494589,577.494589,1.581978,1.002127,1.569878,1.001325,1.328187,1.702487,1.473735,1.009229,1.755839,0.979873,1.027341,1.483241,1.694137,1.579587,2.275512,1.404994,1.330875,0.995285
min,0.0,0.0,-6.153423,-4.019121,-4.714675,-3.540961,-5.6404,-4.716066,-4.394886,-5.055327,-6.197736,-3.619537,-3.330786,-5.248759,-6.340368,-5.426865,-7.262757,-4.682227,-5.853322,-3.757435
25%,499.75,499.75,-0.383653,-0.704228,-1.43057,-0.642448,-1.420988,-0.668602,-0.445927,-0.943095,-1.402269,-0.685471,-0.676563,-0.282861,-1.125539,-1.091275,-1.548974,-0.509802,-1.07369,-0.660126
50%,999.5,999.5,0.659006,-0.017209,-0.386346,0.05801,-0.558083,0.495042,0.525441,-0.372427,-0.342493,-0.010548,0.015602,0.657839,0.085869,0.051607,0.091178,0.389856,-0.168456,-0.01596
75%,1499.25,1499.25,1.527399,0.676523,0.847568,0.705146,0.662521,1.650155,1.492717,0.22537,0.752865,0.672224,0.713936,1.494471,1.144412,1.187744,1.551841,1.171661,0.612136,0.676686
max,1999.0,1999.0,6.358007,3.408051,4.893345,3.687467,4.421027,6.137377,5.9731,3.48632,6.130466,3.489648,3.215947,4.715561,4.468373,4.703177,6.534974,5.530511,4.567918,2.962214


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   some_col_0   2000 non-null   int32  
 1   some_col_1   2000 non-null   int32  
 2   some_col_2   2000 non-null   float64
 3   some_col_3   2000 non-null   float64
 4   some_col_4   2000 non-null   float64
 5   some_col_5   2000 non-null   float64
 6   some_col_6   2000 non-null   float64
 7   some_col_7   2000 non-null   float64
 8   some_col_8   2000 non-null   float64
 9   some_col_9   2000 non-null   float64
 10  some_col_10  2000 non-null   float64
 11  some_col_11  2000 non-null   float64
 12  some_col_12  2000 non-null   float64
 13  some_col_13  2000 non-null   float64
 14  some_col_14  2000 non-null   float64
 15  some_col_15  2000 non-null   float64
 16  some_col_16  2000 non-null   float64
 17  some_col_17  2000 non-null   float64
 18  some_col_18  2000 non-null   float64
 19  some_c

None

In [6]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

#### Models

In [7]:
# Add the parent directory to the import path so the `src` package below can be
# imported (assumes the notebook is run from its own folder — TODO confirm).
import sys; sys.path.append("../")

# AutoML model
from src.automl.model import AutoML

# Optional custom metric (disabled; this example passes the "roc_auc" string instead)
# from src.automl.metrics import RocAuc

# For proper logging
from src.automl.loggers import enable_logging_to_dir

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
[2024-12-02 11:37:14]


In [15]:
# Store logs in a file.
# NOTE(review): presumably this creates a timestamped run directory under
# ml_data/ (see the path inspected below) — confirm against src.automl.loggers.
enable_logging_to_dir()

# Optional: inspect the run directory (the path is specific to the recorded run).
#! ls -l ml_data/2024_12_02___11_37_13

In [9]:
# Constants for the run.

# What to solve and how to score it.
TASK = "classification"  # one of ["regression", "classification"]
METRIC = "roc_auc"  # either an sklearn metric or a custom metric

# Time budget for tuning each model.
TUNING_TIMEOUT = 30

# Ensembling switches.
STACK = True  # whether to perform stacking
BLEND = True  # whether to perform blending

In [10]:
# Build the AutoML object from the run constants defined above.
model = AutoML(
    task=TASK,
    metric=METRIC,
    tuning_timeout=TUNING_TIMEOUT,
    stack=STACK,
    blend=BLEND,
    n_jobs=N_JOBS,
)

In [11]:
# Train on the training split; the recorded output shows one model after
# another being tuned and then fitted fold by fold.
# NOTE(review): X_test / y_test are also handed to fit() and later reused for
# the final ROC AUC — confirm AutoML does not use them for model selection.
model.fit(
    X_train, y_train,
    X_test, y_test,
    save_test=True,  # presumably writes test_preds.csv per model — verify
    save_oof=True,  # presumably writes oof_preds.csv per model — verify
    save_models=True,  # presumably writes the <Model>.joblib artifacts — verify
    categorical_features=cat_columns
)

[2024-12-02 11:37:16,648] - [   MODEL    ] - 1 out of 9. LogisticRegression
[2024-12-02 11:37:16,650] - [   START    ] - Working with LogisticRegression
[2024-12-02 11:37:16,653] - [   START    ] - Tuning LogisticRegression
[2024-12-02 11:37:17,905] - [   PARAMS   ] - C=0.046415888336127774, metric=0.83259375
[2024-12-02 11:37:17,906] - [BEST PARAMS ] - {'C': 0.046415888336127774, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': 5, 'random_state': 42, 'time_series': False}
[2024-12-02 11:37:17,906] - [    END     ] - Tuning LogisticRegression
[2024-12-02 11:37:17,907] - [   START    ] - Fitting LogisticRegression
[2024-12-02 11:37:17,908] - [    FIT     ] - LogisticRegression fold 0
[2024-12-02 11:37:17,974] - [    FIT     ] - LogisticRegression fold 1
[2024-12-02 11:37:18,038] - [    FIT     ] - LogisticRegression fold 2
[2024-12-02 11:37:18,096] - [    FIT     ] - LogisticRegression fold 3
[2024-12-02 11:37:18,175] - [    FIT     ] - LogisticRegression fold 4
[2024-12-02 11:37

<src.automl.model.main.AutoML at 0xffff26bc42b0>

In [12]:
# Hold-out ROC AUC. predict() yields a 2-D array; column 1 is used as the
# score (presumably the positive-class probability — TODO confirm).
roc_auc_score(
    y_true=y_test,
    y_score=model.predict(X_test)[:, 1],
)

0.9806000000000001

In [16]:
# Show the artifacts saved for each model in the run directory.
# NOTE(review): the timestamped path is specific to the recorded run and will
# likely not exist after a fresh Restart & Run All — point it at the current run.
!tree ml_data/2024_12_02___11_37_13

[01;34mml_data/2024_12_02___11_37_13[00m
├── [01;34mBlender[00m
│   ├── Blender.joblib
│   ├── Blender.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mCatBoostClassification[00m
│   ├── CatBoostClassification.joblib
│   ├── CatBoostClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mExtraTreesClassification[00m
│   ├── ExtraTreesClassification.joblib
│   ├── ExtraTreesClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mLightGBMClassification[00m
│   ├── LightGBMClassification.joblib
│   ├── LightGBMClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mLogisticRegression[00m
│   ├── LogisticRegression.joblib
│   ├── LogisticRegression.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mRandomForestClassification[00m
│   ├── RandomForestClassification.joblib
│   ├── RandomForestClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mStacker[00m
│   ├── Stacker.j