## This example contains a sample script for feature processing and launching models

In [35]:
import re
from pprint import pp

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier

In [2]:
# Seed reused for row sampling and the train/test split.
RANDOM_STATE = 42
# Passed as `n_jobs` to the feature selector and to AutoML.
N_JOBS = 5
# Name of the target column in the dataset.
TARGET = "satisfaction"

### Data

In [3]:
# Read the training CSV, drop the "Unnamed: 0" column, down-sample to 10,000
# rows (seeded) and add a constant column `cnst`.
df = (
    pd.read_csv("../data/airlines_train.csv")
    .drop(columns="Unnamed: 0")
    .sample(n=10_000, random_state=RANDOM_STATE)
    .assign(cnst=1)
)
df.info()

# Separate the feature matrix from the target column.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Strip the "-" symbol from feature names (not processed by catboost).
X = X.rename(columns=lambda col: re.sub('-', '', col))
# Encode the target labels as integers.
y = LabelEncoder().fit_transform(y)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Give both splits a fresh positional index.
X_test = X_test.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 80638 to 16420
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 10000 non-null  int64  
 1   Gender                             10000 non-null  object 
 2   Customer Type                      10000 non-null  object 
 3   Age                                10000 non-null  int64  
 4   Type of Travel                     10000 non-null  object 
 5   Class                              10000 non-null  object 
 6   Flight Distance                    10000 non-null  int64  
 7   Inflight wifi service              10000 non-null  int64  
 8   Departure/Arrival time convenient  10000 non-null  int64  
 9   Ease of Online booking             10000 non-null  int64  
 10  Gate location                      10000 non-null  int64  
 11  Food and drink                     10000 non-null 

In [4]:
import sys

# Make the parent directory and its `src/` subdirectory importable.
sys.path.extend(["../", "../src/"])

from src.automl.feature_processing import PreprocessingPipeline, ValTestsPipeline, CatboostShapFeatureSelector

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
[2024-12-09 11:54:27]


In [5]:
# Feature preprocessing pipeline. "oe" is presumably the ordinal encoder
# (later columns carry an `OrdinalEncoder__` prefix) — confirm against
# PreprocessingPipeline.
feat_pipe = PreprocessingPipeline(obj_encoders=["oe"])
# Validation-tests pipeline built with its default settings.
val_test_pipe = ValTestsPipeline()

[2024-12-09 11:54:28,204] - [  PREPROC   ] - Успешно заданы шаги pipeline
[2024-12-09 11:54:28,206] - [ VAL TESTS  ] - Успешно заданы шаги pipeline


#### Feature pipeline

In [6]:
# Fit the preprocessing pipeline on the training split, then apply the
# already-fitted pipeline to the test split.
train_shape_before = X_train.shape
print("Initial train data shape", train_shape_before)

X_train = feat_pipe.fit_transform(X_train, y_train)
train_shape_after = X_train.shape
print("Train data shape after pipeline", train_shape_after)

X_test = feat_pipe.transform(X_test)

Initial train data shape (8000, 24)
[2024-12-09 11:54:28,225] - [Pipeline] .. (step 1 of 6) Processing nan_cols_dropper, total=   0.0
[2024-12-09 11:54:28,253] - [Pipeline] ....... (step 2 of 6) Processing nan_imputer, total=   0.0
[2024-12-09 11:54:28,269] - [  PREPROC   ] - QConstant features to drop: ['cnst']
[2024-12-09 11:54:28,272] - [Pipeline] .... (step 3 of 6) Processing qconst_dropper, total=   0.0
[2024-12-09 11:54:28,285] - [  PREPROC   ] - Corr features to drop: ['Arrival Delay in Minutes', 'Arrival Delay in Minutes']
[2024-12-09 11:54:28,286] - [Pipeline] . (step 4 of 6) Processing corr_cols_dropper, total=   0.0
[2024-12-09 11:54:28,324] - [Pipeline] .... (step 5 of 6) Processing outlier_capper, total=   0.0
[2024-12-09 11:54:28,358] - [Pipeline] ... (step 6 of 6) Processing feature_encoder, total=   0.0
Train data shape after pipeline (8000, 22)


#### ValTest pipeline

In [7]:
# Add a synthetic feature whose distribution is deliberately shifted between
# train (uniform on [0, 10)) and test (uniform on [5, 15)), so the validation
# tests that follow have a drifted column to detect.
# Draw from a generator seeded with RANDOM_STATE so the values are the same on
# every run, instead of using the unseeded global NumPy RNG.
rng = np.random.default_rng(RANDOM_STATE)
X_train["bad_feature"] = rng.uniform(0, 10, size=X_train.shape[0])
X_test["bad_feature"] = rng.uniform(5, 15, size=X_test.shape[0])

In [8]:
# Fit the validation tests on the train and test feature frames, then apply
# the fitted pipeline to the test split as well.
shape_before_val_tests = X_train.shape
print("Initial train data shape", shape_before_val_tests)

X_train = val_test_pipe.fit_transform(X_train, X_test)
shape_after_val_tests = X_train.shape
print("Train data shape after pipeline", shape_after_val_tests)

X_test = val_test_pipe.transform(X_test)

Initial train data shape (8000, 23)
[2024-12-09 11:54:28,811] - [ VAL TESTS  ] - Features not passing psi test to drop: ['bad_feature']
[2024-12-09 11:54:28,812] - [Pipeline] .......... (step 1 of 2) Processing PSI_test, total=   0.3
[2024-12-09 11:54:28,947] - [Pipeline] .. (step 2 of 2) Processing Adversarial_test, total=   0.1
Train data shape after pipeline (8000, 22)


In [9]:
# Categorical features are the columns whose names start with one of the
# encoder prefixes ("OneHotEncoder" / "OrdinalEncoder").
is_one_hot = X_train.columns.str.startswith("OneHotEncoder")
is_ordinal = X_train.columns.str.startswith("OrdinalEncoder")
cat_features = X_train.columns[is_one_hot | is_ordinal].tolist()

In [10]:
# Display the detected categorical feature names.
cat_features

['OrdinalEncoder__Gender',
 'OrdinalEncoder__Customer Type',
 'OrdinalEncoder__Type of Travel',
 'OrdinalEncoder__Class']

#### Select features

In [11]:
# Select 10 features with the CatBoost/SHAP-based selector.
selector = CatboostShapFeatureSelector(n_features_to_select=10, n_jobs=N_JOBS, steps=5)

# Fit the selector on the training split, then keep the same columns in test.
X_train = selector.fit_transform(X_train, y_train, categorical_features=cat_features)
X_test = selector.transform(X_test)

# Recompute cat_features so it only lists encoded columns that survived selection.
selected_columns = X_train.columns
encoded_mask = (
    selected_columns.str.startswith("OneHotEncoder")
    | selected_columns.str.startswith("OrdinalEncoder")
)
cat_features = selected_columns[encoded_mask].tolist()

[2024-12-09 11:54:30,337] - [  FEAT SEL  ] - Started feature selection.
[2024-12-09 11:55:04,990] - [  FEAT SEL  ] - Selected features: ['OrdinalEncoder__Customer Type', 'OrdinalEncoder__Type of Travel', 'OrdinalEncoder__Class', 'StandardScaler__Inflight wifi service', 'StandardScaler__Departure/Arrival time convenient', 'StandardScaler__Online boarding', 'StandardScaler__Inflight entertainment', 'StandardScaler__Baggage handling', 'StandardScaler__Checkin service', 'StandardScaler__Inflight service']


### Model

In [12]:
from src.automl.model import AutoML
from src.automl.model.models_lists import linear_models, forest_models, boosting_models, lama_models, lama_nn_models, all_models

In [25]:
# Map each display title to its model collection, then pretty-print every
# collection under its title.
model_groups = {
    "Linear models": linear_models,
    "Forest models": forest_models,
    "Boostings": boosting_models,
    "LAMA models": lama_models,
    "LAMA_NN models": lama_nn_models,
    "All models": all_models,
}
for name, cont in model_groups.items():
    print(f"{name}:")
    pp(cont)
    print("\n")

Linear models:
{'regression': [<class 'src.automl.model.linear.linear.RidgeRegression'>],
 'classification': [<class 'src.automl.model.linear.linear.LogisticRegression'>]}


Forest models:
{'regression': [<class 'src.automl.model.sklearn_forests.random_forests.RandomForestRegression'>,
                <class 'src.automl.model.sklearn_forests.extra_forests.ExtraTreesRegression'>],
 'classification': [<class 'src.automl.model.sklearn_forests.random_forests.RandomForestClassification'>,
                    <class 'src.automl.model.sklearn_forests.extra_forests.ExtraTreesClassification'>]}


Boostings:
{'regression': [<class 'src.automl.model.catboost.catboost.CatBoostRegression'>,
                <class 'src.automl.model.xgboost.xgboost.XGBRegression'>,
                <class 'src.automl.model.lightgbm.lightgbm.LightGBMRegression'>],
 'classification': [<class 'src.automl.model.catboost.catboost.CatBoostClassification'>,
                    <class 'src.automl.model.xgboost.xgboost.XGBClas

In [27]:
# Settings passed to AutoML and used to pick the candidate models.
TUNING_TIMEOUT = 60 # time budget for tuning each model (presumably seconds — confirm)
TASK = "classification" # one of ["regression", "classification"]
METRIC = "roc_auc" # either sklearn metric or a custom metric
STACK = True # whether to perform stacking
BLEND = True # whether to perform blending
# Candidate models for the chosen task: linear, forest, boosting and LAMA groups
# (lama_nn_models is not included).
MODELS_LIST = linear_models[TASK] + forest_models[TASK] + boosting_models[TASK] + lama_models[TASK]

In [29]:
# Gather the AutoML settings in one mapping, then build the model from it.
automl_settings = {
    "task": TASK,
    "models_list": MODELS_LIST,
    "metric": METRIC,
    "n_jobs": N_JOBS,
    "tuning_timeout": TUNING_TIMEOUT,
    "stack": STACK,
    "blend": BLEND,
}
model = AutoML(**automl_settings)

In [32]:
# All save_* options are switched off for this run.
save_flags = {
    "save_test": False,
    "save_oof": False,
    "save_models": False,
    "save_params": False,
}
# Fit on the training split; the test split is passed alongside it. The call
# stays the last expression so the cell still displays its result.
model.fit(X_train, y_train, X_test, y_test, **save_flags, categorical_features=cat_features)

[2024-12-09 12:11:06,768] - [   MODEL    ] - 1 out of 10. LogisticRegression
[2024-12-09 12:11:06,771] - [   START    ] - Working with LogisticRegression
[2024-12-09 12:11:06,772] - [   START    ] - Tuning LogisticRegression
[2024-12-09 12:11:07,937] - [   PARAMS   ] - C=0.3593813663804626, metric=0.9205910173274916
[2024-12-09 12:11:07,938] - [BEST PARAMS ] - {'C': 0.3593813663804626, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': 5, 'random_state': 42, 'time_series': False}
[2024-12-09 12:11:07,938] - [    END     ] - Tuning LogisticRegression
[2024-12-09 12:11:07,938] - [   START    ] - Fitting LogisticRegression
[2024-12-09 12:11:07,941] - [    FIT     ] - LogisticRegression fold 0
[2024-12-09 12:11:07,970] - [    FIT     ] - LogisticRegression fold 1
[2024-12-09 12:11:07,991] - [    FIT     ] - LogisticRegression fold 2
[2024-12-09 12:11:08,012] - [    FIT     ] - LogisticRegression fold 3
[2024-12-09 12:11:08,030] - [    FIT     ] - LogisticRegression fold 4
[2024-12-09 

<src.automl.model.main.AutoML at 0xfffe9efa12b0>

In [33]:
# Predict on the test split. The displayed result is a two-column array,
# presumably per-class probabilities — confirm against AutoML.predict.
model.predict(X_test)

array([[0.98294761, 0.01705239],
       [0.71917787, 0.28082213],
       [0.98286915, 0.01713085],
       ...,
       [0.98294761, 0.01705239],
       [0.98294761, 0.01705239],
       [0.98294761, 0.01705239]])