In [1]:
import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback
import logging
import sys

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, log_loss, make_scorer, classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import warnings
from optuna.exceptions import ExperimentalWarning

warnings.filterwarnings("ignore", category=ExperimentalWarning)

In [2]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "Test11" 
storage_name = "sqlite:///LitmusOptunaTest.sqlite"

In [3]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), 
    direction="minimize",
    study_name=study_name, 
    storage=storage_name,
    load_if_exists=True
    )

[32m[I 2023-04-06 17:20:04,014][0m A new study created in RDB with name: Test11[0m


A new study created in RDB with name: Test11


In [4]:
X, y = make_classification(
    n_samples=5000, # 1000 observations 
    n_features=500, # 5 total features
    n_informative=100, # 3 'useful' features
    n_classes=2, # binary target/label 
    random_state=999 # if you want the same results as mine
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [5]:
y_train = np.where(y_train == 1, "yes", "no")
y_test = np.where(y_test == 1, "yes", "no")

In [6]:
# Default params
clf2 = cb.CatBoostClassifier(verbose=False)
clf2.fit(X_train, y_train)
y_pred2 = clf2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

          no       0.91      0.91      0.91       630
         yes       0.91      0.91      0.91       620

    accuracy                           0.91      1250
   macro avg       0.91      0.91      0.91      1250
weighted avg       0.91      0.91      0.91      1250



In [8]:
# CV with pruning approach

ho_objective = lambda trial: get_ho_objective(trial, X_train, y_train, X_test, y_test)

def get_ho_objective(trial: optuna.Trial, X_train, y_train, X_test, y_test) -> float:

    params = {
        "objective": "Logloss",
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
        "eval_metric": "Logloss"
    }
    
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**params)

    pruning_callback = CatBoostPruningCallback(trial, "Logloss")
    
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=0,
        early_stopping_rounds=10,
        callbacks=[pruning_callback]
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    pred_proba = gbm.predict_proba(X_test)

    return log_loss(y_test, pred_proba)

In [13]:
study.optimize(ho_objective, n_trials=20, timeout=600)

[32m[I 2023-04-06 17:27:44,209][0m Trial 40 pruned. Trial was pruned at iteration 5.[0m


Trial 40 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:27:46,143][0m Trial 41 finished with value: 0.3051886953261671 and parameters: {'learning_rate': 0.2583383324547179, 'l2_leaf_reg': 0.48903961314219996, 'colsample_bylevel': 0.09912435428891214, 'depth': 5, 'random_strength': 0.002432338763021007, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 41 with value: 0.3051886953261671.[0m


Trial 41 finished with value: 0.3051886953261671 and parameters: {'learning_rate': 0.2583383324547179, 'l2_leaf_reg': 0.48903961314219996, 'colsample_bylevel': 0.09912435428891214, 'depth': 5, 'random_strength': 0.002432338763021007, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 41 with value: 0.3051886953261671.


[32m[I 2023-04-06 17:27:47,867][0m Trial 42 finished with value: 0.3300489120187404 and parameters: {'learning_rate': 0.3359171233907206, 'l2_leaf_reg': 1.596292208363456, 'colsample_bylevel': 0.09989064303010434, 'depth': 5, 'random_strength': 0.0050648440598956524, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 48}. Best is trial 41 with value: 0.3051886953261671.[0m


Trial 42 finished with value: 0.3300489120187404 and parameters: {'learning_rate': 0.3359171233907206, 'l2_leaf_reg': 1.596292208363456, 'colsample_bylevel': 0.09989064303010434, 'depth': 5, 'random_strength': 0.0050648440598956524, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 48}. Best is trial 41 with value: 0.3051886953261671.


[32m[I 2023-04-06 17:27:49,718][0m Trial 43 finished with value: 0.3205735403668498 and parameters: {'learning_rate': 0.21026845231879437, 'l2_leaf_reg': 0.15903102871316951, 'colsample_bylevel': 0.0846734500441729, 'depth': 6, 'random_strength': 0.0006045201667099682, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 41 with value: 0.3051886953261671.[0m


Trial 43 finished with value: 0.3205735403668498 and parameters: {'learning_rate': 0.21026845231879437, 'l2_leaf_reg': 0.15903102871316951, 'colsample_bylevel': 0.0846734500441729, 'depth': 6, 'random_strength': 0.0006045201667099682, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 41 with value: 0.3051886953261671.


[32m[I 2023-04-06 17:27:50,169][0m Trial 44 pruned. Trial was pruned at iteration 5.[0m


Trial 44 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:27:50,855][0m Trial 45 pruned. Trial was pruned at iteration 56.[0m


Trial 45 pruned. Trial was pruned at iteration 56.


[32m[I 2023-04-06 17:27:51,287][0m Trial 46 pruned. Trial was pruned at iteration 5.[0m


Trial 46 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:27:51,988][0m Trial 47 pruned. Trial was pruned at iteration 38.[0m


Trial 47 pruned. Trial was pruned at iteration 38.


[32m[I 2023-04-06 17:27:52,414][0m Trial 48 pruned. Trial was pruned at iteration 5.[0m


Trial 48 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:27:52,942][0m Trial 49 pruned. Trial was pruned at iteration 30.[0m


Trial 49 pruned. Trial was pruned at iteration 30.


[32m[I 2023-04-06 17:27:53,395][0m Trial 50 pruned. Trial was pruned at iteration 5.[0m


Trial 50 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:27:54,714][0m Trial 51 pruned. Trial was pruned at iteration 124.[0m


Trial 51 pruned. Trial was pruned at iteration 124.


[32m[I 2023-04-06 17:27:56,763][0m Trial 52 finished with value: 0.29719934113512864 and parameters: {'learning_rate': 0.18455759521997997, 'l2_leaf_reg': 0.28571180257777, 'colsample_bylevel': 0.09648188035500745, 'depth': 5, 'random_strength': 0.0016371839885351422, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 52 with value: 0.29719934113512864.[0m


Trial 52 finished with value: 0.29719934113512864 and parameters: {'learning_rate': 0.18455759521997997, 'l2_leaf_reg': 0.28571180257777, 'colsample_bylevel': 0.09648188035500745, 'depth': 5, 'random_strength': 0.0016371839885351422, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 52 with value: 0.29719934113512864.


[32m[I 2023-04-06 17:27:59,031][0m Trial 53 finished with value: 0.298021022486473 and parameters: {'learning_rate': 0.18019723220436862, 'l2_leaf_reg': 0.25477304480371643, 'colsample_bylevel': 0.09955793943703921, 'depth': 6, 'random_strength': 0.0006924333653523341, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 52 with value: 0.29719934113512864.[0m


Trial 53 finished with value: 0.298021022486473 and parameters: {'learning_rate': 0.18019723220436862, 'l2_leaf_reg': 0.25477304480371643, 'colsample_bylevel': 0.09955793943703921, 'depth': 6, 'random_strength': 0.0006924333653523341, 'bootstrap_type': 'MVS', 'grow_policy': 'Lossguide', 'od_type': 'Iter', 'od_wait': 44}. Best is trial 52 with value: 0.29719934113512864.


[32m[I 2023-04-06 17:27:59,648][0m Trial 54 pruned. Trial was pruned at iteration 23.[0m


Trial 54 pruned. Trial was pruned at iteration 23.


[32m[I 2023-04-06 17:28:00,103][0m Trial 55 pruned. Trial was pruned at iteration 5.[0m


Trial 55 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:28:00,561][0m Trial 56 pruned. Trial was pruned at iteration 5.[0m


Trial 56 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:28:01,589][0m Trial 57 pruned. Trial was pruned at iteration 83.[0m


Trial 57 pruned. Trial was pruned at iteration 83.


[32m[I 2023-04-06 17:28:02,034][0m Trial 58 pruned. Trial was pruned at iteration 5.[0m


Trial 58 pruned. Trial was pruned at iteration 5.


[32m[I 2023-04-06 17:28:02,495][0m Trial 59 pruned. Trial was pruned at iteration 5.[0m


Trial 59 pruned. Trial was pruned at iteration 5.


In [55]:
study.trials_dataframe()["datetime_start"].max()

Timestamp('2023-04-06 17:28:02.036938')

In [51]:
a = np.random.random() > 0.9
b = True

if b & a:
    print(a, b)

True True


In [52]:
a = [1,2,3]
b = [5,6,7]

np.concatenate([a, b])

array([1, 2, 3, 5, 6, 7])

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from freqtrade.litmus import feature_selection_helper

In [7]:
feature_selection_helper.get_important_features(model="&-primary_enter_long", pair="SOL/USDT:USDT", pct_additional_features=0.2)

array(['%-adx_128_SOL/USDTUSDT_3m', '%-adx_24_BTC/USDTUSDT_30m',
       '%-adx_24_SOL/USDTUSDT_3m', '%-adx_64_SOL/USDTUSDT_30m',
       '%-adx_64_SOL/USDTUSDT_3m', '%-adx_8_BTC/USDTUSDT_30m',
       '%-adx_8_SOL/USDTUSDT_30m', '%-adx_8_SOL/USDTUSDT_3m',
       '%-adx_8_shift-2_BTC/USDTUSDT_30m',
       '%-adx_8_shift-2_BTC/USDTUSDT_3m',
       '%-adx_8_shift-2_SOL/USDTUSDT_30m',
       '%-adx_8_shift-2_SOL/USDTUSDT_3m',
       '%-adx_8_shift-4_BTC/USDTUSDT_30m',
       '%-adx_8_shift-4_SOL/USDTUSDT_30m', '%-apo_128_BTC/USDTUSDT_3m',
       '%-apo_24_BTC/USDTUSDT_30m', '%-apo_64_BTC/USDTUSDT_30m',
       '%-apo_64_shift-4_SOL/USDTUSDT_30m',
       '%-apo_64_shift-4_SOL/USDTUSDT_3m', '%-apo_8_BTC/USDTUSDT_30m',
       '%-apo_8_BTC/USDTUSDT_3m', '%-apo_8_shift-1_BTC/USDTUSDT_30m',
       '%-apo_8_shift-1_BTC/USDTUSDT_3m',
       '%-apo_8_shift-2_BTC/USDTUSDT_30m',
       '%-apo_8_shift-3_BTC/USDTUSDT_3m', '%-atr_128_SOL/USDTUSDT_30m',
       '%-atr_24_SOL/USDTUSDT_3m', '%-atr_64_BTC/USDTU

In [8]:
len(feature_selection_helper.get_important_features(model="&-primary_enter_long", pair="SOL/USDT:USDT", pct_additional_features=0.2))

588

In [8]:
import time
import pandas as pd

model="&-primary_enter_long"
pair="SOL/USDT:USDT"
pct_additional_features="0.2"

connection_string = "sqlite:///litmus.sqlite"
timestamp_in_past = time.time() - 10 * 24 * 60 * 60
sql = f"""
    SELECT feature_id, important_feature, train_time
    FROM feature_shuffle_selection
    WHERE model = '{model}'
    AND pair = '{pair}'
    AND train_time > '{timestamp_in_past}'"""

try:
    data = pd.read_sql_query(sql=sql, con=connection_string)
except Exception as e:
    logger.info(f"Issue reading from SQL to exclude features {e}")