In [82]:
import datetime
import logging
import sqlite3
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from scipy import stats

In [83]:
import os
from pathlib import Path
# Switch the working directory to the project checkout before anything else runs.
# NOTE(review): presumably needed so that relative paths such as the sqlite file
# used further down resolve correctly - confirm.
# The location can be overridden with the LITMUS_PROJECT_DIR environment variable;
# the original hard-coded path is kept as the default so existing runs still work.
project_dir = Path(os.environ.get("LITMUS_PROJECT_DIR", "/Users/mregan/Dev/litmus-04/freqtrade"))
os.chdir(project_dir)
print(Path.cwd())

/Users/mregan/Dev/litmus-04/freqtrade


In [84]:
# Filter values for the feature-importance history: rows are matched on their
# `model` and `pair` columns against these two strings.
model, pair = "&-primary_enter_long", "SOL/USDT:USDT"

In [267]:
def exclude_weak_features(model: str, pair: str, loss_ratio_threshold: float,
                          chance_excluded: float, min_num_trials: int):
    """Identify weakest features from prior model feature selection routines
    and exclude these from future training.

    Reads the rows of the last 10 days for ``model`` + ``pair`` from the
    ``feature_importance_history`` table in ``litmus.sqlite`` (resolved relative
    to the current working directory). Every row is one trial of a feature and
    counts as a win when its ``importance`` is greater than 0.

    A feature is selected for exclusion when all of the following hold:

    * its loss ratio ``1 - wins / trials`` is greater than ``loss_ratio_threshold``
    * it has strictly more than ``min_num_trials`` trials
    * a uniform random draw from [0, 1) is below ``chance_excluded``
      (so the result differs between calls unless ``chance_excluded`` is 0 or >= 1)

    :param model: value matched against the ``model`` column
    :param pair: value matched against the ``pair`` column
    :param loss_ratio_threshold: loss ratio a feature must exceed to be a candidate
    :param chance_excluded: probability that a candidate is actually excluded
    :param min_num_trials: number of trials a feature must exceed to be a candidate
    :return: DataFrame with columns ``feature_id``, ``wins``, ``trials``,
        ``loss_ratio`` and ``random``, one row per feature selected for exclusion.
        An empty DataFrame with the same columns is returned when the history
        cannot be read or contains no matching rows.
    """
    log = logging.getLogger(__name__)
    summary_columns = ["feature_id", "wins", "trials", "loss_ratio", "random"]

    # Read trial + win data from sqlite, restricted to the last 10 days.
    timestamp_in_past = time.time() - 10 * 24 * 60 * 60
    # Placeholders instead of string interpolation: values containing quotes can
    # neither break nor alter the statement.
    sql = """
        SELECT feature_id, importance
        FROM feature_importance_history
        WHERE model = ?
        AND pair = ?
        AND train_time > ?"""

    try:
        # A plain sqlite3 connection is accepted by pandas directly, so no
        # SQLAlchemy engine is needed for the local database file.
        connection = sqlite3.connect("litmus.sqlite")
        try:
            data = pd.read_sql_query(
                sql=sql, con=connection, params=(model, pair, timestamp_in_past))
        finally:
            connection.close()
    except Exception as e:
        # Best effort: without history nothing is excluded.
        log.warning(f"Issue reading from SQL to exclude features {e}")
        return pd.DataFrame(columns=summary_columns)

    if data.empty:
        return pd.DataFrame(columns=summary_columns)

    data["is_important"] = data["importance"] > 0

    # wins = trials with positive importance, trials = all rows of the feature
    per_feature = data.groupby("feature_id")["is_important"]
    summary_df = pd.concat(
        [per_feature.sum(), per_feature.size()], keys=["wins", "trials"], axis=1
    ).reset_index()
    summary_df["loss_ratio"] = 1 - summary_df["wins"] / summary_df["trials"]

    # One uniform draw per feature decides whether a weak feature is dropped this time
    summary_df["random"] = np.random.random(size=len(summary_df))

    is_excluded = (
        (summary_df["loss_ratio"] > loss_ratio_threshold)
        & (summary_df["random"] < chance_excluded)
        & (summary_df["trials"] > min_num_trials)
    )

    return summary_df.loc[is_excluded]

In [268]:
# One randomised draw of the features to exclude for the configured model + pair.
x = exclude_weak_features(
    model=model,
    pair=pair,
    loss_ratio_threshold=0.7,
    chance_excluded=0.9,
    min_num_trials=10,
)
x

Unnamed: 0,feature_id,wins,trials,loss_ratio,random
1,%-adx_100_BTC/USDTUSDT_2h,5,18,0.722222,0.537077
3,%-adx_100_SOL/USDTUSDT_15m,3,18,0.833333,0.512623
9,%-adx_10_BTC/USDTUSDT_15m,0,18,1.000000,0.051065
10,%-adx_10_BTC/USDTUSDT_2h,12,41,0.707317,0.181426
21,%-adx_20_BTC/USDTUSDT_15m,1,14,0.928571,0.526668
...,...,...,...,...,...
929,%-tcp_20_shift-1_BTC/USDTUSDT_15m,5,29,0.827586,0.111791
931,%-tcp_20_shift-1_BTC/USDTUSDT_3m,0,22,1.000000,0.046674
935,%-tcp_20_shift-2_BTC/USDTUSDT_15m,4,22,0.818182,0.689985
936,%-tcp_20_shift-2_BTC/USDTUSDT_2h,3,11,0.727273,0.574862


In [272]:
# Columns to strip from the exclusion summary. A plain list is all `drop` needs
# (no numpy required); errors="ignore" skips any listed column that is not
# present in `x` instead of raising.
cols = ["feature_id", "importance"]
x.drop(columns=cols, errors="ignore")

Unnamed: 0,wins,trials,loss_ratio,random
1,5,18,0.722222,0.537077
3,3,18,0.833333,0.512623
9,0,18,1.000000,0.051065
10,12,41,0.707317,0.181426
21,1,14,0.928571,0.526668
...,...,...,...,...
929,5,29,0.827586,0.111791
931,0,22,1.000000,0.046674
935,4,22,0.818182,0.689985
936,3,11,0.727273,0.574862


In [271]:
x

Unnamed: 0,feature_id,wins,trials,loss_ratio,random
1,%-adx_100_BTC/USDTUSDT_2h,5,18,0.722222,0.537077
3,%-adx_100_SOL/USDTUSDT_15m,3,18,0.833333,0.512623
9,%-adx_10_BTC/USDTUSDT_15m,0,18,1.000000,0.051065
10,%-adx_10_BTC/USDTUSDT_2h,12,41,0.707317,0.181426
21,%-adx_20_BTC/USDTUSDT_15m,1,14,0.928571,0.526668
...,...,...,...,...,...
929,%-tcp_20_shift-1_BTC/USDTUSDT_15m,5,29,0.827586,0.111791
931,%-tcp_20_shift-1_BTC/USDTUSDT_3m,0,22,1.000000,0.046674
935,%-tcp_20_shift-2_BTC/USDTUSDT_15m,4,22,0.818182,0.689985
936,%-tcp_20_shift-2_BTC/USDTUSDT_2h,3,11,0.727273,0.574862


In [248]:
# Read trial + win data from sqlite, restricted to the last 10 days.
timestamp_in_past = time.time() - 10 * 24 * 60 * 60
# Placeholders instead of string interpolation: values containing quotes can
# neither break nor alter the statement.
sql = """
    SELECT feature_id, importance
    FROM feature_importance_history
    WHERE model = ?
    AND pair = ?
    AND train_time > ?"""

try:
    # pandas accepts a plain sqlite3 connection, so no SQLAlchemy engine is needed
    connection = sqlite3.connect("litmus.sqlite")
    try:
        data = pd.read_sql_query(sql=sql, con=connection, params=(model, pair, timestamp_in_past))
    finally:
        connection.close()
except Exception as e:
    logging.getLogger(__name__).info(f"Issue reading from SQL to exclude features {e}")
    # Fall back to an empty history so the aggregation below still runs
    data = pd.DataFrame(columns=["feature_id", "importance"])

# A trial counts as a win when the feature had positive importance
data["is_important"] = data["importance"] > 0
sum_is_good = data.groupby("feature_id")["is_important"].sum()
count_is_good = data.groupby("feature_id")["is_important"].size()
# One row per feature: wins = positive-importance trials, trials = all trials
summary_df = pd.concat([sum_is_good, count_is_good], keys=["wins", "trials"], axis=1)


In [95]:
summary_df

Unnamed: 0_level_0,wins,trials
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1
%-adx_100_BTC/USDTUSDT_15m,57,87
%-adx_100_BTC/USDTUSDT_2h,5,18
%-adx_100_BTC/USDTUSDT_3m,36,94
%-adx_100_SOL/USDTUSDT_15m,3,18
%-adx_100_SOL/USDTUSDT_2h,99,108
...,...,...
%-tcp_50_shift-2_BTC/USDTUSDT_2h,8,23
%-tcp_50_shift-2_BTC/USDTUSDT_3m,8,16
%-tcp_50_shift-2_SOL/USDTUSDT_15m,6,16
%-tcp_50_shift-2_SOL/USDTUSDT_2h,45,83


In [241]:
np.random.random()

0.14179219277465338

In [63]:
from_unix_date

datetime.datetime(2022, 10, 26, 10, 51, 57, 667453)

In [283]:
# Count importance values in 10 equal-width bins. observed=False is spelled out
# so that empty bins stay in the result with a count of 0, independent of the
# pandas version's default for categorical groupers.
feat_history.groupby(pd.cut(feat_history["importance"], 10), observed=False).size()

importance
(-0.00605, -0.00359]        5
(-0.00359, -0.00115]      294
(-0.00115, 0.00128]     34518
(0.00128, 0.00372]        796
(0.00372, 0.00615]         54
(0.00615, 0.00859]          6
(0.00859, 0.011]            1
(0.011, 0.0135]             0
(0.0135, 0.0159]            0
(0.0159, 0.0183]            1
dtype: int64

In [34]:
# Scope to pair and model
# The comparison operands were missing; the configured `pair` and `model` are the
# values to filter on. `.copy()` detaches the result from the unfiltered frame so
# later column assignments do not raise SettingWithCopyWarning.
data = data[(data["pair"] == pair) & (data["model"] == model)].copy()
data

Unnamed: 0,model,train_time,pair,feature_id,importance,rank
0,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-er_50_SOL/USDTUSDT_3m,0.001301,1.0
1,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-tcp_10_shift-2_SOL/USDTUSDT_3m,0.000837,2.0
2,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-er_10_BTC/USDTUSDT_2h,0.000808,3.0
3,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-raw_volume_gen_shift-1_SOL/USDTUSDT_3m,0.000794,4.0
4,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-tcp_100_shift-1_SOL/USDTUSDT_15m,0.000744,5.0
...,...,...,...,...,...,...
125099,&-primary_enter_long,1.675404e+09,SOL/USDT:USDT,%-bb_width_10_shift-2_SOL/USDTUSDT_2h,-0.000537,196.0
125100,&-primary_enter_long,1.675404e+09,SOL/USDT:USDT,%-rsi_10_shift-2_BTC/USDTUSDT_15m,-0.000597,197.0
125101,&-primary_enter_long,1.675404e+09,SOL/USDT:USDT,%-rocr_100_BTC/USDTUSDT_3m,-0.000704,198.0
125102,&-primary_enter_long,1.675404e+09,SOL/USDT:USDT,%-macdhist_10_shift-2_SOL/USDTUSDT_2h,-0.001023,199.0


In [35]:
# Flag every trial in which the feature had positive importance. `assign` builds a
# new frame rather than writing into a filtered slice, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
data = data.assign(is_important=data["importance"] > 0)

In [72]:
# Group the per-trial flag once and derive both tallies from the same grouping:
# number of positive-importance trials and total number of trials per feature.
important_by_feature = data.groupby("feature_id")["is_important"]
sum_is_good = important_by_feature.sum()
count_is_good = important_by_feature.size()

# Share of trials in which each feature had positive importance
df = sum_is_good.div(count_is_good)
df

feature_id
%-adx_100_BTC/USDTUSDT_15m           0.655172
%-adx_100_BTC/USDTUSDT_2h            0.277778
%-adx_100_BTC/USDTUSDT_3m            0.382979
%-adx_100_SOL/USDTUSDT_15m           0.166667
%-adx_100_SOL/USDTUSDT_2h            0.916667
                                       ...   
%-tcp_50_shift-2_BTC/USDTUSDT_2h     0.347826
%-tcp_50_shift-2_BTC/USDTUSDT_3m     0.500000
%-tcp_50_shift-2_SOL/USDTUSDT_15m    0.375000
%-tcp_50_shift-2_SOL/USDTUSDT_2h     0.542169
%-tcp_50_shift-2_SOL/USDTUSDT_3m     0.666667
Name: is_important, Length: 913, dtype: float64

In [19]:
data

Unnamed: 0,model,train_time,pair,feature_id,importance,rank,is_important,ratio_is_good
0,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-er_50_SOL/USDTUSDT_3m,0.001301,1.0,True,
1,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-tcp_10_shift-2_SOL/USDTUSDT_3m,0.000837,2.0,True,
2,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-er_10_BTC/USDTUSDT_2h,0.000808,3.0,True,
3,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-raw_volume_gen_shift-1_SOL/USDTUSDT_3m,0.000794,4.0,True,
4,&-primary_enter_long,1.675171e+09,SOL/USDT:USDT,%-tcp_100_shift-1_SOL/USDTUSDT_15m,0.000744,5.0,True,
...,...,...,...,...,...,...,...,...
125699,&-primary_enter_short,1.675413e+09,MATIC/USDT:USDT,%-er_100_BTC/USDTUSDT_15m,-0.000280,196.0,False,
125700,&-primary_enter_short,1.675413e+09,MATIC/USDT:USDT,%-cti_20_MATIC/USDTUSDT_3m,-0.000312,197.0,False,
125701,&-primary_enter_short,1.675413e+09,MATIC/USDT:USDT,%-tcp_20_shift-2_BTC/USDTUSDT_15m,-0.000354,198.0,False,
125702,&-primary_enter_short,1.675413e+09,MATIC/USDT:USDT,%-tcp_20_BTC/USDTUSDT_2h,-0.000372,199.0,False,


In [273]:
from sklearn.metrics import precision_recall_curve

In [275]:
# Toy input for precision_recall_curve: six labels with one score each.
y_true = [1, 1, 1, 0, 0, 0]
y_score = [.7, .3, .4, .1, .8, .6]
p, r, t = precision_recall_curve(y_true, y_score)

In [278]:
p.shape, r.shape, t.shape

((7,), (7,), (6,))

In [279]:
t

array([0.1, 0.3, 0.4, 0.6, 0.7, 0.8])