In [165]:
import time
import pandas as pd
import polars as pl
import sys, os
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from pathlib import Path
from pybit.unified_trading import HTTP
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
sys.path.insert(0, os.path.join(os.getcwd(), ".."))
import fin_utilities 
import matplotlib.pyplot as plt
cfg = fin_utilities.__cfg_reading("pred")


#td = TDClient(apikey=cfg['TWELVEDATA']['API'])  

PROJECT_DIR = eval(cfg['PROJECT_PATH'])
DATA_PATH   = PROJECT_DIR / cfg['DATA_FOLDER']

BYBIT_API_KEY = "5u0HfwB5UPJeiQo3WR"
BYBIT_SECRET_KEY = "hjFn5aEvyuVEZ1dnna6R4s1NS1vw3vZdJFIL"

In [169]:
def get_last_timestamp(df: pl.DataFrame):
    return df.get_column("timestamp")[-1]


def format_data(response):    
    data = response.get('list', None)
    if not data:
        return

    data = pl.DataFrame(data, schema={
        'timestamp': pl.Int64,
        'open': pl.Float64,
        'high': pl.Float64,
        'low': pl.Float64,
        'close': pl.Float64,
        'volume': pl.Float64,
        'turnover': pl.Float64,
    }, orient="row")
    
    # Reverse the DataFrame
    data = data.reverse()
    
    # Convert timestamp to datetime
    data = data.with_columns((pl.col('timestamp') * 1000).cast(pl.Datetime).alias('date'))
    
    return data


def get_data_from(
    category: str,
    symbol: str,
    start: int,
    end: int = None,
    interval: int = 60,
):
    df = pl.DataFrame()
    while True:
        response = session.get_kline(category=category, 
                                    symbol=symbol, 
                                    start=start,
                                    interval=interval).get('result')
        
        latest = format_data(response)
        if latest is None or latest.shape[0] == 0:
            break
        
        
        if end is not None and latest.select(pl.col('timestamp').cast(pl.Int64)).to_series().max() > end:
            df = df.filter(pl.col('timestamp').cast(pl.Int64) <= int(end))
            break
        
        start = get_last_timestamp(latest)
        time.sleep(0.1)
        
        df = pl.concat([df, latest])
        print(f'Collecting data starting {dt.datetime.fromtimestamp(start / 1000)}')
        if len(latest) == 1:
            break

    df = df.unique(subset=['timestamp'], keep='last')

    return df.sort("date")


def return_index_if_exists(df_series, curr_idx, val, pos_crit, max_length):
    # Slicing the series from curr_idx + 1 to curr_idx + 1 + max_length
    future_series = df_series.slice(curr_idx + 1, max_length)
    # display(future_series)
    if pos_crit:
        indices = future_series.filter(future_series >= val).arg_min()
    else:
        indices = future_series.filter(future_series <= val).arg_min()

    # Return the relative index if condition met within max_length, else return max_length
    if indices is not None and indices < max_length:
        return indices + 1  # +1 because of slicing offset
    else:
        return max_length

def labelize_output_according_criterion2(
    df, 
    trade="long",
    threshold=0.01, 
    risk_reward_ratio=0.5, 
    max_trade_length=6,
    wrt='close', 
    hl=['high', 'low'],
):

    if trade == "long":
        df_ = df.with_columns([
            (pl.col(wrt) * (1 + threshold / risk_reward_ratio)).alias('TP'),
            (pl.col(wrt) * (1 - threshold)).alias('SL'),

        ])
        pos_crit = True
        neg_crit = False
    elif trade == "short":
        df_ = df.with_columns([
            (pl.col(wrt) * (1 - threshold / risk_reward_ratio)).alias('TP'),
            (pl.col(wrt) * (1 + threshold)).alias('SL'),

        ])
        pos_crit = False
        neg_crit = True

    min_above = []
    min_below = []


    for idx in range(len(df_)):
        # display(df_.slice(idx,1))
        TP = df_["TP"][idx]
        SL = df_["SL"][idx]
        
        if idx != len(df_) - 1:
            candidates_above_minima = [
                return_index_if_exists(
                    df_series=df_[v], 
                    curr_idx=idx,
                    val=TP, 
                    pos_crit=pos_crit, 
                    max_length=max_trade_length
                ) 
                for v in hl
            ]
            candidates_below_minima = [
                return_index_if_exists(
                    df_series=df_[v], 
                    curr_idx=idx,
                    val=SL, 
                    pos_crit=neg_crit, 
                    max_length=max_trade_length
                ) 
                for v in hl
            ]
            
            min_above.append(min(candidates_above_minima))
            min_below.append(min(candidates_below_minima))

        else:
            min_above.append(None)
            min_below.append(None)

    df_ = df_.with_columns(pl.Series('min_above', min_above))
    df_ = df_.with_columns(pl.Series('min_below', min_below))
    
    # Generate the signal
    signal = ((df_['min_above'] < df_['min_below']) & (df_['min_above'] <= max_trade_length)).cast(pl.Int8) * (1 if trade=="long" else -1)
    return signal

def split_target_features(
    df: pl.DataFrame,
    feat_to_exclude: list() = [],
    target_var: str = 'signal'
) -> [pl.DataFrame, pl.Series]:
    """
    Given a DataFrame, split df into features and target.

    Args:
        df: dataframe to be split
        feat_to_exclude: list of columns to exclude from features
        target_var: variable target

    Returns:
        Tuple[pl.DataFrame, pl.Series]: feature and target data
    """
    # Select columns that are not in feat_to_exclude and not the target_var
    features = df.select([col for col in df.columns if col not in (feat_to_exclude + [target_var])])
    labels = df.select([target_var])

    return features, labels

In [170]:
session = HTTP(
    testnet=False,
    api_key=BYBIT_API_KEY,
    api_secret=BYBIT_SECRET_KEY,
)

In [171]:
# COPPIE DISPONIBILI
result = session.get_tickers(
    category="linear"
).get('result')['list']

tickers = [asset['symbol'] for asset in result if asset['symbol'].endswith('USDT')]
print(tickers)

['10000000AIDOGEUSDT', '1000000BABYDOGEUSDT', '1000000MOGUSDT', '1000000PEIPEIUSDT', '10000COQUSDT', '10000LADYSUSDT', '10000NFTUSDT', '10000SATSUSDT', '10000WENUSDT', '1000APUUSDT', '1000BEERUSDT', '1000BONKUSDT', '1000BTTUSDT', '1000CATUSDT', '1000FLOKIUSDT', '1000LUNCUSDT', '1000NEIROCTOUSDT', '1000PEPEUSDT', '1000RATSUSDT', '1000TURBOUSDT', '1000XECUSDT', '1CATUSDT', '1INCHUSDT', 'A8USDT', 'AAVEUSDT', 'ACEUSDT', 'ACHUSDT', 'ADAUSDT', 'AERGOUSDT', 'AEROUSDT', 'AEVOUSDT', 'AGIUSDT', 'AGLDUSDT', 'AIOZUSDT', 'AIUSDT', 'AKROUSDT', 'AKTUSDT', 'ALGOUSDT', 'ALICEUSDT', 'ALPACAUSDT', 'ALPHAUSDT', 'ALTUSDT', 'AMBUSDT', 'ANKRUSDT', 'APEUSDT', 'API3USDT', 'APTUSDT', 'ARBUSDT', 'ARKMUSDT', 'ARKUSDT', 'ARPAUSDT', 'ARUSDT', 'ASTRUSDT', 'ATAUSDT', 'ATHUSDT', 'ATOMUSDT', 'AUCTIONUSDT', 'AUDIOUSDT', 'AVAILUSDT', 'AVAXUSDT', 'AXLUSDT', 'AXSUSDT', 'BADGERUSDT', 'BAKEUSDT', 'BALUSDT', 'BANANAUSDT', 'BANDUSDT', 'BATUSDT', 'BBUSDT', 'BCHUSDT', 'BEAMUSDT', 'BELUSDT', 'BENDOGUSDT', 'BICOUSDT', 'BIGTIMEUSDT

In [181]:
start = int(dt.datetime(2024, 5, 1).timestamp()* 1000)
interval = 60
symbols_list = ["ETHUSDT", "BTCUSDT"]

df_orig = pl.DataFrame()
for symbol in symbols_list:
    print(f"Collecting data for {symbol}")
    tmp_df = get_data_from(
        category='linear',
        symbol=symbol,
        start=start,
        end=None,
        interval=interval,
    )
    tmp_df = tmp_df.with_columns(pl.lit(symbol.replace("USDT","")).alias("symbol"))
    tmp_df = tmp_df.with_columns(labelize_output_according_criterion2(tmp_df, "long").alias("long_signal"))
    tmp_df = tmp_df.with_columns(labelize_output_according_criterion2(tmp_df, "short").alias("short_signal"))

    df_orig = df_orig.vstack(tmp_df)

df_orig.head()

Collecting data for ETHUSDT
Collecting data starting 2024-05-09 07:00:00
Collecting data starting 2024-05-17 14:00:00
Collecting data starting 2024-05-25 21:00:00
Collecting data starting 2024-06-03 04:00:00
Collecting data starting 2024-06-11 11:00:00
Collecting data starting 2024-06-19 18:00:00
Collecting data starting 2024-06-28 01:00:00
Collecting data starting 2024-07-06 08:00:00
Collecting data starting 2024-07-14 15:00:00
Collecting data starting 2024-07-22 22:00:00
Collecting data starting 2024-07-31 05:00:00
Collecting data starting 2024-08-08 12:00:00
Collecting data starting 2024-08-16 19:00:00
Collecting data starting 2024-08-25 02:00:00
Collecting data starting 2024-09-02 09:00:00
Collecting data starting 2024-09-10 16:00:00
Collecting data starting 2024-09-16 15:00:00
Collecting data starting 2024-09-16 15:00:00
Collecting data for BTCUSDT
Collecting data starting 2024-05-09 07:00:00
Collecting data starting 2024-05-17 14:00:00
Collecting data starting 2024-05-25 21:00:00

timestamp,open,high,low,close,volume,turnover,date,symbol,long_signal,short_signal
i64,f64,f64,f64,f64,f64,f64,datetime[μs],str,i8,i8
1714514400000,2986.41,3029.22,2985.01,3008.7,42773.24,128710000.0,2024-04-30 22:00:00,"""ETH""",0,0
1714518000000,3008.7,3028.36,2998.42,3012.8,22603.54,68122000.0,2024-04-30 23:00:00,"""ETH""",0,0
1714521600000,3012.8,3022.1,2985.63,2998.25,28729.55,86223000.0,2024-05-01 00:00:00,"""ETH""",0,0
1714525200000,2998.25,3013.9,2975.46,3000.94,31220.7,93578000.0,2024-05-01 01:00:00,"""ETH""",0,-1
1714528800000,3000.94,3002.79,2955.95,2985.8,40039.63,119170000.0,2024-05-01 02:00:00,"""ETH""",0,-1


In [182]:
df = df_orig

In [183]:
df.slice(278,10)

timestamp,open,high,low,close,volume,turnover,date,symbol,long_signal,short_signal
i64,f64,f64,f64,f64,f64,f64,datetime[μs],str,i8,i8
1715515200000,2927.09,2933.55,2924.27,2928.8,8634.64,25285000.0,2024-05-12 12:00:00,"""ETH""",0,0
1715518800000,2928.8,2936.9,2926.06,2934.91,6086.23,17843000.0,2024-05-12 13:00:00,"""ETH""",0,0
1715522400000,2934.91,2938.5,2925.57,2928.42,7879.38,23095000.0,2024-05-12 14:00:00,"""ETH""",0,0
1715526000000,2928.42,2941.99,2925.2,2936.7,10804.22,31698000.0,2024-05-12 15:00:00,"""ETH""",0,0
1715529600000,2936.7,2955.11,2927.6,2927.61,26844.65,79033000.0,2024-05-12 16:00:00,"""ETH""",0,0
1715533200000,2927.61,2941.77,2919.24,2939.71,14923.21,43748000.0,2024-05-12 17:00:00,"""ETH""",0,0
1715536800000,2939.71,2940.55,2925.75,2927.12,6812.72,19988000.0,2024-05-12 18:00:00,"""ETH""",0,0
1715540400000,2927.12,2929.4,2919.65,2928.0,5051.21,14773000.0,2024-05-12 19:00:00,"""ETH""",0,0
1715544000000,2928.0,2929.03,2921.39,2922.15,3947.71,11548000.0,2024-05-12 20:00:00,"""ETH""",0,0
1715547600000,2922.15,2933.11,2911.38,2927.71,12734.86,37191000.0,2024-05-12 21:00:00,"""ETH""",0,-1


# RIMUOVO TIMESTAMP E ROBE SIMILI

In [153]:
# cols_list_to_drop = ["timestamp","turnover"]
# df = df.select(pl.all().exclude(cols_list_to_drop))
# df

## SCALO FEATURE

In [184]:
def scale_features(df, cols_list):
    if isinstance(cols_list, list):
        for col in cols_list:
            df = df.with_columns(df[col].pct_change().alias(f"{col}_pct_change"))
            df = df.with_columns(((df[col] - df[col].mean()) / df[col].std()).alias(f"{col}_z_score"))
    elif isinstance(cols_list, str):
        col = cols_list
        df = df.with_columns(df[col].pct_change().alias(f"{col}_pct_change"))
        df = df.with_columns(((df[col] - df[col].mean()) / df[col].std()).alias(f"{col}_z_score"))
    else:
        raise ValueError("cols_list must be a list or a string")
    
    return df

In [185]:
df_btc = df.filter(pl.col("symbol")=="BTC")
df_btc = scale_features(df_btc, ["close"])

In [186]:
# Create a list to store the processed dataframes
processed_dfs = []

for symbol in df["symbol"].unique():
    # Filter the dataframe for the current symbol
    tmp_df = df.filter(pl.col("symbol") == symbol)
    
    # Apply scale_features to the filtered dataframe
    tmp_df = scale_features(tmp_df, ["close"])
    
    # Append the processed dataframe to the list
    processed_dfs.append(tmp_df)

# Concatenate all processed dataframes
df = pl.concat(processed_dfs)

# Drop any null values that might have been introduced
df = df.drop_nulls()

# Display the first few rows of the updated dataframe
df.head()

timestamp,open,high,low,close,volume,turnover,date,symbol,long_signal,short_signal,close_pct_change,close_z_score
i64,f64,f64,f64,f64,f64,f64,datetime[μs],str,i8,i8,f64,f64
1714518000000,60562.8,60915.6,60376.0,60655.9,2720.286,164920000.0,2024-04-30 23:00:00,"""BTC""",0,0,0.001537,-0.476067
1714521600000,60655.9,60820.0,60031.8,60213.5,5591.462,337550000.0,2024-05-01 00:00:00,"""BTC""",0,0,-0.007294,-0.576289
1714525200000,60213.5,60380.0,59800.1,60112.5,5227.323,314110000.0,2024-05-01 01:00:00,"""BTC""",0,-1,-0.001677,-0.599169
1714528800000,60112.5,60159.4,59550.0,59910.0,6856.557,409740000.0,2024-05-01 02:00:00,"""BTC""",0,-1,-0.003369,-0.645044
1714532400000,59910.0,60299.0,59883.0,60222.9,2414.526,145080000.0,2024-05-01 03:00:00,"""BTC""",0,-1,0.005223,-0.574159


In [155]:
for symbol in df["symbol"].unique():
    tmp_df = scale_features(df.filter(pl.col("symbol")==symbol), ["close"])
    df_orig = df_orig.vstack(tmp_df)
    df = df.drop_nulls()
    df.head()

timestamp,open,high,low,close,volume,turnover,date,long_signal,short_signal,close_pct_change,close_z_score
i64,f64,f64,f64,f64,f64,f64,datetime[μs],i8,i8,f64,f64
1672531200000,1196.8,1197.1,1193.15,1194.5,19133.21,22864000.0,2023-01-01 00:00:00,0,0,-0.001922,-1.555648
1672534800000,1194.5,1197.05,1194.3,1196.65,9560.9,11432000.0,2023-01-01 01:00:00,0,0,0.0018,-1.552668
1672538400000,1196.65,1197.45,1194.75,1196.35,5603.39,6701600.0,2023-01-01 02:00:00,0,0,-0.000251,-1.553084
1672542000000,1196.35,1196.4,1191.9,1194.75,17862.83,21325000.0,2023-01-01 03:00:00,0,0,-0.001337,-1.555302
1672545600000,1194.75,1194.75,1191.35,1193.95,13255.81,15817000.0,2023-01-01 04:00:00,0,0,-0.00067,-1.556411


In [190]:
df_btc.drop_nulls().equals(df.filter(pl.col("symbol")=="BTC"))

True

# Creo Feature

In [158]:
def extract_date_features(df, date_col):
    df = df.with_columns(
        pl.col(date_col).dt.weekday().alias("dow"),
        pl.col(date_col).dt.month().alias("month"),
        pl.col(date_col).dt.hour().alias("hour"),
    )

    return df

moving_averages_list = [13, 50, 100, 200]
on_cols = ["close_z_score","close"]

for window in moving_averages_list:
    for col in on_cols:
        df = df.with_columns(
            (pl.col(col).rolling_mean(window_size=window)).alias(f"SMA_{col}_{window}"),
        )

df = df.drop_nulls()
df = extract_date_features(df, "date")
df.head(5)


timestamp,open,high,low,close,volume,turnover,date,long_signal,short_signal,close_pct_change,close_z_score,SMA_close_z_score_13,SMA_close_13,SMA_close_z_score_50,SMA_close_50,SMA_close_z_score_100,SMA_close_100,SMA_close_z_score_200,SMA_close_200,dow,month,hour
i64,f64,f64,f64,f64,f64,f64,datetime[μs],i8,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,i8,i8
1673247600000,1307.55,1312.05,1306.5,1310.75,18593.63,24357000.0,2023-01-09 07:00:00,0,0,0.002447,-1.394507,-1.420652,1291.888462,-1.450146,1270.611,-1.461238,1262.609,-1.491338,1240.895,1,1,7
1673251200000,1310.75,1319.55,1306.95,1308.95,56770.26,74504000.0,2023-01-09 08:00:00,0,0,-0.001373,-1.397002,-1.415886,1295.326923,-1.448904,1271.507,-1.460458,1263.172,-1.490544,1241.46725,1,1,8
1673254800000,1308.95,1325.85,1308.25,1316.35,82895.43,109190000.0,2023-01-09 09:00:00,0,0,0.005653,-1.386745,-1.410864,1298.95,-1.447495,1272.524,-1.459574,1263.81,-1.489715,1242.06575,1,1,9
1673258400000,1316.35,1327.4,1315.45,1321.95,55810.78,73778000.0,2023-01-09 10:00:00,0,0,0.004254,-1.378982,-1.405143,1303.076923,-1.445885,1273.685,-1.458579,1264.5275,-1.488844,1242.69375,1,1,10
1673262000000,1321.95,1323.5,1316.15,1317.5,33606.75,44365000.0,2023-01-09 11:00:00,1,0,-0.003366,-1.385151,-1.399913,1306.85,-1.444409,1274.75,-1.457648,1265.1995,-1.487993,1243.3075,1,1,11


In [161]:
#FEATURES = ["volume","close_pct_change","close_z_score","SMA_close_z_score_13","SMA_close_z_score_50","SMA_close_z_score_100","SMA_close_z_score_200","dow","month","hour"]
FEATURES = ["volume","close_pct_change","close","SMA_close_13","SMA_close_50","SMA_close_100","SMA_close_200","dow","month","hour"]

In [162]:
tscv = TimeSeriesSplit(n_splits=15, test_size=100)
p = {  
    'eval_metric':'auc',
    'tree_method':'hist',
    # 'lambda':5,
    #'gamma':2,
    'max_depth':6,
    # 'scale_pos_weight':2,
    'objective':'binary:logistic',
    'subsample': .8,
    # 'colsample_bytree': .9,
    'min_child_weight':1,
    'eta':0.02,
    'n_estimators':2000
}


def polars_time_series_cross_val(df: pl.DataFrame, target_var: str, n_splits: int = 5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    model = xgb.XGBClassifier(**p)  # Set optimal hyperparameters here

    # Cross-validation to find the best hyperparameters
    for train_index, test_index in tscv.split(df):
        train_data = df.slice(train_index[0], len(train_index))
        test_data = df.slice(test_index[0], len(test_index))
        
        # Splitting features and labels
        train_features, train_labels = split_target_features(train_data, target_var=target_var)
        val_features, val_labels = split_target_features(test_data, target_var=target_var)

        # Fit model (you can include hyperparameter tuning here)
        model.fit(train_features.to_pandas(), train_labels.to_pandas(), 
                  eval_set=[(train_features.to_pandas(), train_labels.to_pandas()), 
                            (val_features.to_pandas(), val_labels.to_pandas())], 
                  verbose=500)
    
    # This loop is only for cross-validation; the model will be re-trained below
    return model

# Final training using the entire dataset
def train_final_model(df: pl.DataFrame, target_var: str):
    model = xgb.XGBClassifier(**p)  # Set optimal hyperparameters here
    features, labels = split_target_features(df, target_var=target_var)
    # Train the final model on the entire dataset
    model.fit(features.to_pandas(), labels.to_pandas(), verbose=1)
    return model

# Usage
# Run cross-validation to tune hyperparameters
cv_model = polars_time_series_cross_val(df.select(FEATURES + ["long_signal"]), target_var='long_signal', n_splits=5)

print("Final Train")
# Train the final model using the entire dataset
final_model = train_final_model(df.select(FEATURES + ["long_signal"]), target_var='long_signal')

[0]	validation_0-auc:0.82764	validation_1-auc:0.57461
[500]	validation_0-auc:0.99899	validation_1-auc:0.53051
[1000]	validation_0-auc:1.00000	validation_1-auc:0.53026
[1500]	validation_0-auc:1.00000	validation_1-auc:0.52946
[1999]	validation_0-auc:1.00000	validation_1-auc:0.52588
[0]	validation_0-auc:0.80938	validation_1-auc:0.52675
[500]	validation_0-auc:0.99723	validation_1-auc:0.55216
[1000]	validation_0-auc:0.99991	validation_1-auc:0.54289
[1500]	validation_0-auc:1.00000	validation_1-auc:0.54359
[1999]	validation_0-auc:1.00000	validation_1-auc:0.54265
[0]	validation_0-auc:0.78668	validation_1-auc:0.46075
[500]	validation_0-auc:0.99637	validation_1-auc:0.47761
[1000]	validation_0-auc:0.99972	validation_1-auc:0.47118
[1500]	validation_0-auc:0.99999	validation_1-auc:0.46845
[1999]	validation_0-auc:1.00000	validation_1-auc:0.46475
[0]	validation_0-auc:0.76582	validation_1-auc:0.56646
[500]	validation_0-auc:0.98949	validation_1-auc:0.58269
[1000]	validation_0-auc:0.99828	validation_1-au

In [133]:
model = xgb.XGBClassifier(**p)  # Set optimal hyperparameters here
features, labels = split_target_features(df.select(FEATURES + ["long_signal"]), target_var='long_signal')
# Train the final model on the entire dataset
model.fit(features.to_pandas(), labels.to_pandas(), verbose=True)
model.get_num_boosting_rounds()

2000

In [163]:
y_hat = model.predict_proba(features.to_pandas())[:,1]

In [164]:
from sklearn.metrics import roc_auc_score
roc_auc_score(labels.to_pandas(),y_hat)

np.float64(1.0)