In [1]:
import logging
import os

import pandas as pd

# Root of the local data store: the CRYPTO_BASE_DIR environment variable when
# it is set, otherwise ~/crypto_data.
_DEFAULT_BASE_DIR = os.path.join(os.path.expanduser('~'), 'crypto_data')
_BASE_DIR = os.getenv('CRYPTO_BASE_DIR', _DEFAULT_BASE_DIR)

# Directories holding the 1h candle parquet files for each market type.
SPOT_DIR = os.path.join(_BASE_DIR, 'binance_quantclass', 'candle_parquet', 'spot', '1h')
USDT_FUTURES_DIR = os.path.join(_BASE_DIR, 'binance_quantclass', 'candle_parquet', 'usdt_futures', '1h')

In [2]:
def read(type_, symbol):
    """Load the 1h candles of one symbol and keep only the OHLCV columns.

    Parameters
    ----------
    type_ : str
        Market type, either ``'spot'`` or ``'usdt_futures'``.
    symbol : str
        Symbol name; the file ``<symbol>.pqt`` is read from the directory that
        belongs to ``type_``.

    Returns
    -------
    pandas.DataFrame
        The columns ``candle_begin_time``, ``open``, ``high``, ``low``,
        ``close`` and ``volume`` of the stored frame (its index is kept).

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported market types.
    """
    if type_ == 'spot':
        input_dir = SPOT_DIR
    elif type_ == 'usdt_futures':
        input_dir = USDT_FUTURES_DIR
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f'{type_} is not supported')

    df = pd.read_parquet(os.path.join(input_dir, f'{symbol}.pqt'))
    return df[['candle_begin_time', 'open', 'high', 'low', 'close', 'volume']]


# Preview the VIDTUSDT spot candles starting at the 2022-10-31 02:00 UTC index label.
read('spot', 'VIDTUSDT').loc['2022-10-31 02:00:00+00:00':].head()

Unnamed: 0_level_0,candle_begin_time,open,high,low,close,volume
candle_end_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-10-31 02:00:00+00:00,2022-10-31 01:00:00+00:00,0.476,0.4788,0.441,0.4543,1134271.7
2022-10-31 03:00:00+00:00,2022-10-31 02:00:00+00:00,0.4535,0.4731,0.4452,0.4731,1441014.4
2022-11-09 09:00:00+00:00,2022-11-09 08:00:00+00:00,0.044,0.05333,0.02272,0.02511,175821916.0
2022-11-09 10:00:00+00:00,2022-11-09 09:00:00+00:00,0.02481,0.02985,0.0212,0.02355,65485693.0
2022-11-09 11:00:00+00:00,2022-11-09 10:00:00+00:00,0.02355,0.025,0.02258,0.02445,35227664.0


In [3]:
def check(df):
    """Report every place where consecutive candles are further apart than usual.

    The smallest difference between consecutive ``candle_begin_time`` values is
    used as the regular candle spacing; each row whose distance to the previous
    row is larger than that is reported as a gap. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Candles with at least the columns ``candle_begin_time``, ``open`` and
        ``close``. Rows are compared with their direct predecessor, so the
        frame is assumed to be in chronological order.

    Returns
    -------
    pandas.DataFrame
        One row per gap, indexed 0..n-1, with the columns

        * ``relist_time``  - ``candle_begin_time`` of the first candle after the gap
        * ``time_gap``     - distance to the previous candle's begin time
        * ``price_change`` - open after the gap relative to the close before it, minus 1

        The frame has no rows when no gap is found.
    """
    begin_time = df['candle_begin_time']
    time_gap = begin_time.diff()

    # The first row has no predecessor, so its diff is NaT; NaT compares as
    # False and is therefore never reported as a gap.
    is_gap = (time_gap > time_gap.min()).to_numpy()

    # Open of each candle relative to the close of the candle right before it.
    price_change = df['open'] / df['close'].shift(1) - 1

    gaps = pd.DataFrame({
        'relist_time': begin_time[is_gap],
        'time_gap': time_gap[is_gap],
        'price_change': price_change[is_gap],
    })
    return gaps.reset_index(drop=True)

def check_gaps(type_, symbol):
    """Run the gap check for one symbol and tag the result with its origin.

    Loads the candles through ``read(type_, symbol)``, passes them to
    ``check`` and returns that report with two extra columns, ``type`` and
    ``symbol``, holding the arguments this function was called with.
    """
    candles = read(type_, symbol)
    return check(candles).assign(type=type_, symbol=symbol)

# Show every gap detected in the VIDTUSDT spot candles as a markdown table.
print(check_gaps('spot', 'VIDTUSDT').to_markdown(index=False))

| relist_time               | time_gap        |   price_change | type   | symbol   |
|:--------------------------|:----------------|---------------:|:-------|:---------|
| 2021-09-29 09:00:00+00:00 | 0 days 03:00:00 |    0.000516929 | spot   | VIDTUSDT |
| 2022-11-09 08:00:00+00:00 | 9 days 06:00:00 |   -0.906996    | spot   | VIDTUSDT |
| 2023-03-24 14:00:00+00:00 | 0 days 03:00:00 |   -0.00166482  | spot   | VIDTUSDT |


In [4]:
# Compare the number of stored NBTUSDT rows with the number of hours spanned
# by the index: fewer rows than hours means candles are missing.
df = read('spot', 'NBTUSDT')
print(df.shape)
print((df.index[-1] - df.index[0]) // pd.Timedelta(hours=1))

# Most recent candles of the symbol.
display(df.tail())

# Full list of gaps reported for the symbol.
display(check_gaps('spot', 'NBTUSDT'))

(6796, 6)
9089


Unnamed: 0_level_0,candle_begin_time,open,high,low,close,volume
candle_end_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-03-24 20:00:00+00:00,2023-03-24 19:00:00+00:00,0.00321,0.00328,0.00318,0.00318,14927.0
2023-03-24 21:00:00+00:00,2023-03-24 20:00:00+00:00,0.00327,0.00333,0.00327,0.00328,109760.0
2023-03-24 22:00:00+00:00,2023-03-24 21:00:00+00:00,0.00328,0.00329,0.00312,0.00312,175741.0
2023-03-24 23:00:00+00:00,2023-03-24 22:00:00+00:00,0.00323,0.00325,0.00315,0.00325,40207.0
2023-03-25 00:00:00+00:00,2023-03-24 23:00:00+00:00,0.00325,0.00328,0.00317,0.00328,43650.0


Unnamed: 0,relist_time,time_gap,price_change,type,symbol
0,2022-03-21 04:00:00+00:00,0 days 02:00:00,0.001720,spot,NBTUSDT
1,2022-03-21 20:00:00+00:00,0 days 02:00:00,0.004386,spot,NBTUSDT
2,2022-03-23 19:00:00+00:00,0 days 02:00:00,0.002742,spot,NBTUSDT
3,2022-03-23 21:00:00+00:00,0 days 02:00:00,0.018764,spot,NBTUSDT
4,2022-03-24 19:00:00+00:00,0 days 02:00:00,0.000452,spot,NBTUSDT
...,...,...,...,...,...
1037,2023-03-15 22:00:00+00:00,0 days 02:00:00,0.015060,spot,NBTUSDT
1038,2023-03-17 22:00:00+00:00,0 days 02:00:00,0.015106,spot,NBTUSDT
1039,2023-03-19 22:00:00+00:00,0 days 02:00:00,0.015060,spot,NBTUSDT
1040,2023-03-24 01:00:00+00:00,0 days 02:00:00,0.000000,spot,NBTUSDT


In [5]:
# Symbols that filter_symbols() always drops, grouped by reason.
STABLECOINS = {
    'BKRWUSDT', 'USDCUSDT', 'USDPUSDT', 'TUSDUSDT', 'BUSDUSDT', 'FDUSDUSDT', 'DAIUSDT',
    'EURUSDT', 'GBPUSDT', 'USBPUSDT', 'SUSDUSDT', 'PAXGUSDT', 'AEURUSDT',
}

BLACKLIST = {'NBTUSDT'}


def filter_symbols(symbols):
    """Return the sorted, de-duplicated symbols that pass all exclusion rules.

    A symbol is dropped when it does not end in ``USDT``, when it is listed in
    ``STABLECOINS`` or ``BLACKLIST``, or when it carries one of the leveraged
    token suffixes (``JUPUSDT`` is explicitly exempt from the suffix rule).
    """
    leveraged_suffixes = ('UPUSDT', 'DOWNUSDT', 'BEARUSDT', 'BULLUSDT')

    def is_kept(name):
        # Quote currency must be USDT.
        if not name.endswith('USDT'):
            return False
        # Explicit exclusion lists.
        if name in STABLECOINS or name in BLACKLIST:
            return False
        # Leveraged tokens, except the one regular symbol matching the suffix.
        if name.endswith(leveraged_suffixes) and name != 'JUPUSDT':
            return False
        return True

    return sorted({name for name in symbols if is_kept(name)})

def get_filtered_symbols(type_):
    """List the symbols available on disk for a market type, after filtering.

    Parameters
    ----------
    type_ : str
        Market type, either ``'spot'`` or ``'usdt_futures'``.

    Returns
    -------
    list of str
        File names of the matching candle directory with their extension
        removed, passed through ``filter_symbols``.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported market types.
    """
    if type_ == 'spot':
        input_dir = SPOT_DIR
    elif type_ == 'usdt_futures':
        input_dir = USDT_FUTURES_DIR
    else:
        # Exceptions do not apply %-style formatting to their arguments, so
        # the message has to be built before it is handed to ValueError.
        raise ValueError(f'{type_} is not supported')

    symbols = sorted(os.path.splitext(x)[0] for x in os.listdir(input_dir))
    return filter_symbols(symbols)

# Peek at the first five spot symbols that remain after filtering.
get_filtered_symbols('spot')[:5]

['1000SATSUSDT', '1INCHUSDT', 'AAVEUSDT', 'ACAUSDT', 'ACEUSDT']

In [6]:
symbols = get_filtered_symbols('spot')

# Run the gap check on every spot symbol and keep only the non-empty reports.
dfs = list(filter(len, (check_gaps('spot', symbol) for symbol in symbols)))
df_gap = pd.concat(dfs, ignore_index=True)

# Gaps of two days or more are listed individually; shorter ones are kept
# aside in df_gap_short.
threshold = pd.Timedelta(days=2)

df_gap_short = df_gap.loc[df_gap['time_gap'] < threshold]
df_gap_long = df_gap.loc[df_gap['time_gap'] >= threshold].reset_index(drop=True)

print(df_gap_long.to_markdown(index=False))

| relist_time               | time_gap          |   price_change | type   | symbol    |
|:--------------------------|:------------------|---------------:|:-------|:----------|
| 2023-02-22 08:00:00+00:00 | 6 days 06:00:00   |     -0.99      | spot   | BNXUSDT   |
| 2021-03-19 07:00:00+00:00 | 4 days 01:00:00   |     -0.900002  | spot   | BTCSTUSDT |
| 2021-01-23 02:00:00+00:00 | 4 days 01:00:00   |    999         | spot   | COCOSUSDT |
| 2023-05-12 08:00:00+00:00 | 154 days 06:00:00 |     -0.0899796 | spot   | CVCUSDT   |
| 2021-04-02 04:00:00+00:00 | 4 days 01:00:00   |     99.0076    | spot   | DREPUSDT  |
| 2023-09-22 08:00:00+00:00 | 311 days 04:00:00 |     -0.31909   | spot   | FTTUSDT   |
| 2023-03-10 08:00:00+00:00 | 28 days 06:00:00  |      0.840514  | spot   | KEYUSDT   |
| 2022-05-31 06:00:00+00:00 | 18 days 06:00:00  |  19999         | spot   | LUNAUSDT  |
| 2023-07-21 08:00:00+00:00 | 4 days 06:00:00   |     -0.999     | spot   | QUICKUSDT |
| 2024-03-28 08:00:00+00:00 | 8 

In [7]:
# Summary statistics of the short gaps, including the 1%, 10%, 90% and 99% percentiles.
print(df_gap_short.describe([.01, .1, .9, .99]).to_markdown())

|       | time_gap                  |   price_change |
|:------|:--------------------------|---------------:|
| count | 7754                      | 7754           |
| mean  | 0 days 02:55:12.612844983 |   -2.27507e-05 |
| std   | 0 days 01:51:53.365951751 |    0.00527826  |
| min   | 0 days 02:00:00           |   -0.0414446   |
| 1%    | 0 days 02:00:00           |   -0.0161004   |
| 10%   | 0 days 02:00:00           |   -0.00461637  |
| 50%   | 0 days 02:00:00           |    0           |
| 90%   | 0 days 05:00:00           |    0.00452761  |
| 99%   | 0 days 11:00:00           |    0.0159473   |
| max   | 1 days 10:00:00           |    0.0586466   |


In [8]:
symbols = get_filtered_symbols('usdt_futures')

# Run the gap check on every USDT futures symbol and keep only the non-empty reports.
dfs = list(filter(len, (check_gaps('usdt_futures', symbol) for symbol in symbols)))
df_gap = pd.concat(dfs, ignore_index=True)

# List all futures gaps in chronological order.
df_gap_by_time = df_gap.sort_values('relist_time')
print(df_gap_by_time.to_markdown(index=False))

| relist_time               | time_gap          |   price_change | type         | symbol   |
|:--------------------------|:------------------|---------------:|:-------------|:---------|
| 2019-09-08 19:00:00+00:00 | 0 days 02:00:00   |     0.034477   | usdt_futures | BTCUSDT  |
| 2019-09-09 02:00:00+00:00 | 0 days 03:00:00   |    -0.00721831 | usdt_futures | BTCUSDT  |
| 2019-11-27 10:00:00+00:00 | 0 days 02:00:00   |     0.0691729  | usdt_futures | ETHUSDT  |
| 2021-03-02 02:00:00+00:00 | 0 days 02:00:00   |    -0.00228447 | usdt_futures | BLZUSDT  |
| 2021-03-02 02:00:00+00:00 | 0 days 02:00:00   |     0.00188201 | usdt_futures | CTKUSDT  |
| 2021-03-02 02:00:00+00:00 | 0 days 02:00:00   |    -0.00205245 | usdt_futures | DODOUSDT |
| 2021-03-02 02:00:00+00:00 | 0 days 02:00:00   |     0          | usdt_futures | LITUSDT  |
| 2022-09-27 02:00:00+00:00 | 108 days 17:00:00 |     0.0263975  | usdt_futures | ICPUSDT  |
| 2023-02-22 14:00:00+00:00 | 11 days 15:00:00  |    -0.987751   | usd

In [9]:
# Load the stored LUNA PctChange factor table and show its first ten rows from
# 2022-05-26 onwards.
# The project root can be overridden with the BWB_BACKTEST_DIR environment
# variable; the default is the original absolute path, so existing setups
# behave exactly as before.
_BWB_BACKTEST_DIR = os.getenv('BWB_BACKTEST_DIR', '/Users/lostleaf/dev/bwb_backtest')
# NOTE(review): the file has a .pkl suffix but is read as feather - confirm the on-disk format.
df_luna = pd.read_feather(
    os.path.join(_BWB_BACKTEST_DIR, 'data', '数据整理', 'spot', 'LUNA', 'factors', 'LUNA_PctChange_D.pkl'))
tmp = df_luna[df_luna['candle_begin_time'] >= '2022-05-26'].head(10)
print(tmp.to_markdown(index=False))

| candle_begin_time   |   PctChange_3 |   PctChange_7 |   PctChange_10 |   PctChange_14 |
|:--------------------|--------------:|--------------:|---------------:|---------------:|
| 2022-05-26 00:00:00 |       0       |             0 |              0 |      -0.999954 |
| 2022-05-27 00:00:00 |       0       |             0 |              0 |      -0.84375  |
| 2022-05-28 00:00:00 |       0       |             0 |              0 |       0        |
| 2022-05-29 00:00:00 |       0       |             0 |              0 |       0        |
| 2022-05-30 00:00:00 |       0       |             0 |              0 |       0        |
| 2022-05-31 00:00:00 |       0       |             0 |              0 |       0        |
| 2022-06-01 00:00:00 |  177399       |        177399 |         177399 |  177399        |
| 2022-06-02 00:00:00 |  130589       |        130589 |         130589 |  130589        |
| 2022-06-03 00:00:00 |  142049       |        142049 |         142049 |  142049        |
| 2022-06-