In [1]:
import os
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns

from datetime import datetime
from tqdm.notebook import tqdm
from config import column_names,schedule

In [2]:
def strided_app(a, L, S):
    """
    Build a 2-D array whose rows are sliding windows over a 1-D array.

    Parameters:
    -----------
    a : np.ndarray
        Input array. Only ``a.strides[0]`` is used, so it is treated as 1-D.
    L : int
        Window length (number of elements per row).
    S : int
        Step between the starts of consecutive windows.

    Returns:
    --------
    np.ndarray
        Array of shape ``(nrows, L)`` with ``nrows = (a.size - L) // S + 1``.
        When ``a`` is too short to hold a single window an empty ``(0, L)``
        array of the same dtype is returned.

    Notes:
    ------
    For non-empty results the return value is a strided *view* on ``a``:
    rows overlap in memory, so writing into the result also changes ``a``
    and the neighbouring windows. Copy it before mutating.
    """
    nrows = ((a.size - L) // S) + 1
    if nrows <= 0:
        # A negative row count would make as_strided raise; be consistent with
        # the nrows == 0 case and hand back an empty result instead.
        return np.empty((0, L), dtype=a.dtype)
    n = a.strides[0]  # byte distance between two consecutive elements
    return np.lib.stride_tricks.as_strided(a, shape=(nrows, L), strides=(S * n, n))

def to_ms(dt):
    """
    Convert a datetime to whole milliseconds since the Unix epoch.

    The conversion goes through ``dt.timestamp()``, so timezone handling is
    whatever that method does for ``dt``. Any sub-millisecond remainder is
    truncated toward zero.

    Parameters:
    -----------
    dt : datetime
        Point in time to convert.

    Returns:
    --------
    int
        Milliseconds since 1970-01-01T00:00:00Z.
    """
    # Multiplying the float timestamp by 1000 and truncating can land one
    # millisecond low (e.g. 1.001 s -> 1000.9999999999999 -> 1000). datetime
    # has microsecond resolution, so first snap to the nearest whole
    # microsecond, then do the division in exact integer arithmetic.
    micros = round(dt.timestamp() * 1_000_000)
    millis = abs(micros) // 1000
    return millis if micros >= 0 else -millis

def get_prices(start_date,end_date,db_path = "binance_klines.db"):
    """
    Load kline rows for a fixed set of symbols from a SQLite database.

    Parameters:
    -----------
    start_date, end_date : datetime
        Inclusive bounds; both are converted with ``to_ms`` and compared
        against the ``open_time`` column of the ``klines`` table.
    db_path : str
        Path of the SQLite file (default: "binance_klines.db").

    Returns:
    --------
    pd.DataFrame
        Columns ``symbol, open, high, low, close, volume``, ordered by symbol
        and then time, with a datetime index named ``open_time``.

    Notes:
    ------
    The query selects ``close_time``, but the value is labelled ``open_time``
    in the frame. The label is kept as-is because later cells pivot on an
    index called ``open_time``; the timestamps themselves are close times.
    """
    start_time = to_ms(start_date)
    end_time = to_ms(end_date)

    query = """
    SELECT symbol,close_time, open, high, low, close, volume
    FROM klines
    WHERE open_time BETWEEN ? AND ?
        AND symbol IN ('BTCUSDT', 'ETHUSDT','XRPUSDT','BNBUSDT','ADAUSDT','SOLUSDT','MATICUSDT','AVAXUSDT','GMTUSDT','SANDUSDT','AAVEUSDT','DOTUSDT','CRVUSDT','LINKUSDT','NEARUSDT')
    ORDER BY symbol, close_time
    """

    # Open exactly one connection and always release it, even if the query
    # fails (the previous version opened two and only closed the second).
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(query, (start_time, end_time)).fetchall()
    finally:
        conn.close()

    df = pd.DataFrame(rows,columns = ['symbol','open_time', 'open', 'high', 'low', 'close', 'volume'])
    # Timestamps are stored as epoch milliseconds.
    df.index = pd.to_datetime(df['open_time'],unit = 'ms')
    df = df.drop(columns = ['open_time'])
    return df

def get_hourly_vol(close: pd.Series, span: int = 100) -> pd.Series:
    """
    Exponentially weighted volatility of simple period-over-period returns.

    The horizon of the estimate is whatever the sampling interval of
    ``close`` is; the function itself does no resampling.

    Parameters:
    -----------
    close : pd.Series
        Price series.
    span : int
        Span passed to ``Series.ewm`` (default: 100)

    Returns:
    --------
    pd.Series
        EWM standard deviation of the returns. The first price has no return
        and is therefore absent from the result.
    """
    # Simple returns, with the undefined leading observation removed, fed
    # straight into an exponentially weighted standard deviation.
    return close.pct_change().dropna().ewm(span=span).std()

def remove_overlapping_events(df):
    """
    Remove overlapping non-zero events, keeping the first occurrence.

    - Rows whose ``value`` is 0 are always kept and never block other rows.
    - A non-zero row is kept only if it does not overlap an earlier kept
      non-zero row. Intervals that merely touch (one ends exactly when the
      next starts) do not count as overlapping.

    Parameters:
    -----------
    df : pd.DataFrame
        Needs ``start_time``, ``end_time`` (anything ``pd.to_datetime``
        accepts) and ``value`` columns. The input is not modified.

    Returns:
    --------
    pd.DataFrame
        Surviving rows sorted by ``start_time`` with a fresh 0..n-1 index.
    """
    events = df.copy()
    events['start_time'] = pd.to_datetime(events['start_time'])
    events['end_time'] = pd.to_datetime(events['end_time'])

    # mergesort is stable: rows sharing a start_time keep their input order,
    # which makes "keep the first occurrence" deterministic.
    events = events.sort_values('start_time', kind='mergesort').reset_index(drop=True)

    keep_positions = []
    # (start, end) of kept non-zero events that have not finished yet.
    active = []

    rows = zip(events['start_time'], events['end_time'], events['value'])
    for pos, (start, end, value) in enumerate(rows):
        # Forget events that ended at or before the current start.
        active = [(s, e) for s, e in active if e > start]

        if value == 0:
            keep_positions.append(pos)
            continue

        # Only non-zero events are ever stored in `active`, so no value
        # check is needed here.
        if any(start < e and end > s for s, e in active):
            continue

        keep_positions.append(pos)
        active.append((start, end))

    return events.iloc[keep_positions].reset_index(drop=True)

# Alternative simpler version - only considers same value overlaps
def remove_overlapping_events_simple(df):
    """
    Simpler version: remove overlapping events of the same non-zero value.

    Rows are processed in ``start_time`` order. Zero-valued rows are always
    kept. A non-zero row is kept when no row with the same value has been
    kept yet, or when it starts at or after the end of the last kept row
    with that value. Rows with different values never block each other.

    Parameters:
    -----------
    df : pd.DataFrame
        Needs ``start_time``, ``end_time`` (anything ``pd.to_datetime``
        accepts) and ``value`` columns. The input is not modified.

    Returns:
    --------
    pd.DataFrame
        Surviving rows sorted by ``start_time`` with a fresh 0..n-1 index.
    """
    events = df.copy()
    events['start_time'] = pd.to_datetime(events['start_time'])
    events['end_time'] = pd.to_datetime(events['end_time'])

    # Stable sort so rows with identical start_time keep their input order.
    events = events.sort_values('start_time', kind='mergesort').reset_index(drop=True)

    keep_positions = []
    last_end_by_value = {}  # value -> end_time of the last kept row with it

    rows = zip(events['start_time'], events['end_time'], events['value'])
    for pos, (start, end, value) in enumerate(rows):
        if value == 0:
            keep_positions.append(pos)
            continue

        if value not in last_end_by_value or start >= last_end_by_value[value]:
            keep_positions.append(pos)
            last_end_by_value[value] = end

    return events.iloc[keep_positions].reset_index(drop=True)

def remove_overlapping_events_with_0(df):
    """
    Remove overlapping events, keeping the first occurrence.

    Unlike the variant that always keeps zeros, every row (value 0, 1 or -1)
    competes here: a row is kept only if it does not overlap any earlier
    kept row. Intervals that merely touch (one ends exactly when the next
    starts) do not count as overlapping.

    Parameters:
    -----------
    df : pd.DataFrame
        Needs ``start_time``, ``end_time`` (anything ``pd.to_datetime``
        accepts) and ``value`` columns. The input is not modified.

    Returns:
    --------
    pd.DataFrame
        Surviving rows sorted by ``start_time`` with a fresh 0..n-1 index.
    """
    events = df.copy()
    events['start_time'] = pd.to_datetime(events['start_time'])
    events['end_time'] = pd.to_datetime(events['end_time'])

    # Stable sort so rows with identical start_time keep their input order,
    # which makes "keep the first occurrence" deterministic.
    events = events.sort_values('start_time', kind='mergesort').reset_index(drop=True)

    keep_positions = []
    # (start, end) of kept events that have not finished yet.
    active = []

    for pos, (start, end) in enumerate(zip(events['start_time'], events['end_time'])):
        # Forget events that ended at or before the current start.
        active = [(s, e) for s, e in active if e > start]

        if any(start < e and end > s for s, e in active):
            continue

        keep_positions.append(pos)
        active.append((start, end))

    return events.iloc[keep_positions].reset_index(drop=True)

def window_labeling(df, th=3):
    """
    Label each row by the first threshold breach found in its columns.

    Every cell is first mapped to 1 (value >= th), -1 (value <= -th) or
    0 (|value| < th). Each row is then scanned left to right, skipping the
    first column, and the first 1 or -1 encountered becomes the row label;
    rows without any breach are labelled 0.

    Parameters:
    -----------
    df : pd.DataFrame
        Numeric frame (``.abs()`` and comparisons are applied to every
        column). The input is not modified.
    th : float
        Positive breach threshold (default: 3, the value that used to be
        hard-coded).

    Returns:
    --------
    pd.Series
        One label in {1, -1, 0} per row, indexed like ``df``.
    """
    marks = df.copy()
    marks[marks.abs() < th] = 0
    marks[marks >= th] = 1
    marks[marks <= -th] = -1

    labels = []
    # The first column is deliberately left out of the scan, as before.
    # NOTE(review): the old comment called it a "timestamp" column, but a
    # datetime column would not survive the .abs() above - confirm intent.
    for row in marks.iloc[:, 1:].to_numpy():
        breaches = row[(row == 1) | (row == -1)]
        labels.append(int(breaches[0]) if breaches.size else 0)

    fwd_events = pd.Series(labels, index=marks.index)
    return fwd_events

def labeler(df, th, timeout_min=59):
    """
    Turn a frame of per-offset scores into one labelled event per row.

    Every cell is first mapped to 1 (value >= th), -1 (value <= -th) or
    0 (|value| < th). For a row that contains a breach, the event takes the
    first non-zero cell: its value becomes the label and its column
    position ``t`` gives ``end_time = start_time + t minutes``. A row with
    no breach yields a 0-valued event ending ``timeout_min`` minutes after
    its start.

    Parameters:
    -----------
    df : pd.DataFrame
        Numeric frame whose index holds the event start times (they must
        support ``+ pd.Timedelta``). Column k is treated as "k minutes
        after start". The input is not modified.
    th : float
        Positive breach threshold.
    timeout_min : int
        Length in minutes of a no-breach event (default: 59, the value that
        used to be hard-coded).

    Returns:
    --------
    pd.DataFrame
        Columns ``start_time``, ``end_time``, ``value``; one row per input
        row, in input order. Empty input gives an empty frame that still
        has these three columns.
    """
    columns = ['start_time', 'end_time', 'value']
    if len(df.index) == 0:
        # Keep the schema so downstream overlap filters can still look up
        # 'start_time' / 'end_time' / 'value'.
        return pd.DataFrame(columns=columns)

    marks = df.copy()
    marks[marks.abs() < th] = 0
    marks[marks >= th] = 1
    marks[marks <= -th] = -1

    records = []
    # Iterate positionally so duplicate index labels cannot turn a row
    # lookup into a multi-row frame.
    for start_time, row in zip(marks.index, marks.to_numpy().tolist()):
        if 1 in row or -1 in row:
            offset, value = next((k, r) for k, r in enumerate(row) if r != 0)
        else:
            offset, value = timeout_min, 0
        records.append(dict(
            start_time=start_time,
            end_time=start_time + pd.Timedelta(minutes=offset),
            value=value,
        ))

    labels = pd.DataFrame(records, columns=columns)
    return labels

In [3]:
start_date = datetime(2021, 1, 1)
end_date = datetime(2021, 2, 28, 23, 59, 59)

# Long-format klines plus two derived per-bar fields.
df = get_prices(start_date, end_date)
df['mid'] = df[['high', 'low']].mean(axis=1)                 # bar mid = (high + low) / 2
df['dollar_volume'] = df[['mid', 'volume']].prod(axis=1)     # mid * volume

# Price panels: one column per symbol on a 1-minute grid. Gaps are
# forward-filled; symbols that still contain NaN afterwards are dropped.
px_close = (
    df.pivot_table(values='close', columns='symbol', index='open_time')
    .resample('1 min').last()
    .ffill()
    .dropna(axis=1)
)
px_mid = (
    df.pivot_table(values='mid', columns='symbol', index='open_time')
    .resample('1 min').last()
    .ffill()
    .dropna(axis=1)
)

# Volume panels: a missing minute means no recorded volume, so fill with 0
# and align rows/columns to the symbols that survived in px_close.
volume = (
    df.pivot_table(values='volume', columns='symbol', index='open_time')
    .resample('1 min').last()
    .fillna(0.)
    .loc[px_close.index, px_close.columns]
)
dollar_volume = (
    df.pivot_table(values='dollar_volume', columns='symbol', index='open_time')
    .resample('1 min').last()
    .fillna(0.)
    .loc[px_close.index, px_close.columns]
)

In [10]:
Px = px_mid['BTCUSDT']

# Every row is a 60-minute forward window of prices starting at the row's
# timestamp; column "kmin" is the price k minutes after the start.
time_strided = strided_app(Px.index.values, 60, 1)
Px_strided = pd.DataFrame(
    strided_app(Px.values, 60, 1),
    columns=[f"{i}{'min'}" for i in np.arange(60)],
    index=time_strided[:, 0],
)
# Cumulative return of each minute of the window relative to its first price.
strided_cumret = Px_strided.divide(Px_strided['0min'], axis=0) - 1

# Allocate as float: without an explicit dtype the frame is object-typed,
# which pushes the z-score arithmetic and the labelling below onto slow
# element-by-element object operations.
mu_sigma = pd.DataFrame(index=strided_cumret.index, columns=['mu', 'sigma'], dtype=float)

# For each of the 60 start phases take every 60th window, so consecutive
# windows within a phase do not overlap, and estimate the EWM mean / std
# of the full-window (59min) return.
for i in range(60):
    phase_ewm = strided_cumret['59min'].iloc[i::60].ewm(span=100, adjust=False)
    sigma = phase_ewm.std()
    mu = phase_ewm.mean()
    mu_sigma.loc[sigma.index, 'sigma'] = sigma
    mu_sigma.loc[mu.index, 'mu'] = mu

strided_cumret = strided_cumret.dropna()

# Row-wise z-score of the whole return path against that row's mu / sigma.
z_strided_cumret = strided_cumret.subtract(mu_sigma['mu'], axis=0).divide(mu_sigma['sigma'], axis=0).dropna()

# 2.5-sigma breaches become +1 / -1 events, everything else a 0 event.
labels = labeler(z_strided_cumret, 2.5)
labels_no_0_overlap = remove_overlapping_events_with_0(labels)
labels = remove_overlapping_events(labels)

In [15]:
# Display only the events labelled -1.
labels.loc[labels['value'].eq(-1)]

Unnamed: 0,start_time,end_time,value
25,2021-01-01 06:26:00,2021-01-01 07:10:00,-1
60,2021-01-01 08:01:00,2021-01-01 08:16:00,-1
262,2021-01-01 12:32:00,2021-01-01 12:52:00,-1
334,2021-01-01 14:03:00,2021-01-01 14:53:00,-1
364,2021-01-01 14:53:00,2021-01-01 15:40:00,-1
...,...,...,...
72325,2021-02-23 13:49:00,2021-02-23 14:48:00,-1
73594,2021-02-24 12:36:00,2021-02-24 13:32:00,-1
75679,2021-02-25 23:24:00,2021-02-26 00:03:00,-1
76119,2021-02-26 06:47:00,2021-02-26 07:45:00,-1
