In [1]:
import os
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import multiprocessing
import datetime
import logging
import numpy as np
import pandas as pd


# Configure logging so that INFO-level messages are displayed
logging.basicConfig(level=logging.INFO)

In [2]:
# Directory holding the raw trade files, and the prefix used for result files.
raw_dataset_path = '../datasets/BTCUSDT-Trades/'
output_base_path = '../output'

# Stamp the output prefix with the current local time (YYYYMMDD-HHMMSS).
timestamp = f'{datetime.datetime.now():%Y%m%d-%H%M%S}'
output_path = output_base_path + '_v' + timestamp

In [3]:
# 1. Configuração do Cluster Dask com ajustes para otimização de memória
def setup_dask_cluster(n_workers=3, threads_per_worker=4, memory_limit='16GB'):
    """Start a local Dask cluster and return a client connected to it.

    Parameters
    ----------
    n_workers : int
        Forwarded to ``LocalCluster(n_workers=...)``.
    threads_per_worker : int
        Forwarded to ``LocalCluster(threads_per_worker=...)``.
    memory_limit : str
        Forwarded to ``LocalCluster(memory_limit=...)``.

    Returns
    -------
    Client
        Client bound to the newly created ``LocalCluster``.
    """
    # The core count is only reported; it is not used to size the cluster.
    print(f"Número de núcleos disponíveis: {multiprocessing.cpu_count()}")

    cluster_options = {
        'n_workers': n_workers,
        'threads_per_worker': threads_per_worker,
        'memory_limit': memory_limit,
    }
    client = Client(LocalCluster(**cluster_options))
    print(client)
    return client

# 2. Leitura dos Arquivos Parquet com otimização de memória
def read_parquet_files(raw_dataset_path, file):
    """Lazily load one trade file as a Dask DataFrame with float32 numeric columns.

    Parameters
    ----------
    raw_dataset_path : str
        Directory that contains the Parquet file.
    file : str
        File name (or pattern) joined onto ``raw_dataset_path``.

    Returns
    -------
    dask.dataframe.DataFrame
        Frame restricted to the columns ``price``, ``qty``, ``quoteQty`` and
        ``time``; the first three are cast to ``float32``.
    """
    wanted_columns = ['price', 'qty', 'quoteQty', 'time']
    df_dask = dd.read_parquet(
        os.path.join(raw_dataset_path, file),
        columns=wanted_columns,
        engine='pyarrow'
    )

    # Downcast the numeric columns to reduce memory use; 'time' is left as read.
    for column in ('price', 'qty', 'quoteQty'):
        df_dask[column] = df_dask[column].astype('float32')
    return df_dask

# 3. Aplicação das Operações Matemáticas
def apply_operations(df_dask):
    """Add the tick-rule ``side`` and the signed ``dollar_imbalance`` columns.

    Parameters
    ----------
    df_dask : dask.dataframe.DataFrame or pandas.DataFrame
        Trades in chronological order with at least the columns ``price`` and
        ``quoteQty``.

    Returns
    -------
    Same kind of frame as the input
        A new frame with two extra columns: ``side`` (see ``assign_side``) and
        ``dollar_imbalance = quoteQty * side``. The input is not modified.
    """
    if hasattr(df_dask, 'map_partitions'):
        # Dask collection: classify every partition independently.
        with_side = df_dask.map_partitions(assign_side)
    else:
        with_side = assign_side(df_dask)

    # 'quoteQty' is the dollar value of the trade; the sign comes from 'side'.
    return with_side.assign(
        dollar_imbalance=with_side['quoteQty'] * with_side['side']
    )


def assign_side(df):
    """Classify each trade as +1 (uptick) or -1 (downtick) using the tick rule.

    A trade priced above the previous one gets ``+1``, a trade priced below
    it gets ``-1``, and a trade at an unchanged price inherits the previous
    trade's side. The first row has no predecessor and is seeded with ``+1``.

    The seed is applied by position, so the frame may carry any index. When
    used through ``map_partitions`` every partition is seeded independently;
    the side is not carried over from the previous partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Trades in chronological order with a numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an ``int8`` column ``side``. ``df`` itself is left
        untouched.
    """
    # Sign of the price change; unchanged prices (0) and the first row (NaN)
    # become NaN so that they can be forward-filled.
    tick = np.sign(df['price'].diff()).replace(0, np.nan)
    if len(tick) > 0:
        tick.iloc[0] = 1
    side = tick.ffill().astype('int8')

    # Assign by position so that a non-unique index cannot break alignment.
    return df.assign(side=side.to_numpy())




In [4]:
def create_imbalance_dollar_bars(df_dask, init_T, init_dif, alpha, res):
    """Build dollar-imbalance bars from one in-memory chunk of trades.

    Trades are consumed in row order. Each trade updates the open bar's OHLC,
    volumes and signed imbalance; the bar is closed as soon as the absolute
    accumulated imbalance reaches ``threshold = exp_T * exp_dif``. After every
    closed bar the two expectations are updated and the threshold recomputed.

    Parameters
    ----------
    df_dask : pandas.DataFrame
        Despite the name this is an in-memory frame (one computed partition).
        It must provide the columns ``price``, ``time``, ``qty``, ``side`` and
        ``dollar_imbalance``.
    init_T : float
        Starting expectation of the dollar volume per bar.
    init_dif : float
        Starting expectation of ``|2 * buy_volume / total_volume - 1|``.
        The exact value ``1`` acts as a "no estimate yet" marker: while
        ``exp_dif == 1`` a closed bar replaces both expectations instead of
        blending into them.
    alpha : float
        Weight of the newest bar in the exponentially weighted updates.
    res : list
        State of the unfinished bar left by the previous chunk, in the order
        ``[open, high, low, close, start_time, end_time, imbalance,
        buy_volume_usd, total_volume_usd, total_volume]``; an empty list
        starts a fresh bar.

    Returns
    -------
    bars : list of dict
        One record per closed bar.
    exp_T, exp_dif : float
        Expectations after the last closed bar.
    res : list
        State of the bar still open at the end of the chunk, or ``[]`` when no
        trade has been added to it.
    """
    price_col = 'price'
    time_col = 'time'
    imbalance_col = 'dollar_imbalance'
    volume_col = 'qty'
    side_col = 'side'

    def _fresh_bar_state():
        # open, high, low, close, start_time, end_time,
        # net imbalance, buy dollar volume, total dollar volume, total quantity
        return [None, -float('inf'), float('inf'), None, None, None, 0, 0, 0, 0]

    exp_T = init_T
    exp_dif = init_dif
    threshold = exp_T * exp_dif

    # Resume the bar left open by the previous chunk, if there is one.
    state = list(res[:10]) if len(res) > 0 else _fresh_bar_state()
    (bar_open, bar_high, bar_low, bar_close, bar_start_time, bar_end_time,
     current_imbalance, buy_volume_usd, total_volume_usd, total_volume) = state

    bars = []

    # itertuples keeps every column's own dtype (iterrows would coerce a whole
    # row to one dtype, e.g. integer timestamps to float) and is much faster.
    trades = df_dask[[price_col, time_col, imbalance_col, volume_col, side_col]]
    for trade_price, trade_time, trade_imbalance, trade_qty, trade_side in \
            trades.itertuples(index=False, name=None):

        if bar_open is None:
            bar_open = trade_price
            bar_start_time = trade_time

        # Update OHLC
        bar_high = max(bar_high, trade_price)
        bar_low = min(bar_low, trade_price)
        bar_close = trade_price

        if trade_side > 0:
            buy_volume_usd += trade_imbalance

        total_volume += trade_qty
        total_volume_usd += abs(trade_imbalance)
        current_imbalance += trade_imbalance

        # Close the bar once the absolute net imbalance reaches the threshold.
        if abs(current_imbalance) >= threshold:
            bar_end_time = trade_time

            bars.append({
                'start_time': bar_start_time,
                'end_time': bar_end_time,
                'open': bar_open,
                'high': bar_high,
                'low': bar_low,
                'close': bar_close,
                'imbalance_col': current_imbalance,
                'total_volume_buy_usd': buy_volume_usd,
                'total_volume_usd': total_volume_usd,
                'total_volume': total_volume
            })

            # A bar without dollar volume has no defined buy/total ratio;
            # leave the expectations untouched instead of dividing by zero.
            if total_volume_usd > 0:
                bar_dif = abs(2 * buy_volume_usd / total_volume_usd - 1)
                if exp_dif == 1:
                    # No estimate yet: adopt this bar's values outright.
                    exp_T = total_volume_usd
                    exp_dif = bar_dif
                else:
                    # Exponentially weighted updates
                    exp_T += alpha * (total_volume_usd - exp_T)
                    exp_dif += alpha * (bar_dif - exp_dif)
            threshold = exp_T * exp_dif

            # Reset the accumulators for the next bar
            (bar_open, bar_high, bar_low, bar_close, bar_start_time,
             bar_end_time, current_imbalance, buy_volume_usd,
             total_volume_usd, total_volume) = _fresh_bar_state()

    # Carry the open bar whenever it already holds a trade. Testing the net
    # imbalance instead would drop a bar whose buys and sells cancel exactly.
    if bar_open is None:
        res = []
    else:
        res = [bar_open, bar_high, bar_low, bar_close, bar_start_time,
               bar_end_time, current_imbalance, buy_volume_usd, total_volume_usd, total_volume]

    return bars, exp_T, exp_dif, res

In [5]:
def batch_create_imbalance_dollar_bars(df_dask, init_T, init_dif, res_init, alpha):
    """Build imbalance bars over every partition of a Dask frame, in order.

    Partitions are computed and processed one at a time; the expectations and
    the unfinished bar returned for one partition are fed into the next one,
    so bars can span partition boundaries.

    Parameters
    ----------
    df_dask : dask.dataframe.DataFrame
        Trades with the columns required by ``create_imbalance_dollar_bars``.
    init_T, init_dif : float
        Expectations used for the first partition.
    res_init : list
        Unfinished-bar state to resume from (``[]`` to start fresh).
    alpha : float
        Weight of the newest bar in the exponentially weighted updates.

    Returns
    -------
    results : pandas.DataFrame
        All closed bars, one row each, with a fresh 0..n-1 index (empty frame
        when no bar closed).
    init_T, init_dif : float
        Expectations after the last partition.
    res_init : list
        Unfinished-bar state after the last partition.
    """
    # Collect plain records and build the frame once at the end; concatenating
    # onto a growing frame inside the loop copies all earlier rows every time.
    all_bars = []

    # Partitions must be visited sequentially because state is carried over.
    total = df_dask.npartitions
    for i in range(total):
        print(f'partition {i} of {total-1}')
        df_part = df_dask.get_partition(i).compute()

        # Process this partition locally and hand its final state to the next.
        bars, init_T, init_dif, res_init = create_imbalance_dollar_bars(
            df_part, init_T=init_T, init_dif=init_dif, alpha=alpha, res=res_init
        )
        all_bars.extend(bars)

    results = pd.DataFrame(all_bars)
    return results, init_T, init_dif, res_init

In [None]:
# List every entry of the dataset directory
files = os.listdir(raw_dataset_path)

# Number of regular files in the directory (kept for reference only)
file_count = sum(1 for f in files if os.path.isfile(os.path.join(raw_dataset_path, f)))

# Select the part files by name and order them by part number, starting at
# part 1. Deriving the range from the raw file count would depend on how many
# unrelated files happen to live in the directory.
part_prefix = 'BTCUSDT-Dataset-part-'
part_suffix = '.parquet'
part_files = []
for entry in files:
    if not (entry.startswith(part_prefix) and entry.endswith(part_suffix)):
        continue
    part_id = entry[len(part_prefix):-len(part_suffix)]
    if part_id.isdigit() and int(part_id) >= 1:
        part_files.append((int(part_id), entry))
part_files.sort()
last_number = part_files[-1][0] if part_files else 0

# Run parameters. They are left untouched by the loop below so that later
# cells still see the configured values.
init_T = 10_000
init_dif = 1
alpha = 0.1

# State carried from one file to the next.
exp_T = init_T
exp_dif = init_dif
res = []

bar_frames = []
try:
    for number, file in part_files:
        print(f"Dask n{number} of {last_number}")

        df_dask = read_parquet_files(raw_dataset_path, file)

        df_dask = df_dask.map_partitions(assign_side)

        df_dask['dollar_imbalance'] = df_dask['quoteQty'] * df_dask['side']
        bars, exp_T, exp_dif, res = batch_create_imbalance_dollar_bars(df_dask, exp_T, exp_dif, res, alpha)

        if not bars.empty:
            bar_frames.append(bars)
finally:
    # Build `results` even if the run is interrupted, so the bars produced so
    # far stay available. Any unfinished bar left in `res` is not included.
    results = pd.concat(bar_frames, ignore_index=True) if bar_frames else pd.DataFrame()

Dask n1 of 37
partition 0 of 14
partition 1 of 14
partition 2 of 14
partition 3 of 14
partition 4 of 14
partition 5 of 14
partition 6 of 14
partition 7 of 14
partition 8 of 14
partition 9 of 14
partition 10 of 14
partition 11 of 14
partition 12 of 14
partition 13 of 14
partition 14 of 14
Dask n2 of 37
partition 0 of 14
partition 1 of 14
partition 2 of 14
partition 3 of 14
partition 4 of 14
partition 5 of 14
partition 6 of 14
partition 7 of 14
partition 8 of 14
partition 9 of 14
partition 10 of 14
partition 11 of 14
partition 12 of 14
partition 13 of 14
partition 14 of 14
Dask n3 of 37
partition 0 of 14
partition 1 of 14
partition 2 of 14
partition 3 of 14
partition 4 of 14
partition 5 of 14
partition 6 of 14
partition 7 of 14
partition 8 of 14
partition 9 of 14
partition 10 of 14
partition 11 of 14
partition 12 of 14
partition 13 of 14
partition 14 of 14
Dask n4 of 37
partition 0 of 13
partition 1 of 13
partition 2 of 13
partition 3 of 13
partition 4 of 13
partition 5 of 13
partition 6

In [7]:
# Build the Excel file name from the run prefix and the run parameters.
# The name is rebuilt from `output_base_path` and `timestamp` rather than from
# `output_path` itself, so re-running this cell does not append the suffix again.
# NOTE(review): `init_T` is read at export time; if an earlier cell reassigns it
# while building bars, the file name carries that later value.
output_path = f'{output_base_path}_v{timestamp}-{alpha}-{init_T}.xlsx'
results.to_excel(output_path)

In [None]:
import pandas as pd
import plotly.graph_objects as go


# Work on a copy of the bars with 'end_time' parsed as datetimes.
df = results.assign(end_time=pd.to_datetime(results['end_time']))

# Candlestick chart: one candle per bar, placed at the bar's end time.
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        x=df['end_time'],
        open=df['open'],
        high=df['high'],
        low=df['low'],
        close=df['close'],
    )
)

# Title and axis labels; the range slider is hidden.
fig.update_layout(
    title='Gráfico de Candlestick',
    xaxis_title='Data',
    yaxis_title='Preço',
    xaxis_rangeslider_visible=False,
)

# Display the chart
fig.show()

In [None]:
import mplfinance as mpf
import matplotlib
import pandas as pd
matplotlib.use('TkAgg')  # Or another interactive backend

# mplfinance expects a DatetimeIndex and columns named Open/High/Low/Close:
# index the bars by their parsed 'end_time' and keep only the OHLC columns.
df = (
    results
    .assign(end_time=pd.to_datetime(results['end_time']))
    .set_index('end_time')
    .loc[:, ['open', 'high', 'low', 'close']]
    .rename(columns={
        'open': 'Open',
        'high': 'High',
        'low': 'Low',
        'close': 'Close'
    })
)

# mpf.plot only returns (figure, axes) when returnfig=True; without it the call
# returns None and the unpacking below fails.
fig, ax = mpf.plot(
    df,
    type='candle',
    title='Gráfico OHLC',
    volume=False,
    show_nontrading=False,
    returnfig=True,
)

# Use a logarithmic price scale on the main panel
ax[0].set_yscale('log')

# With returnfig=True the figure is not shown automatically.
mpf.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go

df = results.copy()

# NOTE(review): assumes 'end_time' is datetime-like (or epoch nanoseconds);
# pass unit=... to pd.to_datetime if it is stored in another unit - TODO confirm.
df['end_time'] = pd.to_datetime(df['end_time'])

# String label used as the category on the x axis. It includes seconds and
# microseconds: with minute resolution, bars that end within the same minute
# would share one category and be drawn on top of each other.
df['end_time_str'] = df['end_time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# Candlestick chart: one candle per bar
fig = go.Figure(data=[go.Candlestick(x=df['end_time_str'],
                                       open=df['open'],
                                       high=df['high'],
                                       low=df['low'],
                                       close=df['close'],
                                       increasing_line_color='slateblue',
                                       decreasing_line_color='black')])

# Title, axis labels and colours
fig.update_layout(title='Gráfico de Candlestick',
                  xaxis_title='Data',
                  yaxis_title='Preço',
                  xaxis_rangeslider_visible=True,
                  plot_bgcolor='white',
                  paper_bgcolor='whitesmoke')


# Treat x values as categories so bars are evenly spaced regardless of the
# time elapsed between them.
fig.update_xaxes(type='category', tickangle=-45)

# Logarithmic price scale
fig.update_yaxes(
    type='log',
    tickmode='auto'
)

# Display the chart
fig.show()