In [20]:
import pandas as pd
from graphsParsing.graphs_parsing_tools import BinanceDataCollector
import os


# Define parameters for graph data collection
symbol = 'BTCUSDT'
interval = '5m'  
days_back = 730 

In [22]:
# Load the CSV file into a DataFrame
news = pd.read_csv('./train_news_dataset/merged_news.csv')

# Convert 'date_time' to datetime format and filter out rows with invalid date formats
news['date_length'] = news['date_time'].apply(lambda x: len(x)) 
filtered_news = news[news['date_length'] == 19] # filtering for date_time strings of length 19 (to convert to datetime, because 3 of them are not the same format)
filtered_news = filtered_news.drop(columns=['date_length'])
filtered_news['date_time'] = pd.to_datetime(filtered_news['date_time'])

filtered_news.to_csv('newsAndPriceMapping/news.csv', index=False)

# Sort the DataFrame by 'date_time'
filtered_news = filtered_news.sort_values('date_time')

In [None]:
# Collect historical data using BinanceDataCollector <------- UNCOMMENT THIS PART TO RUN DATA COLLECTION 
# collector = BinanceDataCollector()

# symbol = 'BTCUSDT'  
# interval = '5m'     
# days_back = 365 

# print("Начинаем сбор данных...")
# raw_data = collector.collect_historical_data(symbol, interval, days_back)

# if raw_data:
#     # Process the raw data into a DataFrame, drop nans and add indicators
#     df = collector.process_data_to_dataframe(raw_data)
#     df = collector.fix_data()
#     df = collector.add_all_indicators_finta()
    
#     # Save the DataFrame to a CSV file
#     filename = f'{symbol}_{interval}_{days_back}days.csv'
#     collector.save_data(df, filename)

#     print(f"Данные успешно собраны и сохранены в файл {filename}")
# else:
#     print("Не удалось собрать данные")

Начинаем сбор данных...
Данные успешно собраны и сохранены в файл BTCUSDT_5m_365days.csv


In [23]:
# Load the graph data from a CSV file
dirname = 'data'
filename = os.path.join(dirname, f'BTCUSDT_{interval}_{days_back}days.csv')

graph_data = pd.read_csv(filename)
graph_data = graph_data.rename(columns={'open_time':'date_time'}) # Rename 'open_time' to 'date_time' for consistency
graph_data['date_time'] = pd.to_datetime(graph_data['date_time']) # Convert 'date_time' to datetime format

graph_data.sort_values('date_time', inplace=True) 

In [None]:
# Merge graph data with news data based on date_time, using binary search for efficiency
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
import os


# Constants for merging
RESULT_FILENAME = 'result_data.csv'
DIRNAME = 'result_train_data'
THRESHOLD = pd.Timedelta('1d').value 
SEP = '\n' # Separator for news items per graph point

# Create directory for results if it doesn't exist
os.makedirs(DIRNAME, exist_ok=True)
PATH_TO_FILE = os.path.join(DIRNAME, RESULT_FILENAME)

# Prepare news data for merging
filtered_news = filtered_news.reset_index(drop=True)
news_texts = filtered_news.apply(lambda row: row['title'] if pd.isna(row['article_text']) else row['title'] + ' ' + row['article_text'], axis=1).astype(str).values # Combine title and article_text into a single string for each news item <------- may experiment with title and article_text separator (or just use title or article_text)
news_texts = filtered_news.apply(lambda row: row['title'], axis=1).astype(str).values # Use only title for news items for space efficiency <------- delete this line if you want to use both title and article_text

# Extract date_time values from graph data and news data
graph_date = graph_data['date_time'].reset_index(drop=True).values
news_date = filtered_news['date_time'].values


# Perform binary search to find the starting indices of news articles for each graph date as anchors
start_indices = np.searchsorted(graph_date, news_date, side='left')

prev_left = 0
columns = list(graph_data.columns) + ['news']

# Writing merged data to CSV file
with open(PATH_TO_FILE, 'w', newline='', encoding='utf-8') as f:

    writer = csv.writer(f)
    writer.writerow(columns)

    # Iterate through each date in the graph data and find corresponding news using binary search (searchsorted)
    for i in tqdm(range(len(graph_date))):
        current_date = graph_date[i]
        right_idx = start_indices[i]
        
        min_date = current_date - THRESHOLD
        left_idx = np.searchsorted(news_date[prev_left:right_idx], min_date, side='left') + prev_left
        news_to_append = 'NONE'

        # Check if there are news articles within the threshold
        if left_idx < right_idx:
            news_slice = news_texts[left_idx:right_idx]
            news_to_append = SEP.join(news_slice) # <------- may experiment with different separators for news items
        
        row_to_append = list(graph_data.iloc[i].values) + [news_to_append]
        writer.writerow(row_to_append)
        prev_left = left_idx

 51%|█████     | 106321/210030 [00:09<00:09, 10702.05it/s]


KeyboardInterrupt: 

# With vectorized articles text

In [24]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
import os
import pickle
from typing import Optional, List, Union
import logging

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [25]:
RESULT_FILENAME = 'result_data_vectorized_news.csv'
DIRNAME = 'result_train_data'
THRESHOLD = pd.Timedelta('1d').value
MAX_LENGTH = 1024
SEP = '[SEP]'

In [29]:
news_date

array(['2011-06-22T10:56:00.000000000', '2011-06-22T10:56:00.000000000',
       '2011-06-22T10:56:00.000000000', ...,
       '2025-06-03T22:31:49.000000000', '2025-06-03T22:31:49.000000000',
       '2025-06-04T14:01:05.000000000'],
      shape=(239161,), dtype='datetime64[ns]')

In [31]:
start_indices

array([     0,      0,      0, ..., 199453, 199453, 199639],
      shape=(239161,))

In [27]:
# Create directory for results if it doesn't exist
os.makedirs(DIRNAME, exist_ok=True)
PATH_TO_FILE = os.path.join(DIRNAME, RESULT_FILENAME)

# Prepare news data for merging
filtered_news = filtered_news.reset_index(drop=True)
news_texts = filtered_news.apply(lambda row: row['title'] if pd.isna(row['article_text']) else row['title'] + ': ' + row['article_text'], axis=1).astype(str).values # Combine title and article_text into a single string for each news item <------- may experiment with title and article_text separator (or just use title or article_text)

# Extract date_time values from graph data and news data
graph_date = graph_data['date_time'].reset_index(drop=True).values
news_date = filtered_news['date_time'].values


# Perform binary search to find the starting indices of news articles for each graph date as anchors
start_indices = np.searchsorted(graph_date, news_date, side='left')

prev_left = 0
columns = list(graph_data.columns) + ['news']

# Writing merged data to CSV file
with open(PATH_TO_FILE, 'w', newline='', encoding='utf-8') as f:

    writer = csv.writer(f)
    writer.writerow(columns)

    # Iterate through each date in the graph data and find corresponding news using binary search (searchsorted)
    for i in tqdm(range(len(graph_date))):
        current_date = graph_date[i]
        right_idx = start_indices[i]
        
        min_date = current_date - THRESHOLD
        left_idx = np.searchsorted(news_date[prev_left:right_idx], min_date, side='left') + prev_left
        news_to_append = 'NONE'

        # Check if there are news articles within the threshold
        if left_idx < right_idx:
            news_slice = news_texts[left_idx:right_idx]
            news_to_append = SEP.join(news_slice) # <------- may experiment with different separators for news 
            
            tokenized_data = tokenizer(
                news_to_append,
                return_tensors='np',
                max_length=MAX_LENGTH,
                truncation=True,
            )

            outputs = model(**tokenized_data)
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

            print(embedding)
            break
        else:
            embedding = np.zeros(768)

        row_to_append = list(graph_data.iloc[i].values) + [embedding]
        writer.writerow(row_to_append)
        prev_left = left_idx

 14%|█▍        | 29483/210030 [01:26<08:51, 339.56it/s]


KeyboardInterrupt: 