In [1]:
import pandas as pd
from graphsParsing.graphs_parsing_tools import BinanceDataCollector
import os


# Define parameters for graph data collection
symbol = 'BTCUSDT'
interval = '5m'  
days_back = 730  

In [3]:
# Load the CSV file into a DataFrame
news = pd.read_csv('result_news_data.csv')

# Convert 'date_time' to datetime format and filter out rows with invalid date formats
news['date_length'] = news['date_time'].apply(lambda x: len(x)) 
filtered_news = news[news['date_length'] == 19] # filtering for date_time strings of length 19 (to convert to datetime, because 3 of them are not the same format)
filtered_news = filtered_news.drop(columns=['date_length'])
filtered_news['date_time'] = pd.to_datetime(filtered_news['date_time'])

filtered_news.to_csv('newsAndPriceMapping/news.csv', index=False)

# Sort the DataFrame by 'date_time'
filtered_news = filtered_news.sort_values('date_time')

In [None]:
# Load the graph data from a CSV file
dirname = 'data'
filename = os.path.join(dirname, f'BTCUSDT_{interval}_{days_back}days.csv')

graph_data = pd.read_csv(filename)
graph_data = graph_data.rename(columns={'open_time':'date_time'}) # Rename 'open_time' to 'date_time' for consistency
graph_data['date_time'] = pd.to_datetime(graph_data['date_time']) # Convert 'date_time' to datetime format

graph_data.sort_values('date_time', inplace=True) 

In [4]:


# collector = BinanceDataCollector()


# print(f"\n{'='*50}")
# print(f"Собираем данные для {symbol}")
# print(f"{'='*50}")

# raw_data = collector.collect_historical_data(symbol, interval, days_back)

# if raw_data:
#     df = collector.process_data_to_dataframe(raw_data)
#     filename = f'{symbol}_{interval}_{days_back}days.csv'
#     collector.save_data(df, filename)
    
#     print(f"✅ Данные для {symbol} сохранены: {len(df)} записей")
# else:
#     print(f"❌ Ошибка при сборе данных для {symbol}")
# 

In [8]:
filtered_news

Unnamed: 0.1,Unnamed: 0,date_time,title,article_text
28332,0,2011-06-22 10:56:00,Compromised account leads to massive Bitcoin s...,"Bitcoin, for those not aware, is a completely ..."
127732,99400,2011-06-22 10:56:00,Compromised account leads to massive Bitcoin s...,"Bitcoin, for those not aware, is a completely ..."
158358,130026,2011-06-22 10:56:00,Compromised account leads to massive Bitcoin s...,"Bitcoin, for those not aware, is a completely ..."
127733,99401,2012-02-01 18:02:32,Bitcoin May Be The Currency Of The Future,Have you heard of Bitcoin? If you're a fan of ...
158359,130027,2012-02-01 18:02:32,Bitcoin May Be The Currency Of The Future,Have you heard of Bitcoin? If you're a fan of ...
...,...,...,...,...
127728,99396,2025-06-03 20:05:48,Why Is Coinbase (COIN) Stock Soaring Today,Shares of blockchain infrastructure company Co...
127729,99397,2025-06-03 21:37:42,How Glassnode’s data revolution empowers wealt...,How Glassnode’s data revolution empowers wealt...
158357,130025,2025-06-03 22:31:49,NYSE Arca submits filing for listing of Truth ...,(Reuters) -NYSE Arca on Tuesday submitted a fi...
127730,99398,2025-06-03 22:31:49,NYSE Arca submits filing for listing of Truth ...,(Reuters) -NYSE Arca on Tuesday submitted a fi...


In [None]:
# Merge graph data with news data based on date_time, using binary search for efficiency
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
import os


# Constants for merging
RESULT_FILENAME = 'result_data.csv'
DIRNAME = 'result_train_data'
THRESHOLD = pd.Timedelta('1d').value 
SEP = '\n' # Separator for news items per graph point

# Create directory for results if it doesn't exist
os.makedirs(DIRNAME, exist_ok=True)
PATH_TO_FILE = os.path.join(DIRNAME, RESULT_FILENAME)

# Prepare news data for merging
filtered_news = filtered_news.reset_index(drop=True)
news_texts = filtered_news.apply(lambda row: row['title'] if pd.isna(row['article_text']) else row['title'] + ' ' + row['article_text'], axis=1).astype(str).values # Combine title and article_text into a single string for each news item <------- may experiment with title and article_text separator (or just use title or article_text)
news_texts = filtered_news.apply(lambda row: row['title'], axis=1).astype(str).values # Use only title for news items for space efficiency <------- delete this line if you want to use both title and article_text

# Extract date_time values from graph data and news data
graph_date = graph_data['date_time'].reset_index(drop=True).values
news_date = filtered_news['date_time'].values


# Perform binary search to find the starting indices of news articles for each graph date as anchors
start_indices = np.searchsorted(graph_date, news_date, side='left')

prev_left = 0
columns = list(graph_data.columns) + ['news']

# Writing merged data to CSV file
with open(PATH_TO_FILE, 'w', newline='', encoding='utf-8') as f:

    writer = csv.writer(f)
    writer.writerow(columns)

    # Iterate through each date in the graph data and find corresponding news using binary search (searchsorted)
    for i in tqdm(range(len(graph_date))):
        current_date = graph_date[i]
        right_idx = start_indices[i]
        
        min_date = current_date - THRESHOLD
        left_idx = np.searchsorted(news_date[prev_left:right_idx], min_date, side='left') + prev_left
        news_to_append = 'NONE'

        # Check if there are news articles within the threshold
        if left_idx < right_idx:
            news_slice = news_texts[left_idx:right_idx]
            news_to_append = SEP.join(news_slice) # <------- may experiment with different separators for news items
        
        row_to_append = list(graph_data.iloc[i].values) + [news_to_append]
        writer.writerow(row_to_append)
        prev_left = left_idx

100%|██████████| 210030/210030 [01:18<00:00, 2672.57it/s]
