In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

from typing import Callable
from pandas import DataFrame

from analysis.analyzers.multi_ticker_data_loader import MultiTickerDataLoader
from analysis.featurizers.ohlcv_to_ohlcv_and_deltas import ohlcv_to_ohlcv_and_deltas
from analysis.segmenters.striding_segmenter import striding_segmenter_builder

# EXPERIMENTAL PARAMETER
# Feature transformation selected for this experiment.
# NOTE(review): `transformation` is not referenced again in the cells visible here.
transformation: Callable[..., DataFrame] = ohlcv_to_ohlcv_and_deltas

# EXPERIMENTAL PARAMETERS
# Window length and stride passed to the striding segmenter below
# (presumably counted in rows -- confirm against striding_segmenter_builder).
length = 10
stride = 8

# NOTE(review): `segmenter` is not referenced again in the cells visible here, and
# the window builder defined in the next cell does not read `length`/`stride`
# from these globals -- keep the two in sync by hand or wire them together.
segmenter = striding_segmenter_builder(
    length=length,
    stride=stride,
)

# Loader shared by every later cell; constructed for the two tickers under study.
m: MultiTickerDataLoader = MultiTickerDataLoader(
    tickers=['QQQ', 'SPY'],
)

# Later cells read `m.data['QQQ']['raw']`; presumably this call is what
# populates `m.data` -- confirm in MultiTickerDataLoader.
m.load()

In [3]:
import pandas as pd
# import xarray as xr

def build_windows(
        df: DataFrame,
        length: int = 10,
        stride: int = 8,
) -> list[DataFrame]:
    """Slice ``df`` into fixed-length windows that each lie within one calendar day.

    Candidate windows start at row positions ``0, stride, 2 * stride, ...`` and
    span ``length`` consecutive rows. A candidate is kept only when every value
    in its ``'date'`` column falls on the same calendar date (time of day is
    ignored).

    Args:
        df: Frame with a ``'date'`` column that ``pd.to_datetime`` can parse.
        length: Number of rows per window. Defaults to 10, the value that was
            previously hard-coded.
        stride: Row offset between consecutive candidate starts. Defaults to 8,
            the value that was previously hard-coded.

    Returns:
        The kept windows in row order; each one is an ``iloc`` slice of ``df``.
        An empty list when ``df`` has fewer than ``length`` rows.

    Raises:
        ValueError: If ``length`` or ``stride`` is not positive.
    """
    if length <= 0 or stride <= 0:
        raise ValueError(
            f"length and stride must be positive, got length={length}, stride={stride}"
        )

    # Too short to hold even one window; return before touching the 'date' column.
    if len(df) < length:
        return []

    # Loop-invariant work: parse the dates once for the whole frame instead of
    # re-parsing every window, then drop the time-of-day component.
    calendar_dates = pd.to_datetime(df['date']).dt.date

    windows: list[DataFrame] = []

    for start in range(0, len(df) - length + 1, stride):
        stop = start + length

        # Skip windows that straddle a day boundary.
        if calendar_dates.iloc[start:stop].nunique() != 1:
            continue

        windows.append(df.iloc[start:stop])

    return windows
            
# Hand the window builder to the loader as a callback. How the loader applies it
# (e.g. per ticker, and where it stores the result) is defined in
# MultiTickerDataLoader and is not visible in this notebook.
m.build_windows(window_builder=build_windows)


In [4]:
from pandas.core.frame import DataFrame
from analysis.analyzers.multi_ticker_data_loader import InputOutput

def build_input_output(
        windows: list[DataFrame],
        lookback: int = 40,
) -> list[InputOutput]:
    """Pair each output window with the ``lookback`` raw rows that precede it.

    For every window, the first row of the raw frame whose ``'date'`` equals the
    window's first ``'date'`` is located, and the ``lookback`` rows immediately
    before it become the input. A pair is emitted only when that input has
    exactly ``lookback`` rows and all of them fall on one calendar date.

    Args:
        windows: Output windows; each must have a ``'date'`` column.
        lookback: Number of raw rows used as input. Defaults to 40, the value
            that was previously hard-coded.

    Returns:
        One ``InputOutput`` per window that passed every check, in the order the
        windows were given.

    Raises:
        ValueError: If ``lookback`` is not positive.
    """
    if lookback <= 0:
        raise ValueError(f"lookback must be positive, got {lookback}")

    # NOTE(review): the raw frame is always taken from 'QQQ', regardless of which
    # ticker the incoming windows belong to -- confirm this is intended.
    raw_data: DataFrame = m.data['QQQ']['raw']

    # Loop-invariant work: parse the raw dates once, keeping only the calendar day.
    raw_calendar_dates = pd.to_datetime(raw_data['date']).dt.date

    inputs_outputs: list[InputOutput] = []

    for window in windows:
        window_start_time = window['date'].iloc[0]

        # Work with row *positions*, not index labels: the slices below use
        # `.iloc`, and the two only coincide for a default RangeIndex.
        matching_positions = (raw_data['date'] == window_start_time).to_numpy().nonzero()[0]
        if matching_positions.size == 0:
            continue

        start_position = int(matching_positions[0])

        # Not enough history. Without this guard a negative slice start would
        # wrap around to the end of the frame and yield a short or wrong input.
        if start_position < lookback:
            continue

        history = slice(start_position - lookback, start_position)

        # The input must not straddle a day boundary.
        if raw_calendar_dates.iloc[history].nunique() != 1:
            continue

        inputs_outputs.append(
            InputOutput(
                input=raw_data.iloc[history],
                output=window,
            )
        )

    return inputs_outputs

# Hand the input/output builder to the loader as a callback. The plotting cell
# below reads `m.data['QQQ']['labeled']`, so presumably this call is what
# populates the 'labeled' entries -- confirm in MultiTickerDataLoader.
m.build_inputs_outputs(io_builder=build_input_output)

### As a sanity check, we visualize the inputs and outputs generated so far:

In [49]:
# Plot the input and the output next to each other, for 5 random entries. Have the inputs on the left, the outputs on the right, and lay out the entries vertically.

import random
# import matplotlib.pyplot as plt

from pandas import DataFrame
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from analysis.analyzers.multi_ticker_data_loader import InputOutput

# from datetime import datetime


# Labeled input/output pairs to draw the sanity-check samples from.
inputs_outputs_to_sample_from: list[InputOutput] = m.data['QQQ']['labeled']

if not inputs_outputs_to_sample_from:
    raise ValueError("No labeled input/output pairs available to plot for 'QQQ'.")

# Plot at most 5 pairs; clamp so a small dataset does not make random.sample fail.
num_samples: int = min(5, len(inputs_outputs_to_sample_from))

# One subplot row per sampled pair: input on the left, output on the right.
fig: go.Figure = make_subplots(rows=num_samples, cols=2, subplot_titles=('Input', 'Output'))

sampled_indices = random.sample(range(len(inputs_outputs_to_sample_from)), num_samples)

for row_number, sampled_index in enumerate(sampled_indices, start=1):
    input_output: InputOutput = inputs_outputs_to_sample_from[sampled_index]

    # Column 1 shows the input frame, column 2 the output frame. The candlestick
    # traces are added straight to the grid; no intermediate figures are needed,
    # and no variable named `input` is created that would shadow the builtin.
    for col_number, ohlc_frame in enumerate(
            (input_output['input'], input_output['output']),
            start=1,
    ):
        fig.add_trace(
            go.Candlestick(
                x=ohlc_frame['date'],
                open=ohlc_frame['open'],
                high=ohlc_frame['high'],
                low=ohlc_frame['low'],
                close=ohlc_frame['close']
            ),
            row=row_number,
            col=col_number,
        )

# The return values are intentionally not bound to `_`: assigning to `_` in a
# notebook overrides IPython's last-output variable.
fig.update_layout(
    height=300*num_samples,
    width=1000,
    title_text="Input and Output Candlestick Charts"
)

# Hide the per-subplot range sliders that candlestick traces enable by default.
fig.update_xaxes(rangeslider_visible=False)

fig.show()

### Next up, we're going to train a model on all the data (all IO pairs for all symbols)