In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

In [2]:
# for plotting, run: pip install pandas matplotlib
from tqdm.notebook import tqdm
import yaml

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from chronos import ChronosPipeline
from sklearn.metrics import mean_absolute_percentage_error as MAPE

from src.data.preprocessing import read_data

# Toy example

In [3]:
# pipeline = ChronosPipeline.from_pretrained(
#     "amazon/chronos-t5-small",
#     device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
#     torch_dtype=torch.bfloat16,
# )

# df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")
# df.head()

# # context must be either a 1D tensor, a list of 1D tensors,
# # or a left-padded 2D tensor with batch as the first dimension
# context = torch.tensor(df["#Passengers"])
# prediction_length = 12
# forecast = pipeline.predict(
#     context,
#     prediction_length,
#     num_samples=20,
#     temperature=1.0,
#     top_k=50,
#     top_p=1.0,
# ) # forecast shape: [num_series, num_samples, prediction_length]

# # visualize the forecast
# forecast_index = range(len(df), len(df) + prediction_length)
# low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

# plt.figure(figsize=(8, 4))
# plt.plot(df["#Passengers"], color="royalblue", label="historical data")
# plt.plot(forecast_index, median, color="tomato", label="median forecast")
# plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
# plt.legend()
# plt.grid()
# plt.show()

In [4]:
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-small",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")

# context must be either a 1D tensor, a list of 1D tensors,
# or a left-padded 2D tensor with batch as the first dimension
context = torch.tensor(df["#Passengers"])
embeddings, tokenizer_state = pipeline.embed(context)



In [5]:
context.shape, embeddings.shape

(torch.Size([144]), torch.Size([1, 145, 512]))

# our data example

## config

In [6]:
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

use_pct_changes = False

## data preprocessing 

In [7]:
with open('configs/best_stocks_nans_rate.yaml') as f:
    best_stocks = yaml.load(f, Loader=yaml.FullLoader)
best_stocks = list(best_stocks.keys())

In [8]:
df = read_data('data/all_tickers.csv')
df_best = df.query("Stock in @best_stocks")
df_best.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Datetime,Stock,Day_week
61,2024-01-30,22:58:00,188.13,188.17,188.11,188.14,1500,2024-01-30 22:58:00,AAPL,Tuesday
62,2024-01-30,22:57:00,188.22,188.22,188.08,188.11,5700,2024-01-30 22:57:00,AAPL,Tuesday
63,2024-01-30,22:56:00,188.22,188.27,188.215,188.22,2600,2024-01-30 22:56:00,AAPL,Tuesday
64,2024-01-30,22:55:00,188.15,188.2,188.14,188.2,2100,2024-01-30 22:55:00,AAPL,Tuesday
65,2024-01-30,22:54:00,188.18,188.18,188.14,188.18,2013,2024-01-30 22:54:00,AAPL,Tuesday


In [9]:
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

df_agg = df_best.set_index('Datetime').groupby(
    ['Stock', pd.Grouper(freq='h')],
).agg(col_agg_finctions)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [10]:
df_original = df_agg.reset_index()

if use_pct_changes:
    df = df_agg.groupby('Stock').pct_change().reset_index()
else:
    df = df_original
df.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


## chronos preprocessing

In [11]:
X_baseline = df.set_index('Datetime').groupby(
    ['Stock', pd.Grouper( freq='h')],
).agg({'Close': 'mean'}).reset_index()
X_baseline.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,145.13
1,AAPL,2023-01-30 18:00:00,144.96
2,AAPL,2023-01-30 19:00:00,143.95
3,AAPL,2023-01-30 20:00:00,144.17
4,AAPL,2023-01-30 21:00:00,143.38


In [12]:
X_chron = X_baseline.pivot(index='Datetime', columns='Stock')\
    .reset_index()
X_chron.columns = X_chron.columns.droplevel()
X_chron.columns = ['Datetime'] + X_chron.columns.tolist()[1:]
X_chron

Unnamed: 0,Datetime,AAPL,ABBV,ABT,AMD,BAC,CMCSA,CRM,CSCO,CVX,...,NVDA,PEP,PFE,PG,TSLA,V,VZ,WFC,WMT,XOM
0,2023-01-30 17:00:00,145.13,145.87,109.76,74.53,35.260,39.03,164.52,48.220,178.010,...,200.84,169.710,43.780,141.130,178.970,229.26,40.800,45.98,141.88,115.28
1,2023-01-30 18:00:00,144.96,146.61,110.23,73.77,35.490,39.41,166.21,48.460,176.480,...,198.50,171.000,43.960,141.840,175.830,229.51,41.140,46.26,142.17,114.03
2,2023-01-30 19:00:00,143.95,146.80,110.39,72.76,35.420,39.42,165.19,48.290,176.740,...,196.07,170.800,43.975,141.530,171.580,229.50,41.140,46.37,141.94,114.92
3,2023-01-30 20:00:00,144.17,146.38,110.52,73.67,35.520,39.45,165.21,48.340,175.855,...,196.32,169.830,43.735,140.990,172.570,229.84,41.105,46.51,141.88,113.87
4,2023-01-30 21:00:00,143.38,145.65,110.21,73.14,35.420,39.40,165.44,48.255,175.470,...,194.44,169.440,43.615,140.660,171.870,229.57,41.080,46.42,141.47,113.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1502,2024-01-30 18:00:00,190.93,163.81,113.31,175.55,34.370,46.06,288.45,52.060,149.000,...,626.72,167.655,27.130,156.170,193.520,275.82,41.990,50.75,164.41,103.22
1503,2024-01-30 19:00:00,188.49,163.69,112.58,176.54,34.470,46.49,286.98,52.070,149.370,...,631.15,168.330,26.955,156.685,194.285,277.51,42.240,50.82,164.78,103.67
1504,2024-01-30 20:00:00,188.76,163.99,112.84,176.88,34.810,46.60,287.54,52.085,149.440,...,632.51,168.140,27.380,156.985,193.960,277.70,42.360,51.05,165.17,103.76
1505,2024-01-30 21:00:00,187.74,164.34,113.26,173.13,34.835,46.71,287.60,52.190,149.150,...,626.00,168.475,27.170,157.290,191.520,277.76,42.410,51.18,165.45,103.68


In [13]:
test_indexes = X_chron[X_chron['Datetime'] > test_start].index
X_chron[X_chron['Datetime'] > test_start]

Unnamed: 0,Datetime,AAPL,ABBV,ABT,AMD,BAC,CMCSA,CRM,CSCO,CVX,...,NVDA,PEP,PFE,PG,TSLA,V,VZ,WFC,WMT,XOM
1143,2023-11-01 17:00:00,171.85,141.99,95.630,104.40,26.510,42.010,203.19,52.210,145.48,...,416.23,164.440,30.650,150.010,201.310,238.79,35.480,39.880,164.00,106.62
1144,2023-11-01 18:00:00,171.42,142.50,95.725,106.06,26.565,41.960,203.82,52.130,145.30,...,419.00,164.400,30.750,149.730,198.660,238.28,35.455,39.850,164.24,106.89
1145,2023-11-01 19:00:00,171.92,142.55,95.360,106.10,26.500,41.605,202.57,51.910,145.27,...,418.76,164.570,30.820,149.660,200.650,238.01,35.320,39.720,164.29,106.56
1146,2023-11-01 20:00:00,171.97,142.51,95.240,106.35,26.245,41.490,201.56,51.920,144.83,...,416.48,164.640,30.720,149.970,201.640,237.21,35.330,39.455,164.26,106.30
1147,2023-11-01 21:00:00,172.43,142.81,95.820,107.11,26.370,41.590,202.68,52.020,145.07,...,418.67,164.930,30.680,150.180,202.510,237.96,35.400,39.610,164.78,106.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1502,2024-01-30 18:00:00,190.93,163.81,113.310,175.55,34.370,46.060,288.45,52.060,149.00,...,626.72,167.655,27.130,156.170,193.520,275.82,41.990,50.750,164.41,103.22
1503,2024-01-30 19:00:00,188.49,163.69,112.580,176.54,34.470,46.490,286.98,52.070,149.37,...,631.15,168.330,26.955,156.685,194.285,277.51,42.240,50.820,164.78,103.67
1504,2024-01-30 20:00:00,188.76,163.99,112.840,176.88,34.810,46.600,287.54,52.085,149.44,...,632.51,168.140,27.380,156.985,193.960,277.70,42.360,51.050,165.17,103.76
1505,2024-01-30 21:00:00,187.74,164.34,113.260,173.13,34.835,46.710,287.60,52.190,149.15,...,626.00,168.475,27.170,157.290,191.520,277.76,42.410,51.180,165.45,103.68


In [14]:
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-base",
    device_map="cuda:1",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)



config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/806M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [15]:
prediction_length = 1

test_indexes = X_chron[X_chron['Datetime'] > test_start].index
X_chron_stocks = X_chron.drop(columns=['Datetime'])

seq_len_test = len(test_indexes)
n_stocks = X_chron_stocks.shape[1]
y_pred_all = np.zeros((seq_len_test, n_stocks))
y_test_all = np.zeros((seq_len_test, n_stocks))

for i, test_idx in enumerate(tqdm(test_indexes)):
    X_stock_test = X_chron_stocks.iloc[:test_idx-1]
    y_true = X_chron_stocks.iloc[test_idx].values
    chron_input = torch.tensor(X_stock_test.values.T)
    
    forecast = pipeline.predict(
        chron_input,
        prediction_length,
        num_samples=50,
        temperature=1.0,
        top_k=50,
        top_p=1.0,
    ) 

    pred = np.median(forecast.numpy(), axis=1).flatten()
    y_pred_all[i] = y_true
    y_test_all[i] += pred

y_pred_all = y_pred_all.flatten()
y_test_all = y_test_all.flatten()
print('MAPE на тесте', MAPE(y_test_all, y_pred_all))

  0%|          | 0/364 [00:00<?, ?it/s]

MAPE на тесте 0.008460443725481551
