In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

In [2]:
# for plotting, run: pip install pandas matplotlib
from tqdm.notebook import tqdm
import yaml

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from chronos import ChronosPipeline
from sklearn.metrics import mean_absolute_percentage_error as MAPE

from src.data.load_data import pipeline_data

%load_ext autoreload
%autoreload 2

# Toy example

In [3]:
# pipeline = ChronosPipeline.from_pretrained(
#     "amazon/chronos-t5-small",
#     device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
#     torch_dtype=torch.bfloat16,
# )

# df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")
# df.head()

# # context must be either a 1D tensor, a list of 1D tensors,
# # or a left-padded 2D tensor with batch as the first dimension
# context = torch.tensor(df["#Passengers"])
# prediction_length = 12
# forecast = pipeline.predict(
#     context,
#     prediction_length,
#     num_samples=20,
#     temperature=1.0,
#     top_k=50,
#     top_p=1.0,
# ) # forecast shape: [num_series, num_samples, prediction_length]

# # visualize the forecast
# forecast_index = range(len(df), len(df) + prediction_length)
# low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

# plt.figure(figsize=(8, 4))
# plt.plot(df["#Passengers"], color="royalblue", label="historical data")
# plt.plot(forecast_index, median, color="tomato", label="median forecast")
# plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
# plt.legend()
# plt.grid()
# plt.show()

In [4]:
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-small",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")

# context must be either a 1D tensor, a list of 1D tensors,
# or a left-padded 2D tensor with batch as the first dimension
context = torch.tensor(df["#Passengers"])
embeddings, tokenizer_state = pipeline.embed(context)



In [5]:
context.shape, embeddings.shape

(torch.Size([144]), torch.Size([1, 145, 512]))

# our data example

## config

In [6]:
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

use_pct_changes = False

## data preprocessing 

In [7]:
df = pipeline_data(col_agg_finctions=col_agg_finctions)

## chronos preprocessing

In [8]:
X_baseline = df.reset_index()

In [9]:
X_chron = X_baseline.pivot(index='Datetime', columns='Stock')\
    .reset_index()
X_chron.columns = X_chron.columns.droplevel()
X_chron.columns = ['Datetime'] + X_chron.columns.tolist()[1:]
X_chron

Unnamed: 0,Datetime,AAPL,ABBV,ABT,AMD,BAC,CMCSA,CRM,CSCO,CVX,...,NVDA,PEP,PFE,PG,TSLA,V,VZ,WFC,WMT,XOM
0,2023-01-30 17:00:00,145.15,146.32,110.130,73.950,35.490,39.370,166.12,48.450,176.79,...,130674,6845,101778,8754,619472,17282,133297,79262,13907,44389
1,2023-01-30 18:00:00,143.97,146.74,110.350,72.760,35.410,39.410,165.20,48.285,176.90,...,155991,22557,155382,17376,858003,34358,146068,150905,23264,47074
2,2023-01-30 19:00:00,144.07,146.36,110.500,73.605,35.495,39.450,165.18,48.320,176.02,...,92639,21045,185167,16732,434266,19724,127442,107253,16433,49605
3,2023-01-30 20:00:00,143.38,145.67,110.260,73.185,35.420,39.410,165.18,48.270,175.45,...,81168,23416,99458,17082,264365,10681,91513,99694,14201,38770
4,2023-01-30 21:00:00,143.31,145.89,110.210,72.960,35.420,39.410,165.66,48.310,175.45,...,61279,12312,101437,14687,382839,13021,93335,96519,11797,31317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1502,2024-01-30 18:00:00,188.41,163.79,112.660,176.630,34.490,46.485,287.00,52.080,149.36,...,205395,24443,729516,39876,443406,20571,158275,101913,21437,159389
1503,2024-01-30 19:00:00,188.66,163.95,112.800,176.530,34.810,46.605,287.25,52.085,149.52,...,79521,10234,371461,32265,295428,20999,92370,49361,14833,90110
1504,2024-01-30 20:00:00,187.98,164.39,113.230,173.550,34.810,46.690,287.51,52.190,149.12,...,130748,8674,232921,19543,237706,14822,101060,152049,19587,54009
1505,2024-01-30 21:00:00,188.04,164.61,113.410,170.660,34.810,46.625,288.47,52.200,149.73,...,112609,10492,199257,37485,186085,11235,61075,44407,18679,61363


In [10]:
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-mini",
    device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)



config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [11]:
prediction_length = 1

test_indexes = X_chron[X_chron['Datetime'] > test_start].index
X_chron_stocks = X_chron.drop(columns=['Datetime'])

seq_len_test = len(test_indexes)
n_stocks = X_chron_stocks.shape[1]
y_pred_all = np.zeros((seq_len_test, n_stocks))
y_test_all = np.zeros((seq_len_test, n_stocks))

for i, test_idx in enumerate(tqdm(test_indexes)):
    X_stock_test = X_chron_stocks.iloc[:test_idx-1]
    y_true = X_chron_stocks.iloc[test_idx].values
    chron_input = torch.tensor(X_stock_test.values.T)
    
    forecast = pipeline.predict(
        chron_input,
        prediction_length,
        num_samples=50,
        temperature=1.0,
        top_k=50,
        top_p=1.0,
    ) 

    pred = np.median(forecast.numpy(), axis=1).flatten()
    y_pred_all[i] = y_true
    y_test_all[i] += pred

y_pred_all = y_pred_all.flatten()
y_test_all = y_test_all.flatten()
print('MAPE на тесте', MAPE(y_test_all, y_pred_all))

  0%|          | 0/364 [00:00<?, ?it/s]

MAPE на тесте 0.11131749302428685
