In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as plx

from stooq import *
import json
from sklearn.metrics import r2_score

In [2]:
# Altere estas variáveis caso os arquivos não tenham sido encontrados
stooq_dir = 'data/stooq'
fundamentals_dir = 'data/sec/companyfacts'
tickers_path = 'data/sec/company_tickers.json'

In [3]:
period = '5 min'
market = 'us'
sample_companies = [
    'MSFT.US',
    'AVGO.US',
    'NVDA.US',
    'TSM.US',
    'TSLA.US',
    'META.US',
    'IBM.US',
    'AAPL.US',
    'ASML.US',
    'AMZN.US',
    'GOOG.US',
    'LLY.US',
    'WMT.US',
    'JPM.US',
    'V.US',
    'UNH.US',
    'XOM.US',
    'ORCL.US',
    'MA.US'
]

In [4]:
cik_by_ticker = None
with open(tickers_path, 'r') as file:
    cik_by_ticker = json.load(file)
cik_by_ticker = { value['ticker'] + '.US': value['cik_str'] for _key, value in cik_by_ticker.items() }

In [5]:
import av_api as av
import json
import os

In [6]:
ALPHA_VANTAGE_KEY = os.environ['ALPHA_VANTAGE_KEY']
api = av.AlphaVantage(ALPHA_VANTAGE_KEY)

In [7]:
#av_response = api.get_earnings(symbol='LLY')

In [8]:
#lly_earnings = json.loads(av_response)
#lly_earnings = pd.DataFrame(lly_earnings['quarterlyEarnings'])
#lly_earnings.columns = ['period_end', 'date', 'eps', 'estimated_eps', 'surprise', 'surprise_pct', 'report_time']
#lly_earnings.set_index('date', inplace=True)
#lly_earnings.sort_index(inplace=True)
#lly_earnings['eps_ms'] = lly_earnings['eps'].rolling(4).sum()
#lly_earnings.index = pd.to_datetime(lly_earnings.index)
#lly_earnings.to_parquet('data/alpha_vantage/lly_earnings.parquet')

In [9]:
lly_earnings = pd.read_parquet('data/alpha_vantage/lly_earnings.parquet')

In [10]:
lly_earnings['report_time'].value_counts()

report_time
pre-market    114
Name: count, dtype: int64

In [11]:
lly_earnings

Unnamed: 0_level_0,period_end,eps,estimated_eps,surprise,surprise_pct,report_time,eps_ms
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1996-04-15,1996-03-31,0.36,0.36,0,0,pre-market,
1996-07-16,1996-06-30,0.32,0.32,0,0,pre-market,
1996-10-22,1996-09-30,0.38,0.32,0.06,18.75,pre-market,
1997-01-27,1996-12-31,0.34,0.34,0,0,pre-market,1.40
1997-04-21,1997-03-31,0.41,0.4,0.01,2.5,pre-market,1.45
...,...,...,...,...,...,...,...
2023-08-08,2023-06-30,2.11,1.98,0.13,6.5657,pre-market,7.80
2023-11-02,2023-09-30,0.1,-0.13,0.23,176.9231,pre-market,5.92
2024-02-06,2023-12-31,2.49,2.22,0.27,12.1622,pre-market,6.32
2024-04-30,2024-03-31,2.58,2.46,0.12,4.878,pre-market,7.28


In [12]:
prices = build_prices_table(stooq_dir, period, market)
add_one_cent_noise(prices)

Unnamed: 0,<TICKER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
3784872,A.US,2024-08-26,153000,141.110,141.11,140.5300,140.910380,18411.0
3784873,A.US,2024-08-26,153500,140.800,140.90,140.6550,140.715247,13321.0
3784874,A.US,2024-08-26,154000,140.810,140.96,140.6200,140.721586,6411.0
3784875,A.US,2024-08-26,154500,140.850,141.05,140.7250,140.882872,5830.0
3784876,A.US,2024-08-26,155000,140.875,141.02,140.5900,140.594525,6260.0
...,...,...,...,...,...,...,...,...
8318828,ZZZ.US,2024-10-07,153000,24.280,24.28,24.2800,24.284390,105.0
8318829,ZZZ.US,2024-10-08,164000,24.320,24.32,24.3200,24.329704,110.0
8318830,ZZZ.US,2024-10-08,170000,24.400,24.40,24.3999,24.400740,200.0
8318831,ZZZ.US,2024-10-09,154500,24.380,24.38,24.3800,24.382044,103.0


In [13]:
#prices = pd.read_parquet('data/stooq/5 min/us/all_stocks_processed.parquet')
prices.columns = [col[1:-1].lower() for col in prices.columns]
prices = prices[ prices['ticker'] == 'LLY.US' ].copy()

In [14]:
prices['date'] = pd.to_datetime(prices['date'].astype(str))
prices['time'] = pd.to_timedelta(prices['time'].astype(str).apply(lambda x: f'{x[0:2]}:{x[2:4]}:{x[4:6]}'))
prices['datetime'] = prices.date + prices.time
#prices['prev_close'] = prices['close'].shift()
#prices['returns'] = prices['close'] / prices['prev_close']
prices.set_index('datetime', inplace=True)

In [15]:
prices = prices.merge(lly_earnings, 'outer', left_index=True, right_index=True)
prices['eps_ms'] = prices['eps_ms'].ffill()

In [16]:
final_df = prices[['close', 'eps_ms']].dropna()
final_df['log_adj_ey'] = np.log(1 + final_df['eps_ms']/final_df['close'].shift())
final_df.replace([-np.inf, np.inf], np.nan, inplace=True)
final_df.dropna(inplace=True)

In [17]:
final_df['log_close'] = np.log(final_df['close'])
final_df['log_return'] = np.log(final_df['close']) - np.log(final_df['close']).shift()

In [18]:
interest_rate = pd.read_csv(f'{stooq_dir}/daily/macro/us/inrtus.m.txt')
interest_rate = interest_rate[['<DATE>', '<CLOSE>']].copy()

In [19]:
interest_rate.rename(columns={'<DATE>': 'date', '<CLOSE>': 'interest'}, inplace=True) 
interest_rate['date'] = pd.to_datetime(interest_rate['date'].astype(str))
interest_rate.set_index('date', inplace=True)
interest_rate.sort_index(inplace=True)
final_df = final_df.join(interest_rate, how='outer')
final_df['interest'] = final_df['interest'].ffill()

In [20]:
final_df['log_interest'] = np.log((final_df['interest'] / 100) + 1)

In [21]:
final_df.dropna(inplace=True)

In [22]:
for i in range(0, 32):
    final_df[f'log_return_l{i}'] = final_df['log_close'] - final_df['log_close'].shift(i)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [24]:
final_df.dropna(inplace=True)

In [25]:
#X = pd.concat([final_df.iloc[:, -30:], final_df['log_adj_ey']], axis='columns')
X = final_df.iloc[:, -30:]
y = final_df['log_return']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

In [27]:
model = DecisionTreeRegressor(min_samples_leaf=16).fit(X_train, y_train)

In [28]:
r2_score(y_train, model.predict(X_train))

0.4573504517377407

In [29]:
r2_score(y_test, model.predict(X_test))

0.4068093609214062

In [44]:
ccp_path = model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = ccp_path.ccp_alphas
train_r2s = []
test_r2s = []
for alpha in ccp_alphas:
    model_i = DecisionTreeRegressor(ccp_alpha=alpha).fit(X_train, y_train)
    train_r2s.append(r2_score(y_train, model_i.predict(X_train)))
    test_r2s.append(r2_score(y_test, model_i.predict(X_test)))

In [48]:
ccp_df = pd.DataFrame({
    'train_r2': train_r2s,
    'test_r2': test_r2s
})

In [50]:
plx.line(
    data_frame=ccp_df.melt(ignore_index=False).reset_index(names=['ccp_alpha_index']),
    #title='Cummulative returns - Model vs Buy and hold',
    x='ccp_alpha_index',
    y='value',
    color='variable',
    height=720,
    width=1280,
)





In [51]:
model = DecisionTreeRegressor(ccp_alpha=ccp_alphas[33]).fit(X_train, y_train)

In [52]:
results_test = pd.DataFrame({
    'real': y_test,
    'predicted': model.predict(X_test),
    'error': model.predict(X_test) - y_test,
})

In [53]:
plx.line(
    data_frame=results_test.melt(ignore_index=False).reset_index(names=['date']),
    #title='Cummulative returns - Model vs Buy and hold',
    x='date',
    y='value',
    color='variable',
    height=720,
    width=1280,
)




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [54]:
plx.histogram(
    data_frame=results_test.melt(),
    #title='Comparação de erros - Modelo vs Random walk',
    x='value',
    color='variable',
    barmode='overlay',
    nbins=256,
    height=720,
    width=1280,
)





In [55]:
results_test['model_return'] = ((results_test.predicted > 0).astype(int) * results_test.real).cumsum()
results_test['cumm_return'] = results_test['real'].cumsum()

In [56]:
plx.line(
    data_frame=results_test[['model_return', 'cumm_return']].melt(ignore_index=False).reset_index(names=['date']),
    #title='Cummulative returns - Model vs Buy and hold',
    x='date',
    y='value',
    color='variable',
    height=720,
    width=1280,
)




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

