In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as plx
from pathlib import Path
from random import random
from datetime import timedelta

from sec import *
import json
from tqdm.notebook import tqdm

In [2]:
period = 'daily'
market = 'us'
sample_companies = [
    'MSFT.US',
    'AVGO.US',
    'NVDA.US',
    'TSM.US',
    'TSLA.US',
    'META.US',
    'IBM.US',
    'AAPL.US',
    'ASML.US'
]

In [3]:
prices = pd.read_parquet(Path(f'data/stooq/{period}/{market}/all_stocks_processed.parquet'))
prices.columns = [col[1:-1].lower() for col in prices.columns]
prices['delta_date'] = prices.date - prices.groupby('ticker')['date'].shift()
prices['daily_log_return'] = prices.log_return / prices.delta_date.dt.days

In [4]:
plx.line(
    data_frame=prices[prices['ticker'].isin(sample_companies)], 
    x='date', 
    y='daily_log_return', 
    color='ticker',
    log_y=False,
    height=720,
    width=1280
)

  sf: grouped.get_group(s if len(s) > 1 else s[0])
  v = v.dt.to_pydatetime()


In [5]:
target_cols = [
    'NetIncomeLoss'
    'OperatingIncomeLoss',
    'OperatingExpenses',
    'InterestExpense',
    'IncomeTaxesPaidNet',
    'DepreciationAndAmortization'
]

In [6]:
cik_by_ticker = None
with open('data/sec/company_tickers.json', 'r') as file:
    cik_by_ticker = json.load(file)
cik_by_ticker = { value['ticker'] + '.US': value['cik_str'] for _key, value in cik_by_ticker.items() }

In [7]:
prices['cik'] = prices.ticker.map(cik_by_ticker)
prices.dropna(inplace=True)
prices['cik'] = prices.cik.astype(int)

In [8]:
fundamentals = []
for company_cik in tqdm(prices.cik.unique()):
    company_data = load_company(company_cik)
    if company_data is None: continue
    try: company_data = get_time_series(company_data, 'EarningsPerShareDiluted', 'USD/shares')
    except KeyError: continue
    convert_datetimes(company_data)
    add_time_offsets(company_data)
    try: drop_unused_columns(company_data)
    except KeyError: continue
    quarters, years = get_current_quarters_years(company_data)
    company_data = [add_q4(quarters, years[years.fy == fy]) for fy, fy_quarters in quarters.groupby('fy')]
    try: company_data = pd.concat(company_data, ignore_index=True)
    except ValueError: continue
    company_data['cik'] = company_cik
    fundamentals.append(company_data)
fundamentals = pd.concat(fundamentals, ignore_index=True)

  0%|          | 0/5719 [00:00<?, ?it/s]

In [9]:
sample_ciks = [cik_by_ticker[ticker] for ticker in sample_companies]
fundamentals['cik'] = fundamentals['cik'].astype('object')
plx.scatter(
    data_frame = fundamentals[fundamentals['cik'].isin(sample_ciks)] ,
    x = 'filed',
    y = 'val',
    color = 'cik',
    log_y=False,
    height=720,
    width=1280
)




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [10]:
fundamentals['cik'] = fundamentals['cik'].astype(int)
fundamentals.drop(columns=['end', 'fy', 'fp'], inplace=True)
fundamentals.rename({'val': 'eps', 'filed': 'date'}, inplace=True, axis='columns')
fundamentals.set_index(['cik', 'date'], inplace=True)
fundamentals.drop_duplicates(inplace=True)

In [11]:
prices.set_index(['cik', 'date'], inplace=True)
prices.drop(columns=['open', 'high', 'low', 'vol', 'delta_date', 'daily_log_return'], inplace=True)
prices.drop_duplicates(inplace=True)

In [12]:
final_table = prices.merge(fundamentals, 'outer', left_index=True, right_index=True)
final_table.sort_index(inplace=True)
final_table = final_table.groupby(level='cik').ffill()
final_table.dropna(inplace=True)

In [14]:
final_table

Unnamed: 0_level_0,Unnamed: 1_level_0,ticker,close,return,log_close,log_return,eps,delay,period
cik,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1750,2010-09-23,AIR.US,16.267357,0.972177,2.789160,-0.028218,0.35,23 days,91 days
1750,2010-09-24,AIR.US,17.441464,1.072176,2.858850,0.069690,0.35,23 days,91 days
1750,2010-09-27,AIR.US,16.611855,0.952435,2.810117,-0.048734,0.35,23 days,91 days
1750,2010-09-28,AIR.US,16.876700,1.015943,2.825934,0.015817,0.35,23 days,91 days
1750,2010-09-29,AIR.US,17.003561,1.007517,2.833423,0.007489,0.35,23 days,91 days
...,...,...,...,...,...,...,...,...,...
2001184,2024-05-24,PACS.US,30.030389,1.027722,3.402210,0.027345,0.38,43 days,90 days
2001184,2024-05-28,PACS.US,29.786058,0.991864,3.394040,-0.008169,0.38,43 days,90 days
2001184,2024-05-29,PACS.US,29.453542,0.988837,3.382814,-0.011226,0.38,43 days,90 days
2001184,2024-05-30,PACS.US,29.969700,1.017524,3.400187,0.017373,0.38,43 days,90 days


In [13]:
final_table['log_adj_earnings_yield'] = np.log(1 + final_table.eps/final_table.price)

SyntaxError: invalid syntax (1322498193.py, line 1)

In [None]:
final_table.dropna(inplace=True)