In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as plx
import seaborn as sns
import json
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pre-processed daily price table from its parquet file.
prices = pd.read_parquet(
    path='data/stooq/daily/us/all_stocks_processed.parquet',
)

In [3]:
# Normalise the column labels: cut off the first and last character of every
# label and lower-case what is left.
# NOTE(review): presumably the raw labels are wrapped in one delimiter
# character on each side — confirm against the parquet schema.
prices.columns = prices.columns.str[1:-1].str.lower().tolist()

In [4]:
# Input locations, given relative to the notebook's working directory.
stooq_dir, fundamentals_dir, tickers_path = (
    'data/stooq',
    'data/sec/companyfacts',
    'data/sec/company_tickers.json',
)

In [5]:
# Dataset selectors and the hand-picked list of tickers (with the '.US' suffix)
# used as a sample.
period, market = 'daily', 'us'
sample_companies = [
    'MSFT.US', 'AVGO.US', 'NVDA.US', 'TSM.US', 'TSLA.US',
    'META.US', 'IBM.US', 'AAPL.US', 'ASML.US', 'AMZN.US',
    'GOOG.US', 'LLY.US', 'WMT.US', 'JPM.US', 'V.US',
    'UNH.US', 'XOM.US', 'ORCL.US', 'MA.US',
]

In [6]:
# Build a lookup from '<ticker>.US' to the CIK stored under 'cik_str'.
# The JSON file is a mapping whose keys are not used; only its values
# (records carrying 'ticker' and 'cik_str') matter here.
with open(tickers_path, 'r') as file:
    cik_by_ticker = {
        entry['ticker'] + '.US': entry['cik_str']
        for entry in json.load(file).values()
    }

In [7]:
# Attach the CIK for every ticker, discard incomplete rows, and store the CIK
# as an integer.
# Note: dropna() removes rows with a missing value in ANY column, not only
# those whose ticker has no CIK in the lookup.
prices = (
    prices
    .assign(cik=prices['ticker'].map(cik_by_ticker))
    .dropna()
    .astype({'cik': int})
)

In [8]:
# Drop helper columns when present, then remove exact duplicate records BEFORE
# moving cik/date into the index. DataFrame.drop_duplicates ignores the index,
# so de-duplicating after set_index would also discard distinct (cik, date)
# rows whose remaining column values happen to coincide.
prices = (
    prices
    .drop(columns=['delta_date', 'daily_log_return'], errors='ignore')
    .drop_duplicates()
    .set_index(['cik', 'date'])
)

In [13]:
# Keep only the first row for every repeated index entry
# (Index.duplicated marks all but the first occurrence by default).
prices = prices.loc[~prices.index.duplicated()]

In [15]:
# Display the price table for a visual sanity check.
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,ticker,open,high,low,close,vol,return,log_close,log_return
cik,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1090872,1999-11-19,A.US,27.8972,27.9371,25.8613,26.239520,1.677358e+07,0.917694,3.267267,-0.085891
1090872,1999-11-22,A.US,26.8370,28.5858,26.0278,28.595501,7.242576e+06,1.089788,3.353249,0.085983
1090872,1999-11-23,A.US,27.6102,28.3377,25.9889,25.994932,6.579458e+06,0.909057,3.257902,-0.095348
1090872,1999-11-24,A.US,26.0637,27.2445,25.9889,26.682859,5.332648e+06,1.026464,3.284021,0.026120
1090872,1999-11-26,A.US,26.5569,26.9585,26.4752,26.769284,1.904229e+06,1.003239,3.287255,0.003234
...,...,...,...,...,...,...,...,...,...,...
846475,2024-05-24,ZYXI.US,10.2900,10.5200,10.2500,10.525519,8.810700e+04,1.022485,2.353803,0.022236
846475,2024-05-28,ZYXI.US,10.5300,10.5450,10.0100,10.110169,1.407030e+05,0.960539,2.313542,-0.040261
846475,2024-05-29,ZYXI.US,10.1100,10.1100,9.9300,10.029619,1.221660e+05,0.992033,2.305543,-0.007999
846475,2024-05-30,ZYXI.US,10.0500,10.1200,9.8800,9.920865,1.627040e+05,0.989157,2.294640,-0.010903


In [19]:
# Reshape to wide form: one row per date, one column per cik, holding log_return.
# (Alternatives considered: pivot_table, unstack.)
log_returns = (
    prices['log_return']
    .reset_index()
    .pivot(index='date', columns='cik', values='log_return')
)

In [26]:
# Restrict the sample to dates on or after 2000-01-01.
log_returns = log_returns.loc[
    log_returns.index >= datetime(year=2000, month=1, day=1)
]

cik,1750,1800,2230,2488,2809,2969,3197,3453,3499,3545,...,1999480,2000178,2000775,2001184,2002473,2006191,2006291,2007919,2011053,2012807
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,,-0.036254,,0.068469,-0.035224,-0.022325,,,,,...,,,,,,,,,,
2000-01-04,,-0.029980,,-0.058368,-0.024721,-0.031163,,,,,...,,,,,,,,,,
2000-01-05,,-0.001553,,0.025262,-0.009985,0.005819,,,,,...,,,,,,,,,,
2000-01-06,,0.034195,,0.065040,0.019108,0.064439,,,,,...,,,,,,,,,,
2000-01-07,,0.010813,,0.015137,0.025977,0.067212,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-24,0.017624,-0.002751,0.037910,0.036261,0.005838,0.006087,0.042361,0.013134,0.000300,-0.020495,...,0.093668,0.004377,0.000143,0.027345,0.076848,-0.093221,-0.000664,,0.013258,0.001924
2024-05-28,-0.007561,-0.019387,-0.007260,0.031102,0.017854,0.000093,-0.015420,0.021362,-0.003498,0.054667,...,-0.050539,-0.069538,-0.000105,-0.008169,0.029664,-0.071414,,,-0.054536,-0.002014
2024-05-29,-0.021096,-0.012066,-0.019281,-0.038435,-0.022630,-0.016599,-0.015875,-0.018097,0.000531,-0.025017,...,0.021955,0.095012,0.000311,-0.011226,-0.007070,-0.019136,0.001763,,-0.109740,-0.000179
2024-05-30,0.007423,0.009951,-0.005123,0.009693,0.017695,0.000869,-0.011145,0.022099,0.003583,0.029211,...,-0.044995,-0.019329,0.000274,0.017373,-0.005718,0.017565,-0.000196,0.130525,0.135277,0.000973


In [35]:
# Column labels ordered by their share of missing values, most complete first.
# The mean of the boolean NaN mask equals (number of NaNs) / (number of rows).
least_missing = log_returns.isna().mean().sort_values().index

In [39]:
# Keep the 500 columns with the fewest missing values, then keep only the dates
# on which every remaining column has a value.
# errors='ignore' makes the cell safe to re-run: because it overwrites
# log_returns, the columns to drop are already gone on a second execution and
# a plain drop() would raise KeyError.
log_returns = (
    log_returns
    .drop(columns=least_missing[500:], errors='ignore')
    .dropna()
)

In [44]:
# Standardise every column: fit the scaler on the returns, then apply it to
# the same data.
scaler = StandardScaler().fit(log_returns)
scaled_log_returns = scaler.transform(log_returns)

In [45]:
# Fit a PCA model on the standardised returns and project the same data onto
# the fitted components in one step.
# NOTE(review): n_components is left at its default — presumably every
# component is kept; confirm against the scikit-learn documentation.
pca = PCA()
pca_results = pca.fit_transform(scaled_log_returns)

In [50]:
# Show the fitted component matrix as a table for inspection.
pd.DataFrame(data=pca.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.042298,-0.035718,-0.017226,-0.054678,-0.040962,-0.058821,-0.055227,-0.041431,-0.057269,-0.051709,...,-0.054020,-0.049888,-0.044958,-0.054345,-0.050754,-0.037441,-0.051874,-0.041793,-0.043534,-0.018621
1,0.074826,-0.016739,0.033666,0.030475,0.154794,-0.028535,-0.014521,-0.025230,-0.028979,-0.012024,...,-0.017319,-0.048246,0.027837,-0.002991,-0.032829,-0.003182,0.014517,-0.024291,-0.053919,0.037951
2,0.031568,-0.004286,-0.159810,-0.026127,-0.005907,0.031832,0.014636,0.022710,0.010771,0.000837,...,-0.009704,-0.088741,-0.000696,0.007529,-0.002330,0.009412,0.049373,-0.008377,-0.141657,-0.150766
3,-0.059506,-0.098407,-0.018770,-0.029597,0.061421,0.047800,0.070189,0.061696,0.070854,-0.103780,...,0.005116,0.004395,0.018945,-0.007939,0.007247,-0.085823,0.075118,-0.113202,0.040376,-0.010223
4,0.062232,-0.014461,-0.051661,0.039542,-0.015096,0.007088,0.027437,0.021018,0.063596,-0.015268,...,-0.008943,0.016433,0.046767,-0.007866,-0.020754,-0.060176,-0.057605,-0.037455,0.036319,-0.054389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.005696,0.001278,0.003931,0.013643,-0.102327,0.114229,-0.069923,-0.012120,-0.022218,-0.063104,...,-0.011645,-0.039825,0.020273,-0.012390,-0.030342,0.014783,0.061937,-0.002657,0.058278,0.002190
496,0.017241,0.006017,-0.068589,-0.002049,-0.064859,0.028944,-0.014503,-0.056818,0.016796,-0.060012,...,-0.012109,-0.007150,-0.029457,-0.051273,0.012563,0.000079,-0.011362,0.008765,0.034955,0.015817
497,0.006968,0.005815,-0.022804,0.004959,-0.032158,0.034223,-0.016008,-0.003091,0.042622,-0.012233,...,-0.023430,-0.031083,0.026404,-0.013943,0.014342,-0.013749,-0.010242,-0.000575,0.014662,0.023093
498,-0.007771,-0.000374,0.055226,-0.008094,-0.071821,-0.029078,-0.018979,0.032318,0.050684,-0.004453,...,-0.042886,-0.019631,-0.008982,0.012810,0.005008,0.004384,-0.030278,-0.007805,-0.013416,-0.047203


In [47]:
# Show the PCA-transformed observations as a table for inspection.
pd.DataFrame(data=pca_results)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4.996027,-2.077481,-0.097859,1.597446,-0.978524,-0.167301,-2.122911,0.245579,-2.419496,1.023471,...,0.418863,0.005134,-0.321072,0.243361,0.019988,-0.275409,-0.092655,-0.179238,0.454512,-0.045185
1,1.154181,0.352998,-2.502272,1.436186,-0.105040,-1.168089,-0.461103,-0.341891,0.696059,0.622931,...,0.420896,0.070516,-0.287012,-0.338531,-0.017388,-0.113269,0.080142,-0.143686,-0.007579,0.149034
2,-0.201359,-0.364242,-1.644586,2.855798,0.632316,-1.969428,-1.592959,-0.517469,-0.878483,-1.332954,...,-0.147991,-0.178133,0.011883,-0.388333,-0.261389,0.422007,0.474341,-0.175892,0.043535,-0.111784
3,-1.986522,1.524980,2.285625,-1.709101,-0.687960,0.181966,0.202800,3.045166,-1.828815,2.286209,...,0.364558,0.118170,0.242378,0.009637,0.211228,0.289303,-0.244144,-0.375584,0.246951,0.076150
4,5.806270,-0.831894,-1.810455,2.414610,0.112376,2.055816,-1.096209,-2.231595,-0.581378,2.163686,...,-0.410124,-0.124513,-0.112860,-0.064250,0.137186,-0.278046,-0.360556,0.114042,-0.001948,-0.096690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4052,-0.622753,-0.884559,0.496509,-1.579957,0.508800,0.507485,0.843028,2.273093,-4.588432,1.996038,...,0.047216,0.318341,0.134800,-0.211423,0.034356,-0.434273,-0.211291,-0.249233,-0.230169,0.099190
4053,-2.969231,-4.046551,1.540657,-1.882025,-3.474476,-1.225213,-1.702331,0.923940,2.625985,-1.512902,...,-0.430751,0.180576,-0.212002,-0.237762,-0.034878,-0.155106,0.018498,-0.004518,0.192050,-0.317030
4054,-2.256691,-2.331790,2.301085,-2.110897,-0.428159,0.104803,0.634786,3.611365,0.863674,-0.777546,...,0.052228,0.466476,0.077494,0.129832,-0.342647,-0.092252,-0.112094,-0.108557,-0.331374,-0.458240
4055,-8.946885,-0.192885,0.203034,2.716721,-1.059646,-0.707998,-2.925695,-2.241117,2.899356,-0.657003,...,-0.277510,-0.133590,0.059531,0.351176,-0.146816,0.079233,-0.110805,0.366123,-0.169009,-0.112815
