In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as plx
import seaborn as sns
import json
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the price table from a single Parquet file (the path suggests processed
# daily US stock data from Stooq — TODO confirm the file's provenance).
# NOTE(review): the path is hard-coded here and repeats the 'data/stooq',
# 'daily' and 'us' fragments that later cells assign to stooq_dir, period and
# market.
prices = pd.read_parquet('data/stooq/daily/us/all_stocks_processed.parquet')

In [3]:
# Normalise column labels: remove one pair of enclosing angle brackets and
# lower-case the name (e.g. '<TICKER>' -> 'ticker').
# The brackets are stripped only when both are present, so re-running this cell
# is a no-op instead of chewing one more character off each end of every label.
# assumes the raw labels are wrapped as '<NAME>' — TODO confirm against the file
prices.columns = [
    (col[1:-1] if col.startswith('<') and col.endswith('>') else col).lower()
    for col in prices.columns
]

In [4]:
# Input locations, given as relative paths.
# stooq_dir and fundamentals_dir are not referenced by any cell visible in this
# part of the notebook.
stooq_dir = 'data/stooq'
fundamentals_dir = 'data/sec/companyfacts'
# JSON file that a later cell reads to build the ticker -> CIK mapping.
tickers_path = 'data/sec/company_tickers.json'

In [5]:
# Selection parameters. None of the three names below is used by the cells
# visible in this part of the notebook.
# NOTE(review): period/market presumably mirror the 'daily/us' path segments of
# the price file loaded earlier — confirm.
period = 'daily'
market = 'us'
# Hand-picked tickers, written with the same '.US' suffix as the keys of the
# ticker -> CIK mapping built in a later cell.
sample_companies = [
    'MSFT.US',
    'AVGO.US',
    'NVDA.US',
    'TSM.US',
    'TSLA.US',
    'META.US',
    'IBM.US',
    'AAPL.US',
    'ASML.US',
    'AMZN.US',
    'GOOG.US',
    'LLY.US',
    'WMT.US',
    'JPM.US',
    'V.US',
    'UNH.US',
    'XOM.US',
    'ORCL.US',
    'MA.US'
]

In [6]:
# Build a lookup from '<TICKER>.US' to the CIK stored under 'cik_str'.
# The JSON document is a mapping whose keys are ignored; each value carries a
# 'ticker' and a 'cik_str' field.
with open(tickers_path, 'r') as file:
    cik_by_ticker = {
        entry['ticker'] + '.US': entry['cik_str']
        for entry in json.load(file).values()
    }

In [7]:
# Attach each row's CIK via its ticker, discard incomplete rows, and store the
# CIK as an integer (the mapping step yields NaN for tickers without an entry,
# so the cast can only happen after those rows are gone).
# NOTE(review): dropna() is called without a subset, so a row with a missing
# value in *any* column is discarded, not only rows lacking a CIK — confirm
# that this is intended.
prices = (
    prices
    .assign(cik=prices['ticker'].map(cik_by_ticker))
    .dropna()
    .astype({'cik': int})
)

In [8]:
# Remove helper columns that are not needed downstream (ignored when absent),
# drop exact duplicate rows, and index the table by (cik, date).
#
# De-duplication happens BEFORE cik/date move into the index on purpose:
# DataFrame.drop_duplicates() compares column values only and ignores the
# index, so running it after set_index() would also delete rows belonging to
# different dates of the same ticker whenever their remaining columns happen
# to be identical (e.g. an unchanged price with equal volume on two days).
prices = (
    prices
    .drop(columns=['delta_date', 'daily_log_return'], errors='ignore')
    .drop_duplicates()
    .set_index(['cik', 'date'])
)

In [9]:
# Guarantee a unique index: when an index key occurs more than once, only its
# first row is retained.
prices = prices.loc[
    ~prices.index.duplicated(keep='first')
]

In [10]:
# Display the price table after the cleaning steps above (indexed by cik, date).
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,ticker,open,high,low,close,vol,return,log_close,log_return
cik,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1090872,1999-11-19,A.US,27.8972,27.9371,25.8613,26.239520,1.677358e+07,0.917694,3.267267,-0.085891
1090872,1999-11-22,A.US,26.8370,28.5858,26.0278,28.595501,7.242576e+06,1.089788,3.353249,0.085983
1090872,1999-11-23,A.US,27.6102,28.3377,25.9889,25.994932,6.579458e+06,0.909057,3.257902,-0.095348
1090872,1999-11-24,A.US,26.0637,27.2445,25.9889,26.682859,5.332648e+06,1.026464,3.284021,0.026120
1090872,1999-11-26,A.US,26.5569,26.9585,26.4752,26.769284,1.904229e+06,1.003239,3.287255,0.003234
...,...,...,...,...,...,...,...,...,...,...
846475,2024-05-24,ZYXI.US,10.2900,10.5200,10.2500,10.525519,8.810700e+04,1.022485,2.353803,0.022236
846475,2024-05-28,ZYXI.US,10.5300,10.5450,10.0100,10.110169,1.407030e+05,0.960539,2.313542,-0.040261
846475,2024-05-29,ZYXI.US,10.1100,10.1100,9.9300,10.029619,1.221660e+05,0.992033,2.305543,-0.007999
846475,2024-05-30,ZYXI.US,10.0500,10.1200,9.8800,9.920865,1.627040e+05,0.989157,2.294640,-0.010903


In [11]:
# Reshape the long table into a wide one: one row per date, one column per cik,
# holding that company's log_return. Moving the 'cik' index level into the
# columns gives the same frame as reset_index().pivot(index='date',
# columns='cik', values='log_return').
log_returns = prices['log_return'].unstack(level='cik')

In [12]:
# Restrict the sample to dates on or after 1 January 2000.
log_returns = log_returns.loc[
    log_returns.index >= datetime(2000, 1, 1)
]

In [13]:
# Column labels ordered from the smallest to the largest share of missing
# values (the mean of a boolean mask is the fraction of True entries).
least_missing = log_returns.isna().mean().sort_values().index

In [14]:
# Keep the 500 columns with the smallest share of missing values, then drop
# every date on which any of the kept columns is missing.
# The columns are selected by membership in the first 500 labels of
# least_missing rather than by dropping least_missing[500:]: dropping raises
# KeyError when this cell is executed a second time (the labels are already
# gone), whereas the selection is idempotent. The original column order of
# log_returns is preserved.
log_returns = log_returns.loc[
    :, log_returns.columns.isin(least_missing[:500])
].dropna()

In [15]:
# Standardise every return column: learn the scaling parameters on the full
# sample, then apply them to that same sample.
scaler = StandardScaler().fit(log_returns)
scaled_log_returns = scaler.transform(log_returns)

In [16]:
# Fit a PCA with the library's default settings on the standardised returns.
# pca_results is the transformed version of scaled_log_returns returned by the
# same call; the fitted estimator stays available as `pca`.
pca = PCA()
pca_results = pca.fit_transform(scaled_log_returns)

In [17]:
# Show the fitted component matrix as a DataFrame.
# NOTE(review): the axes carry positional labels only; presumably each row is
# one component and each column one input series — confirm, and consider
# labelling the columns with log_returns.columns.
pd.DataFrame(pca.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.042468,-0.035757,-0.017270,-0.054759,-0.041048,-0.058784,-0.055185,-0.041431,-0.057289,-0.051784,...,-0.054029,-0.049892,-0.045076,-0.054360,-0.050753,-0.037454,-0.051674,-0.041854,-0.043615,-0.018689
1,0.070531,-0.019216,0.030710,0.027363,0.153691,-0.025618,-0.012336,-0.022988,-0.028003,-0.014890,...,-0.017077,-0.049632,0.027024,-0.003420,-0.032091,-0.004103,0.019191,-0.026790,-0.056646,0.035760
2,0.046610,0.024916,-0.139325,-0.014786,-0.023666,0.014837,-0.006810,0.002705,-0.010047,0.030861,...,-0.009880,-0.082574,-0.006636,0.009686,-0.004966,0.033581,0.024966,0.025326,-0.146004,-0.133705
3,-0.047395,-0.095217,-0.074739,-0.035735,0.047827,0.059639,0.074791,0.071620,0.077361,-0.100270,...,0.003229,-0.019538,0.021779,-0.004649,0.008379,-0.080425,0.081059,-0.111573,-0.002755,-0.063308
4,0.047097,-0.021612,-0.006030,0.039832,0.013386,-0.012035,0.011417,0.001328,0.043251,-0.018059,...,-0.009211,0.016441,0.017859,-0.008908,-0.023256,-0.059018,-0.035176,-0.038703,0.039501,-0.018141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.019169,0.001584,-0.073200,-0.008073,-0.033313,-0.069611,0.029836,0.026348,-0.040086,0.054181,...,-0.001557,0.042120,-0.041382,-0.010290,-0.027596,-0.010409,-0.004234,0.004657,0.067726,0.020741
496,-0.010186,-0.004969,0.063043,0.007441,0.031779,-0.035225,0.030822,0.058642,-0.024436,0.069647,...,0.018110,0.010241,0.016685,0.049415,-0.012563,-0.011630,-0.003686,0.000551,-0.019338,-0.036507
497,-0.008815,-0.002554,0.032923,0.004377,-0.083672,-0.032543,-0.031171,0.022142,0.032332,0.011156,...,-0.036759,-0.015533,-0.015753,0.010823,0.001085,0.006509,-0.020709,-0.005667,-0.025771,-0.026866
498,0.009978,0.006064,-0.019321,0.000884,-0.072932,0.051102,-0.042973,0.002397,0.014896,-0.023507,...,-0.036578,-0.038564,0.025662,-0.020003,0.005404,-0.015483,0.000415,0.002921,0.021129,0.027737


In [18]:
# Show the transformed data returned by pca.fit_transform as a DataFrame.
# NOTE(review): the axes carry positional labels only; presumably each row is
# one date of log_returns and each column one component — confirm, and
# consider labelling the rows with log_returns.index.
pd.DataFrame(pca_results)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,5.184184,-1.889406,-0.618805,1.525649,-0.362282,1.203348,-1.889437,0.152979,-2.637133,-0.556005,...,0.482965,-0.148874,0.081910,0.467740,-0.068557,-0.114508,0.110155,0.330243,-0.228952,-0.016403
1,1.249674,0.274830,-2.593997,0.865094,0.212342,1.620428,-0.149892,-0.360259,0.839170,-1.230341,...,0.415650,0.264239,0.215831,0.100710,-0.029262,0.263044,-0.043095,0.115269,-0.232292,0.139131
2,-0.186701,-0.269139,-2.516690,2.438506,0.679993,2.302760,0.218378,-0.914040,-0.854078,0.802891,...,-0.098704,-0.126986,0.234999,-0.665097,0.083249,0.541536,-0.644050,0.162057,-0.282789,-0.139668
3,-1.894654,1.439447,2.492957,-1.009079,-0.456612,-0.072823,0.142823,3.142126,-1.662884,-0.929763,...,0.327235,-0.320743,-0.041043,0.082807,0.437285,-0.063018,0.107869,0.259046,-0.355474,0.065958
4,5.667705,-0.600537,-2.435502,1.740388,0.096593,-0.920988,-2.077921,-2.170854,-0.026517,-0.257503,...,-0.025651,0.078627,-0.120478,0.203738,-0.248641,-0.035137,0.474224,-0.252683,-0.045742,-0.068841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4052,-0.679558,-0.593990,0.854585,-1.481010,-0.104013,-1.260474,0.538218,1.285391,-4.117738,-0.551009,...,-0.448165,0.306819,-0.087494,0.376955,-0.092856,-0.174519,0.004985,-0.067509,-0.137679,-0.004192
4053,-2.970623,-4.458079,2.091004,-1.741603,-2.463599,2.648809,-0.987476,1.064339,1.355844,-1.020767,...,-0.408758,0.696482,0.131752,0.003066,0.014147,0.463013,0.224749,0.078729,-0.009563,-0.272834
4054,-2.219343,-2.129968,2.816400,-2.173345,-0.698223,-0.122671,0.170218,3.531528,1.029913,2.221214,...,-0.134187,-0.007148,0.091962,-0.280743,0.151360,-0.121401,0.042776,-0.116905,-0.276015,-0.519262
4055,-9.236993,-0.208522,-0.536263,2.897397,-0.549887,3.096932,-2.482306,-1.573741,3.083470,-0.881959,...,-0.372467,-0.186193,0.166920,-0.346438,0.102858,0.081131,0.171114,-0.014023,0.325137,-0.096902
