In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns
from scipy.stats import spearmanr
from datetime import datetime, timedelta
from functools import partial

In [None]:
# One yfinance Ticker handle per symbol in the analysis universe.
stocks = list(map(yf.Ticker, "MSFT GOOGL AAPL TSLA META NVDA INTC PYPL KO AMZN".split()))

Calculate the factor set for each stock, using prices from the trailing 12 months.

In [119]:
# Price window used below: it ends at the moment this cell runs (t1) and
# starts 365 days earlier (t0), so the window shifts on every re-run.
t1 = datetime.now()
t0 = t1 - timedelta(days=365)

def ffill_latest(d, quarterly):
    """Return the entry of ``quarterly`` with the latest label strictly before ``d``.

    The result does not depend on the order of ``quarterly.index``: the most
    recent earlier label is selected explicitly rather than taking the first
    earlier label encountered while iterating.

    Parameters
    ----------
    d
        Reference label (the notebook passes price dates); must be comparable
        with the labels in ``quarterly.index``.
    quarterly
        Label-indexed object supporting ``.index`` and ``.loc`` (the notebook
        passes one row of a quarterly statement).

    Returns
    -------
    ``quarterly.loc[label]`` for the greatest label ``< d``, or ``nan`` when
    no label precedes ``d`` (including an empty index).
    """
    labels = quarterly.index
    if len(labels) == 0:
        return float('nan')

    # Strict '<' is intentional: a label equal to d is not used for d itself.
    earlier = labels[labels < d]
    if len(earlier) == 0:
        return float('nan')

    return quarterly.loc[earlier.max()]

def mapper(data, df):
    """Map every label of ``df.index`` through ``ffill_latest`` against ``data``.

    Returns whatever ``df.index.map`` produces: one looked-up entry of ``data``
    per label of ``df.index``, in index order.
    """
    lookup = partial(ffill_latest, quarterly=data)
    return df.index.map(lookup)

def factors(tickers, start, end) -> dict:
    """Build a daily factor table for every ticker in ``tickers``.

    For each ticker, closing prices between ``start`` and ``end`` are combined
    with rows of the quarterly income statement, balance sheet and cash-flow
    tables. Each quarterly row is spread onto the daily price index through
    ``mapper``.

    Parameters
    ----------
    tickers : iterable
        Objects exposing ``history(start=, end=)``, ``quarterly_financials``,
        ``quarterly_balance_sheet``, ``quarterly_cashflow``, ``info`` and
        ``ticker`` (the notebook passes ``yf.Ticker`` instances).
    start, end
        Window bounds, forwarded unchanged to ``history``.

    Returns
    -------
    dict
        Maps ``t.ticker`` to a DataFrame indexed by the price dates with the
        columns ``Returns``, ``EPS``, ``P/E``, ``PEG``, ``P/S``, ``P/B``,
        ``D/E`` and ``P/FCF`` (in that order).

    Raises
    ------
    KeyError
        Expected when a statement row looked up with ``.loc`` below, or
        ``info["sharesOutstanding"]``, is missing for a ticker.
    """
    # Named `factor_frames` rather than `all` so the builtin is not shadowed.
    factor_frames = {}

    for t in tickers:
        price = t.history(start=start, end=end)["Close"]
        # Strip the timezone from the price dates (presumably so they compare
        # cleanly with the statement dates -- TODO confirm).
        price.index = price.index.tz_localize(None)

        inc = t.quarterly_financials
        bs = t.quarterly_balance_sheet
        cf = t.quarterly_cashflow

        df_factors = pd.DataFrame(index=price.index)
        # No dropna(): column assignment aligns on the index, so dropped rows
        # would come back as NaN anyway.
        df_factors["Returns"] = price.pct_change()
        mapped = partial(mapper, df=df_factors)

        ### value
        # earnings yield-P/E ratio
        # NOTE(review): EPS divides by info["sharesOutstanding"], while the
        # price ratios below use "Diluted Average Shares" -- confirm this is
        # the intended mix of share counts.
        df_factors["EPS"] = mapped(inc.loc["Net Income"]) / t.info["sharesOutstanding"]
        df_factors["P/E"] = price / df_factors["EPS"]
        # NOTE(review): this divides price (not P/E) by 1 + the row-over-row
        # change of the stepwise EPS column, which differs from the usual PEG
        # definition (P/E over EPS growth). Formula left unchanged; confirm intent.
        df_factors["PEG"] = (price / (df_factors["EPS"].pct_change() + 1)) / 100 # quarterly

        # price * diluted shares is shared by P/S, P/B and P/FCF; compute the
        # (Python-level, per-date) mapping once instead of three times.
        market_cap = price * mapped(inc.loc["Diluted Average Shares"])

        # p/s, p/b
        df_factors["P/S"] = market_cap / mapped(inc.loc["Total Revenue"])
        df_factors["P/B"] = market_cap / mapped(bs.loc["Tangible Book Value"])

        ### Quality
        # debt/equity
        df_factors["D/E"] = mapped(bs.loc["Total Debt"] / bs.loc["Common Stock Equity"])
        df_factors["P/FCF"] = market_cap / mapped(cf.loc["Free Cash Flow"])

        factor_frames[t.ticker] = df_factors

    return factor_frames
        
# Factor tables for every ticker in `stocks` over the [t0, t1] price window.
factorset = factors(tickers=stocks, start=t0, end=t1)

In [None]:
# Standardise the "Returns" column of every ticker's factor table and print
# the outcome. One scaler object is reused and fitted again on each iteration.
s = StandardScaler()
for k, v in factorset.items():
    # a: single-column frame holding "Returns"; b: every other factor column.
    a = v[["Returns"]]
    b = v.drop(columns=["Returns"])
    # NOTE(review): b is passed as `y`. scikit-learn documents `y` as ignored
    # by StandardScaler, so only `a` appears to be scaled here -- confirm
    # whether the factor columns in b were meant to be scaled instead.
    # `t` is overwritten each pass, so only the last ticker's result survives.
    t = s.fit_transform(X=a, y=b)
    
    # Prints the whole factor frame, the transformed array and its shape.
    print(v, t, t.shape)
    
    
# why normalise x data?

"First pass" Spearman correlation test