In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import itertools
import os
import time
import datetime as dt 
import yfinance as yf
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns

from functions.datareader import pull_stock_data

<h1 style="color:orange">Load data using YFinance and save to local</h1>

In [18]:
# source for yfinance: https://pypi.org/project/yfinance/
start = dt.date(2015, 1, 1)
end = dt.date(2022, 2, 1)

# * parameters
ANNUALIZATION_FACTOR = 252

In [11]:
sectors = {
    'banking': ['BBL', 'KBANK', 'SCB', 'BAY', 'TISCO', 'KTB', 'KKP', 'TTB'],
    'hospital': ['BDMS', 'BCH', 'BH', 'THG', 'PR9', 'EKH', 'IMH'],
    'infrastructure': ['AOT', 'BAFS', 'BTS', 'BEM', 'DMT'],
    'consumer_staple': ['BJC', 'OR', 'CPALL', 'CRC', 'MAJOR', 'GFPT', 'CPF', 'MAKRO', 'M'],
    'consumer_discretionary': ['COM7', 'CPW', 'SYNEX'],
    'technology': ['BBIK', 'IIG', 'BE8'],
    'industrial': ['WHA', 'AMATA'],
    'material': ['SCGP', 'HMPRO', 'GLOBAL', 'DOHOME', 'TPIPL', 'STGT', 'THMUI'],
    'utilities': ['RATCH', 'BGRIM', 'GULF', 'TPIPP', 'EGCO', 'EA', 'BANPU', 'ACE'],
    'petrochemistry': ['PTTGC', 'BCP', 'IRPC', 'IVL'],
    'real_estate': ['SIRI', 'QH', 'AP', 'SPALI', 'ORI', 'LALIN'],
    'hotel': ['MINT', 'CENTEL', 'ERW', 'AWC', 'SPA'],
    'consumer_finance': ['SAWAD', 'KTC', 'AEONTS', 'TIDLOR', 'MTC'],
    'insurance': ['BLA']
}

all_tickers = sectors.values()
all_tickers = [v + '.BK' for s in all_tickers for v in s]

In [8]:
# all_meta = yf.Tickers(all_tickers)
# all_price_df = all_meta.history(period = 'max')
# all_price_df.tail(1)

In [109]:
# # Price
# parent_dir = './data/set'
# for t in all_tickers:
#     t_trim = t.replace('.BK', '')
#     ticker_dir = f'{parent_dir}/{t_trim}'
#     if not os.path.exists(ticker_dir):
#         os.mkdir(ticker_dir)
    
#     ticker_cols = [c for c in all_price_df.columns if c[1] == t]
#     ticker_df = all_price_df[ticker_cols].dropna(axis = 0)
#     ticker_df.columns = [c[0].lower() for c in ticker_df.columns]
#     ticker_df.insert(0, 'ticker', t_trim)
#     ticker_df.index.name = 'date'

#     price_dir = f'{ticker_dir}/price'
#     if not os.path.exists(price_dir):
#         os.mkdir(price_dir)
#     years = sorted(list(set(ticker_df.index.year)))
#     for y in years:
#         year_df = ticker_df[ticker_df.index.year == y]
#         year_df.to_parquet(f'{price_dir}/{str(y)}.parquet')

<h1 style="color:salmon">Load data from local and use only close price</h1>

In [2]:
raw_df = pd.read_parquet('./data/set')

# convert index from datetime to date
raw_df.index = raw_df.index.date
raw_df.index.name = 'date'

raw_df.tail()

Unnamed: 0_level_0,ticker,close,dividends,high,low,open,stock splits,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-17,WHA,3.86,0.0,3.86,3.82,3.84,0.0,30699900.0
2023-02-20,WHA,3.86,0.0,3.88,3.84,3.86,0.0,15987800.0
2023-02-21,WHA,4.0,0.0,4.0,3.9,3.9,0.0,326198100.0
2023-02-22,WHA,3.94,0.0,4.0,3.92,3.96,0.0,136125400.0
2023-02-23,WHA,3.94,0.0,3.96,3.9,3.94,0.0,52493989.0


In [78]:
# map sector using the sector dict above
vk = [(k, v) for k, v in sectors.items()]
sector_mapper = {sub: s[0]for s in vk for sub in s[1]}
raw_df['sector'] = raw_df['ticker'].map(sector_mapper)


<p>We'll define a starting period of our analysis. Therefore we'll need to know whether there are stocks of which the daily price data doesn't exist yet in that period.</p>

In [89]:
start_date = dt.date(2015, 1, 1)

first_date_df = raw_df[['ticker']].reset_index().groupby('ticker').min()
first_date_df['is_available_since_start'] = first_date_df['date'].apply(lambda x: True if x <= start_date else False)
# first_date_df.head()

available_tickers = first_date_df[first_date_df['is_available_since_start'] == True].reset_index()['ticker']

# create a sector_mapper of available stocks
sector_mapper_available = {s:sector_mapper[s] for s in sector_mapper if s in available_tickers.values}

In [14]:
# filter only selected stocks
df = raw_df.reset_index().merge(available_tickers, left_on = 'ticker', right_on = 'ticker').set_index('date')
df.head()

Unnamed: 0_level_0,ticker,close,dividends,high,low,open,stock splits,volume,sector
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-12-11,AEONTS,4.823158,0.0,5.126979,4.405404,4.481359,0.0,78469000.0,consumer_finance
2001-12-12,AEONTS,4.709227,0.0,5.013048,4.709227,4.937093,0.0,31076500.0,consumer_finance
2001-12-13,AEONTS,4.557315,0.0,4.785181,4.557315,4.747203,0.0,11258000.0,consumer_finance
2001-12-14,AEONTS,4.823158,0.0,4.899113,4.443382,4.557315,0.0,19788500.0,consumer_finance
2001-12-17,AEONTS,4.823158,0.0,4.899113,4.823158,4.899113,0.0,3423000.0,consumer_finance


In [15]:
close_df = df[['ticker', 'sector', 'close']]
close_df.head()

Unnamed: 0_level_0,ticker,sector,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-12-11,AEONTS,consumer_finance,4.823158
2001-12-12,AEONTS,consumer_finance,4.709227
2001-12-13,AEONTS,consumer_finance,4.557315
2001-12-14,AEONTS,consumer_finance,4.823158
2001-12-17,AEONTS,consumer_finance,4.823158


In [69]:
# calculate daily return 
close_df_pivot = close_df.reset_index().pivot(index = 'date', columns = 'ticker', values = 'close')
daily_return_df = close_df_pivot.pct_change()
daily_return_df = daily_return_df[daily_return_df.index >= start_date]
daily_return_df.head(2)

ticker,AEONTS,AMATA,AOT,AP,BAFS,BANPU,BAY,BBL,BCH,BCP,...,RATCH,SAWAD,SIRI,SPA,SPALI,SYNEX,TISCO,TPIPL,TTB,WHA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-05,-0.004608,0.025478,-0.031915,0.0,-0.024,-0.012048,-0.050279,-0.020619,0.018987,-0.015748,...,0.012766,-0.012048,-0.029412,0.004902,0.0,0.0,0.017647,0.023392,-0.034247,-0.015873
2015-01-06,0.00463,0.0,-0.003663,-0.008403,0.016394,0.00813,-0.005882,-0.002631,0.012422,-0.008,...,-0.012605,0.01626,-0.006061,0.019512,-0.016598,0.007092,0.00578,0.057143,-0.014184,-0.032258


In [70]:
# annualized calculcate covariance matrix
return_covmat = daily_return_df.cov().mul(ANNUALIZATION_FACTOR)
return_covmat.tail(2)

ticker,AEONTS,AMATA,AOT,AP,BAFS,BANPU,BAY,BBL,BCH,BCP,...,RATCH,SAWAD,SIRI,SPA,SPALI,SYNEX,TISCO,TPIPL,TTB,WHA
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TTB,0.031732,0.038884,0.035766,0.030306,0.024133,0.034683,0.033979,0.042454,0.020531,0.034032,...,0.016915,0.036495,0.033885,0.036185,0.024121,0.02843,0.026887,0.026025,0.097691,0.035513
WHA,0.036842,0.067245,0.035203,0.03092,0.025327,0.037163,0.028645,0.028221,0.024452,0.032616,...,0.022144,0.048281,0.038888,0.037035,0.026746,0.035365,0.021732,0.030092,0.035513,0.108748


In [71]:
# sns.heatmap(return_covmat)

<h2 style="color:cream">Create Factors</h2>

In [72]:
# Average return across stock universe
daily_average_return_df = daily_return_df.mean(axis = 'columns').to_frame()
daily_average_return_df.columns = ['avg_return']

# Median return across stock universe
daily_median_return_df = daily_return_df.median(axis = 'columns').to_frame()
daily_median_return_df.columns = ['median_return']

In [92]:
# 1 year momemtum (1 year return)
return_1yr = close_df_pivot.pct_change(periods = 252).dropna(axis = 0)

# demean by sector
momemtum_demean = None

for stocks in sectors.values():
    sector_stocks = [s for s in stocks if s in return_1yr.columns]
    sector_return = return_1yr[sector_stocks]
    sector_avg = sector_return.mean(axis = 1)
    sector_demean = sector_return - np.broadcast_to(sector_avg.values.reshape(-1, 1), sector_return.shape)
    momemtum_demean = sector_return if momemtum_demean is None else momemtum_demean.merge(sector_return, left_index = True, right_index = True)

# rank 
momentum_rank = momemtum_demean.rank(axis = 1, method = 'max')
momentum_rank.head()

# z-score
momentum_factor_df = momentum_rank.apply(zscore, axis = 1)

momentum_factor_df.head(3)

ticker,BBL,KBANK,BAY,TISCO,KTB,KKP,TTB,BDMS,BCH,BH,...,LALIN,MINT,CENTEL,ERW,SPA,SAWAD,KTC,AEONTS,MTC,BLA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-09,-0.919239,-1.555635,-1.484924,0.070711,-1.272792,-0.353553,-1.131371,0.919239,-0.070711,1.343503,...,-0.212132,0.989949,1.272792,-0.636396,1.626346,1.697056,1.414214,-0.424264,1.555635,0.494975
2015-12-11,-1.06066,-1.626346,-1.414214,0.212132,-1.272792,-0.141421,-1.131371,0.848528,0.070711,1.414214,...,-0.282843,0.777817,1.343503,-0.424264,1.626346,1.697056,1.272792,-0.070711,1.555635,0.141421
2015-12-14,-1.131371,-1.484924,-1.414214,-0.070711,-1.272792,-0.424264,-1.06066,0.989949,0.141421,1.414214,...,-0.353553,0.777817,1.343503,-0.707107,1.626346,1.697056,1.272792,0.0,1.555635,0.070711


In [93]:
# Mean reversion factor (-1 * moving average)
N_DATE = 5
SMOOTH = False

ma_df = close_df.rolling(N_DATE).mean().mul(-1).dropna(axis = 0)

  ma_df = close_df.rolling(N_DATE).mean().mul(-1).dropna(axis = 0)


<h1 style="color:Orange">Example of Factor Returns on Multiple Stocks</h1>

In [94]:
def get_factor_exposure(factor_return, asset_return, return_residual:bool = False):
    lr = LinearRegression()
    lr.fit(factor_return, asset_return)
    if return_residual:
        return lr.coef_, lr.intercept_
    else:
        return lr.coef_

In [113]:
# use average daily return and median daily return to predict AOT prices 
test_tickers = ['AOT', 'KBANK', 'SIRI']
selected_return_df = daily_return_df[test_tickers].reset_index(drop = True)

# use 2-days lagging of return (keep the date index separately since we'll use it in visualization)
N_DATE_DELAYED = 2

selected_return_df = selected_return_df.iloc[N_DATE_DELAYED:].reset_index(drop = True)
return_dates = daily_return_df.index[N_DATE_DELAYED:]

factor_dates = daily_average_return_df.index[:-N_DATE_DELAYED]
factor_df = daily_average_return_df.merge(daily_median_return_df, left_index = True, right_index = True)
factor_df = factor_df.iloc[:-N_DATE_DELAYED]

In [115]:
factor_df.head()

Unnamed: 0_level_0,avg_return,median_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-05,-0.009533,-0.012048
2015-01-06,1e-05,-0.003663
2015-01-07,0.014169,0.010554
2015-01-08,0.014865,0.012048
2015-01-09,0.005102,0.0


In [116]:
assert selected_return_df.shape[0] == factor_df.shape[0], 'Number of rows do not match'

In [117]:
# fit OLS
factor_names = factor_df.columns
factor_exposures = []
# specific_returns = []
for ticker in test_tickers:
    single_return_df = selected_return_df[ticker]
    factor_exposure, specific_return = get_factor_exposure(factor_df, single_return_df, return_residual = True)
    factor_exposures.append(factor_exposure)
    # specific_returns.append(specific_return)
factor_exposure_df = pd.DataFrame(factor_exposures, columns = factor_names, index = selected_return_df.columns)

In [118]:
factor_exposure_df

Unnamed: 0_level_0,avg_return,median_return
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
AOT,-0.004079,0.021017
KBANK,0.115004,-0.077341
SIRI,0.161802,-0.101381


In [119]:
res = []
for ticker in test_tickers:
    ticker_exposure = factor_exposure_df.loc[ticker]
    ticker_common_return_df = (factor_df * ticker_exposure).sum(axis = 1)
    res.append(ticker_common_return_df)
# common_return_df = pd.DataFrame(res, columns = return_dates, index = factor_exposure_df.index).T
common_return_df = pd.DataFrame(res).T
common_return_df.columns = factor_exposure_df.index 
common_return_df.index = return_dates

In [120]:
common_return_df.head()

ticker,AOT,KBANK,SIRI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-07,-0.000214,-0.000165,-0.000321
2015-01-08,-7.7e-05,0.000284,0.000373
2015-01-09,0.000164,0.000813,0.001223
2015-01-12,0.000193,0.000778,0.001184
2015-01-13,-2.1e-05,0.000587,0.000825


In [67]:
# visualize common return
# common_return_df.plot(figsize = (8, 2), alpha = 0.5)

In [133]:
# create a dict of stock-wise dataframe containing common and specific returns
ticker_returns = dict()
specific_variances = []
for ticker in test_tickers:
    tmp_common = common_return_df[[ticker]]
    tmp_real = daily_return_df[[ticker]]
    tmp_ticker_return = tmp_real.merge(tmp_common, left_index = True, right_index = True, how = 'inner')
    tmp_ticker_return.columns = ['return', 'common_return']
    tmp_ticker_return['specific_return'] = tmp_ticker_return.apply(lambda row: row['return'] - row['common_return'], axis = 1)
    
    # TODO: calculate specific variance of return of each stock
    ticker_specific_variance = np.var(tmp_ticker_return['specific_return'], ddof = 1)
    specific_variances.append(ticker_specific_variance)

    ticker_returns[ticker] = tmp_ticker_return

# TODO: convert specific variance list into a diagonal matrix
specific_variances = np.diag(specific_variances)

In [129]:
ticker_returns['AOT']

Unnamed: 0_level_0,return,common_return,specific_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-07,0.040441,-0.000214,0.040655
2015-01-08,0.021202,-0.000077,0.021279
2015-01-09,0.006920,0.000164,0.006756
2015-01-12,0.000000,0.000193,-0.000193
2015-01-13,-0.003436,-0.000021,-0.003416
...,...,...,...
2023-02-17,-0.003436,-0.000083,-0.003354
2023-02-20,0.006897,0.000006,0.006890
2023-02-21,0.000000,-0.000088,0.000088
2023-02-22,-0.013699,0.000105,-0.013804


<h2 style="color:yellow">Variance of each stock</h2>

In [130]:
# calculate covariance matrix of factor returns
factor_covmat = np.cov(factor_df.values[:,0], factor_df.values[:,1], ddof = 1)

In [131]:
factor_covmat

array([[1.08257436e-04, 9.57075443e-05],
       [9.57075443e-05, 9.02027252e-05]])

In [132]:
factor_exposure_df

Unnamed: 0_level_0,avg_return,median_return
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
AOT,-0.004079,0.021017
KBANK,0.115004,-0.077341
SIRI,0.161802,-0.101381


In [134]:
specific_variances

array([[0.00028358, 0.        , 0.        ],
       [0.        , 0.00035327, 0.        ],
       [0.        , 0.        , 0.00045288]])

In [137]:
# calculate covariance matrix of stocks (BFB + S)
ticker_covmat = (factor_exposure_df.values @ factor_covmat @ factor_exposure_df.T.values) + specific_variances

In [138]:
ticker_covmat

array([[2.83602777e-04, 6.41147483e-08, 1.01391989e-07],
       [6.41147483e-08, 3.53541708e-04, 4.08156744e-07],
       [1.01391989e-07, 4.08156744e-07, 4.53505142e-04]])