In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import itertools
import os
import time
import datetime as dt 
from pandas_datareader import data as web
import yfinance as yf
from scipy.stats import zscore

from functions.datareader import pull_stock_data

<h1 style="color:orange">Load data using YFinance and save to local</h1>

In [3]:
# source for yfinance: https://pypi.org/project/yfinance/

In [40]:
start = dt.date(2015, 1, 1)
end = dt.date(2022, 2, 1)

# meta = yf.Tickers(['BBL.BK', 'KBANK.BK'])

In [41]:
# price_df = meta.history(period = 'MAX').sort_index()

In [42]:
# price_df[[c for c in price_df.columns if c[1] == 'BBL.BK']].tail(20)

In [4]:
sectors = {
    'banking': ['BBL', 'KBANK', 'SCB', 'BAY', 'TISCO', 'KTB', 'KKP', 'TTB'],
    'hospital': ['BDMS', 'BCH', 'BH', 'THG', 'PR9', 'EKH', 'IMH'],
    'infrastructure': ['AOT', 'BAFS', 'BTS', 'BEM', 'DMT'],
    'consumer_staple': ['BJC', 'OR', 'CPALL', 'CRC', 'MAJOR', 'GFPT', 'CPF', 'MAKRO', 'M'],
    'consumer_discretionary': ['COM7', 'CPW', 'SYNEX'],
    'technology': ['BBIK', 'IIG', 'BE8'],
    'industrial': ['WHA', 'AMATA'],
    'material': ['SCGP', 'HMPRO', 'GLOBAL', 'DOHOME', 'TPIPL', 'STGT', 'THMUI'],
    'utilities': ['RATCH', 'BGRIM', 'GULF', 'TPIPP', 'EGCO', 'EA', 'BANPU', 'ACE'],
    'petrochemistry': ['PTTGC', 'BCP', 'IRPC', 'IVL'],
    'real_estate': ['SIRI', 'QH', 'AP', 'SPALI', 'ORI', 'LALIN'],
    'hotel': ['MINT', 'CENTEL', 'ERW', 'AWC', 'SPA'],
    'consumer_finance': ['SAWAD', 'KTC', 'AEONTS', 'TIDLOR', 'MTC'],
    'insurance': ['BLA']
}

all_tickers = sectors.values()
all_tickers = [v + '.BK' for s in all_tickers for v in s]

In [5]:
all_meta = yf.Tickers(all_tickers)

In [6]:
all_price_df = all_meta.history(period = 'max')

[*********************100%***********************]  73 of 73 completed


In [7]:
all_price_df.tail(1)

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,ACE.BK,AEONTS.BK,AMATA.BK,AOT.BK,AP.BK,AWC.BK,BAFS.BK,BANPU.BK,BAY.BK,BBIK.BK,...,STGT.BK,SYNEX.BK,THG.BK,THMUI.BK,TIDLOR.BK,TISCO.BK,TPIPL.BK,TPIPP.BK,TTB.BK,WHA.BK
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-02-28,2.48,198.0,20.200001,72.75,12.5,6.0,32.25,11.1,29.5,116.0,...,2360432.0,335861.0,2624536.0,48300.0,12772126.0,648246.0,1741100.0,958856.0,80484277.0,28374603.0


In [9]:
# Price
parent_dir = './data/set'
for t in all_tickers:
    t_trim = t.replace('.BK', '')
    ticker_dir = f'{parent_dir}/{t_trim}'
    if not os.path.exists(ticker_dir):
        os.mkdir(ticker_dir)
    
    ticker_cols = [c for c in all_price_df.columns if c[1] == t]
    ticker_df = all_price_df[ticker_cols].dropna(axis = 0)
    ticker_df.columns = [c[0].lower() for c in ticker_df.columns]
    ticker_df.insert(0, 'ticker', t_trim)
    ticker_df.index.name = 'date'

    price_dir = f'{ticker_dir}/price'
    if not os.path.exists(price_dir):
        os.mkdir(price_dir)
    years = sorted(list(set(ticker_df.index.year)))
    for y in years:
        year_df = ticker_df[ticker_df.index.year == y]
        year_df.to_parquet(f'{price_dir}/{str(y)}.parquet')

In [2]:
# # to plot
# year_start = 2016
# # tickers = {'BBL': 'blue', 'KBANK': 'green', 'KKP': 'purple'}
# tickers = {'BGRIM': 'green', 'GULF': 'red'}
# plt.figure(figsize = (10, 3))
# for t in tickers:
#     ticker_df = all_df[all_df['ticker'] == t]['Close']

#     # ? filter year
#     ticker_df = ticker_df[ticker_df.index.year >= year_start]

#     first_value = ticker_df.iloc[0]
#     # ? normalize prices
#     ticker_df = ticker_df.div(first_value)
#     ticker_df.plot(color = tickers[t], label = t)
# plt.legend(loc = 'upper right')

<h1 style="color:salmon">Load data from local and use only close price</h1>

In [10]:
raw_df = pd.read_parquet('./data/set')
raw_df.tail()

Unnamed: 0_level_0,ticker,close,dividends,high,low,open,stock splits,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-22,WHA,3.94,0.0,4.0,3.92,3.96,0.0,136125400.0
2023-02-23,WHA,3.94,0.0,3.96,3.9,3.94,0.0,52493400.0
2023-02-24,WHA,3.98,0.0,4.02,3.94,4.0,0.0,181460000.0
2023-02-27,WHA,4.06,0.0,4.08,3.94,4.0,0.0,145975100.0
2023-02-28,WHA,4.04,0.0,4.06,4.02,4.06,0.0,28374603.0


In [11]:
# map sector using the sector dict above
vk = [(k, v) for k, v in sectors.items()]
vk[0]
sector_mapper = {sub: s[0]for s in vk for sub in s[1]}
# sector_mapper

raw_df['sector'] = raw_df['ticker'].map(sector_mapper)


<p>We'll define a starting period of our analysis. Therefore we'll need to know whether there are stocks of which the daily price data doesn't exist yet in that period.</p>

In [12]:
start_date = dt.date(2015, 1, 1)

first_date_df = raw_df[['ticker']].reset_index().groupby('ticker').min()
first_date_df['is_available_since_start'] = first_date_df['date'].apply(lambda x: True if x.date() <= start_date else False)

available_tickers = first_date_df[first_date_df['is_available_since_start'] == True].reset_index()['ticker']

In [37]:
# filter only selected stocks
df = raw_df.reset_index().merge(available_tickers, left_on = 'ticker', right_on = 'ticker').set_index('date')
df.head()

Unnamed: 0_level_0,ticker,close,dividends,high,low,open,stock splits,volume,sector
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-12-11,AEONTS,4.823158,0.0,5.126979,4.405405,4.48136,0.0,78469000.0,consumer_finance
2001-12-12,AEONTS,4.709226,0.0,5.013047,4.709226,4.937092,0.0,31076500.0,consumer_finance
2001-12-13,AEONTS,4.557314,0.0,4.78518,4.557314,4.747202,0.0,11258000.0,consumer_finance
2001-12-14,AEONTS,4.823158,0.0,4.899113,4.443382,4.557315,0.0,19788500.0,consumer_finance
2001-12-17,AEONTS,4.823158,0.0,4.899113,4.823158,4.899113,0.0,3423000.0,consumer_finance


In [38]:
open_df = df[['ticker', 'sector', 'open']]
close_df = df[['ticker', 'sector', 'close']]
# close_df.head()

<h2 style="color:cream">Create Factors</h2>

In [39]:
# pivot tickers to columns
price_df = close_df.reset_index().pivot(index = 'date', columns = ['sector', 'ticker'], values = 'close') \
                .sort_index()

In [40]:
# 1 year momemtum (1 year return)
return_1yr = price_df.pct_change(periods = 252).dropna(axis = 0)

# demean by sector
momemtum_demean = None

for s in sectors:
    sector_return = return_1yr[[c for c in return_1yr.columns if c[0] == s]]
    sector_avg = sector_return.mean(axis = 1)
    sector_demean = sector_return - np.broadcast_to(sector_avg.values.reshape(-1, 1), sector_return.shape)
    momemtum_demean = sector_return if momemtum_demean is None else momemtum_demean.merge(sector_return, left_index = True, right_index = True)

# rank 
momentum_rank = momemtum_demean.rank(axis = 1, method = 'max')
momentum_rank.head()

# z-score
momentum_factor_df = momentum_rank.apply(zscore, axis = 1)

momentum_factor_df.head(3)

sector,banking,banking,banking,banking,banking,banking,banking,hospital,hospital,hospital,...,real_estate,hotel,hotel,hotel,hotel,consumer_finance,consumer_finance,consumer_finance,consumer_finance,insurance
ticker,BAY,BBL,KBANK,KKP,KTB,TISCO,TTB,BCH,BDMS,BH,...,SPALI,CENTEL,ERW,MINT,SPA,AEONTS,KTC,MTC,SAWAD,BLA
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-12-09,-1.484924,-0.919239,-1.555635,-0.353553,-1.272792,0.070711,-1.131371,-0.070711,0.919239,1.343503,...,-1.202082,1.272792,-0.636396,0.989949,1.626346,-0.424264,1.414214,1.555635,1.697056,0.494975
2015-12-11,-1.414214,-1.06066,-1.626346,-0.141421,-1.272792,0.212132,-1.131371,0.070711,0.848528,1.414214,...,-1.202082,1.343503,-0.424264,0.777817,1.626346,-0.070711,1.272792,1.555635,1.697056,0.141421
2015-12-14,-1.414214,-1.131371,-1.484924,-0.424264,-1.272792,-0.070711,-1.06066,0.141421,0.989949,1.414214,...,-1.202082,1.343503,-0.707107,0.777817,1.626346,0.0,1.272792,1.555635,1.697056,0.070711


In [41]:
# Mean reversion factor (-1 * moving average)
N_DATE = 5
SMOOTH = False

ma_df = price_df.rolling(N_DATE).mean().mul(-1).dropna(axis = 0)

In [42]:
ma_df.head(3)

sector,consumer_finance,industrial,infrastructure,real_estate,infrastructure,utilities,banking,banking,hospital,petrochemistry,...,utilities,consumer_finance,real_estate,hotel,real_estate,consumer_discretionary,banking,material,banking,industrial
ticker,AEONTS,AMATA,AOT,AP,BAFS,BANPU,BAY,BBL,BCH,BCP,...,RATCH,SAWAD,SIRI,SPA,SPALI,SYNEX,TISCO,TPIPL,TTB,WHA
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2014-12-02,-85.29404,-11.949672,-24.946983,-4.651302,-21.684379,-14.417991,-38.609628,-152.678726,-7.53234,-24.359682,...,-44.470298,-15.25214,-1.090397,-2.762472,-17.47804,-1.885349,-27.37234,-1.539469,-2.249652,-2.269846
2014-12-03,-85.373309,-12.057605,-25.435431,-4.67928,-21.623296,-14.365275,-38.487189,-152.375793,-7.507316,-24.015619,...,-44.470298,-15.336328,-1.093786,-2.830837,-17.674792,-1.882842,-27.342555,-1.542741,-2.255326,-2.260466
2014-12-04,-86.166003,-12.165537,-25.869606,-4.700263,-21.592755,-14.2862,-38.405564,-151.997125,-7.482292,-23.705962,...,-44.434229,-15.490673,-1.092656,-2.878946,-17.773167,-1.882842,-27.431911,-1.555829,-2.252489,-2.24796


In [43]:
# Overnight return
# shift close date 1 day forward
open_df_pivot = close_df.reset_index().pivot(index = 'date', columns = ['sector', 'ticker'], values = 'close') \
                .sort_index()
