In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt 
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import alphalens as al
import warnings
import json

from functions.datareader import pull_stock_data, YFinanceReader
from functions.portfolio_optimizer import RiskModelPCA, OptimalHoldings, OptimalHoldingsStrictFactor
from functions.alpha_factor_evaluator import AlphaFactorEvaluator
from functions.indicators import TechnicalIndicators

warnings.filterwarnings('ignore')

In [2]:
# define all parameters
START_DATE = dt.date(2015, 1, 1)

<h1 style="color:turquoise">Load data from local</h1>

In [3]:
# sectors
with open('./keys/set_sectors.json') as f:
    sectors = json.load(f)

In [4]:
# price data
raw_df = pd.read_parquet('./data/set')

raw_df.set_index(pd.DatetimeIndex(raw_df.index))
raw_df.index.name = 'date'
raw_df.tail(2)

Unnamed: 0_level_0,ticker,close,dividends,high,low,open,stock splits,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-04-25,WHA,4.42,0.0,4.44,4.38,4.42,0.0,38961400.0
2023-04-26,WHA,4.42,0.0,4.44,4.34,4.42,0.0,50271318.0


In [5]:
# map sector using the sector dict above
vk = [(k, v) for k, v in sectors.items()]
sector_mapper = {sub: s[0]for s in vk for sub in s[1]}
raw_df['sector'] = raw_df['ticker'].map(sector_mapper)

In [6]:
# filter only stocks that exist since our start date
first_date_df = raw_df[['ticker']].reset_index().groupby('ticker').min()
first_date_df['is_available_since_start'] = first_date_df['date'].apply(lambda x: True if x <= START_DATE else False)

available_tickers = first_date_df[first_date_df['is_available_since_start'] == True].reset_index()['ticker']

# create a sector_mapper of available stocks
sector_mapper_available = {s:sector_mapper[s] for s in sector_mapper if s in available_tickers.values}

# filter only selected stocks
df = raw_df.reset_index().merge(available_tickers, left_on = 'ticker', right_on = 'ticker').set_index('date')

# create an OHLCV df in order to calculate technical indicators
ohlcv_df = df[['ticker', 'open', 'high', 'low', 'close', 'volume']]

# create a dataframe of close price
close_df = df[['ticker', 'sector', 'close']]

<h1 style="color:turquoise">Prepare data</h1>

<h3>Technical Indicators</h3>

In [7]:
ti_df = pd.DataFrame()

# * split multi-index dataframe to a dict of which each value represents OHLCV of each stock
for ticker in available_tickers.values:
    ticker_df = ohlcv_df[ohlcv_df['ticker'] == ticker].drop('ticker', axis = 1)
    
    ti = TechnicalIndicators(ticker_df)
    res_dict = dict()
    res_dict['rsi'] = ti.RSI(n = 14)
    res_dict['stochastic_rsi_k'], res_dict['stochastic_rsi_d'] = ti.stochasticRSI(n = 14, k = 3, d = 3)
    res_dict['macd'], res_dict['macd_signal'] = ti.MACD(n_long = 26, n_short = 12)
    res_dict['vol_change_pct'] = ti.volume_change_pct()
    res_dict['overnight_return'] = ti.overnight_return()
    res_dict['cv_ratio'] = ti.candlestick_volume_ratio()
    res_dict['bollinger_ratio'] = ti.bollinger_ratio(n = 20, k = 2)
    res_df = pd.DataFrame.from_dict(res_dict).reset_index()
    res_df.insert(1, 'ticker', ticker)

    ti_df = ti_df.append(res_df, ignore_index = True)

# * filter date
ti_df = ti_df[ti_df['date'].dt.date >= START_DATE] \
            .set_index(['date', 'ticker'])


In [8]:
ti_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rsi,stochastic_rsi_k,stochastic_rsi_d,macd,macd_signal,vol_change_pct,overnight_return,cv_ratio,bollinger_ratio
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-05,AEONTS,35.714196,0.075352,0.025117,-0.060835,0.584415,1.618357,-0.009216761,7e-06,0.110592
2015-01-06,AEONTS,33.333333,0.023444,0.032932,-0.179482,0.431635,0.568266,-0.009259088,1.4e-05,0.166329
2015-01-07,AEONTS,25.000061,0.0,0.032932,-0.239271,0.297454,-0.364706,0.004608197,1.4e-05,0.224731
2015-01-08,AEONTS,33.333469,0.156864,0.060103,-0.221141,0.193735,-0.183333,1.622811e-07,4.4e-05,0.331307
2015-01-09,AEONTS,33.333469,0.156864,0.104576,-0.204417,0.114105,1.60771,-0.009090909,1.4e-05,0.346131


<h3>Daily Return</h3>

In [9]:
daily_return_df = close_df.pivot(columns = 'ticker', values = 'close').pct_change().shift(1).mul(100)

# * filter date
daily_return_df = daily_return_df[daily_return_df.index.date >= START_DATE]

# * melt processed dataframe
daily_return_df = pd.melt(daily_return_df.reset_index(), id_vars='date', value_name='return', var_name='ticker')
daily_return_df.set_index(['date', 'ticker'], inplace=True)

In [10]:
# TODO: combine x and y
df = ti_df.merge(daily_return_df, left_index=True, right_index=True)

# * map sector
sector_inversed = {v:k for k in sectors for v in sectors[k]}
df['sector'] = df.index.get_level_values(1).map(sector_inversed)

In [11]:
# df.head()

<h2>One-Hot Encode Sector</h2>

In [12]:
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
# from autosklearn.regression import AutoSKlearnRegressor

In [13]:
ohe = OneHotEncoder()
ohe.fit(df[['sector']])
sector_dummy = pd.DataFrame(ohe.transform(df[['sector']]).toarray(), columns = ohe.get_feature_names_out(), index = df.index)

In [14]:
df = df.merge(sector_dummy, left_index=True, right_index=True).drop('sector', axis = 1)

# * create month and day of week factors
df['month'] = df.index.get_level_values(0).month
df['day_of_week'] = df.index.get_level_values(0).dayofweek

In [15]:
# # TODO: split data

TRAIN_SIZE = 0.8
dates = sorted(df.index.get_level_values(0))
n_data = len(dates)
train_split = int(TRAIN_SIZE * n_data)
train_dates = dates[:train_split]
test_dates = dates[train_split:]

x_train = df[df.index.isin(train_dates, level=0)]
x_test = df[df.index.isin(test_dates, level=0)]

y_train = x_train.pop('return')
y_test = x_test.pop('return')

In [16]:
# Create a LightGBM regressor object
lgb_model = LGBMRegressor()

# Specify the parameter grid to search over
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [3, 5, 7],
              'learning_rate': [0.01, 0.1, 1]}

# Create a time-series aware cross-validator
cv = TimeSeriesSplit(n_splits=3)

# Create a GridSearchCV object with the specified parameters
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=cv)

# Fit the grid search object to the data
grid_search.fit(x_train, y_train)


In [17]:
# # TODO: use the best param to train the entire training data
lgb_model = LGBMRegressor(**grid_search.best_params_)
lgb_model.fit(x_train, y_train)


In [18]:
# # TODO: get training and testing errors
y_train_pred = lgb_model.predict(x_train)
y_pred = lgb_model.predict(x_test)
mse_train = mean_squared_error(y_train_pred, y_train)
mse_test = mean_squared_error(y_pred, y_test)

print(f'MSE on training data: {mse_train:.4f}')
print(f'MSE on testing data: {mse_test:.4f}')

MSE on training data: 2.8295
MSE on testing data: 2.6568


In [19]:
# # * feature importance
importances = pd.Series(lgb_model.feature_importances_, index = lgb_model.feature_name_) \
                .sort_values(ascending=False)
importances

overnight_return                 880
vol_change_pct                   763
bollinger_ratio                  717
stochastic_rsi_d                 474
macd                             463
cv_ratio                         447
stochastic_rsi_k                 371
rsi                              339
macd_signal                      281
month                            214
day_of_week                      171
sector_consumer_discretionary     55
sector_hotel                      30
sector_consumer_finance           24
sector_banking                    22
sector_nan                        13
sector_real_estate                10
sector_material                    7
sector_petrochemistry              7
sector_infrastructure              5
sector_utilities                   5
sector_consumer_staple             4
sector_insurance                   3
sector_hospital                    2
sector_industrial                  1
dtype: int32