In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# own code library
from config.config import *
from config.dataprep_config import *
from plotting import *
from dataprep.preprocessors import *

In [2]:
# Project root used to build all data paths below.
# Can be overridden with the MT_DRL_ROOT environment variable so the notebook runs on other
# machines; falls back to the original local checkout so existing behaviour is unchanged.
abspath = os.environ.get(
    "MT_DRL_ROOT",
    r"C:\Users\Andy\PycharmProjects\finrlpaper2\MT-DRL-Pytorch",
)

# Preprocessing US stock dataset
Using intermediate dataset, where unfit company tics have already been removed.

In [3]:
# Load the intermediate US stock data set; the first CSV column is used as the index.
us_stocks_csv = os.path.join(abspath, "data", "intermediate", "US_stocks_WDB_a.csv")
df_us = pd.read_csv(us_stocks_csv, index_col=0)
df_us.head()

Unnamed: 0_level_0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,prcld,prcod,prcstd,trfd,log_prccd
gvkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1690,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,37.875,,3.0,1.083307,3.647406
1690,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,38.625,,3.0,1.083307,3.673131
1690,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,38.75,,3.0,1.083307,3.660351
1690,1,19950106,AAPL,APPLE INC,112.0,,9613000.0,,42.0,43.125,41.125,,3.0,1.083307,3.73767
1690,1,19950109,AAPL,APPLE INC,112.0,,2447000.0,,41.203,41.875,41.0,,3.0,1.083307,3.718511


#### Calculate: adjcp (adjusted closing price), open, high, low, volume

In [4]:
# Helper from preprocessors.py; it is handed a copy of the loaded frame.
price_volume_kwargs = dict(
    df=df_us.copy(),
    new_cols_subset=data_settings.NEW_COLS_SUBSET,
    target_subset=None,
)
df_us = calculate_price_volume_WhartonData(**price_volume_kwargs)
df_us.head(3)

Unnamed: 0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,prcld,prcod,prcstd,trfd,log_prccd,adjcp,open,high,low,volume
0,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,37.875,,3.0,1.083307,3.647406,0.342634,,0.347098,0.33817,927400.0
1,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,38.625,,3.0,1.083307,3.673131,0.351562,,0.353795,0.344866,1416800.0
2,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,38.75,,3.0,1.083307,3.660351,0.347098,,0.351562,0.345982,657500.0


#### Calculate: technical indicators


In [5]:
# note: this usually takes up to 5 min on my laptop
technical_indicators = ["macd", "rsi_21", "cci_21", "dx_21"]
df_us = add_technical_indicator_with_StockStats(
    df=df_us,
    technical_indicators_list=technical_indicators,
)
df_us.head(3)

Unnamed: 0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,...,log_prccd,adjcp,open,high,low,volume,macd,rsi_21,cci_21,dx_21
0,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,...,3.647406,0.342634,,0.347098,0.33817,927400.0,0.0,,,
1,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,...,3.673131,0.351562,,0.353795,0.344866,1416800.0,0.0002,100.0,66.666667,100.0
2,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,...,3.660351,0.347098,,0.351562,0.345982,657500.0,0.000117,65.57377,28.571429,100.0


#### Calculate: other features, such as trading volume, volatility, return

In [6]:
# Add 7-day return volatility plus daily and daily log returns.
# The window is only relevant for volatility (returns are always daily here); min_periods is
# the number of observations required in the window, otherwise NaN is produced.
vola_window_7d = 7
df_us = add_other_features(
    df=df_us,
    features=["returns_volatility", "return_daily", "log_return_daily"],
    window_days_vola=vola_window_7d,
    min_periods_vola=vola_window_7d,
    price_colum=data_settings.MAIN_PRICE_COLUMN,
    asset_name_column=data_settings.ASSET_NAME_COLUMN,
)
# Tag the generic volatility column with its window so further windows can be added later.
df_us = df_us.rename(columns={"returns_volatility": "ret_vola_7d"})

In [7]:
# First rows of the 7-day volatility: the leading entries are NaN until a full window exists.
print(df_us["ret_vola_7d"].head(10))
# Average number of NaN volatility values per ticker. The ticker count is derived from the
# data instead of being hard-coded (it was 29 when this notebook was written).
df_us["ret_vola_7d"].isna().sum() / df_us["tic"].nunique()

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7    0.045942
8    0.047910
9    0.047445
Name: ret_vola_7d, dtype: float64


7.0

In [8]:
# Add 21-day return volatility (21 trading days in a month, usually).
vola_window_21d = 21
df_us = add_other_features(
    df=df_us,
    features=["returns_volatility"],
    window_days_vola=vola_window_21d,
    min_periods_vola=vola_window_21d,
    price_colum=data_settings.MAIN_PRICE_COLUMN,
    asset_name_column=data_settings.ASSET_NAME_COLUMN,
)
df_us = df_us.rename(columns={"returns_volatility": "ret_vola_21d"})

In [9]:
# Average number of NaN 21-day volatility values per ticker.
# The former bare `df_us["ret_vola_21d"].head(100)` line was removed: it was not the last
# expression of the cell, so its result was never displayed.
# The ticker count is derived from the data instead of the hard-coded 29.
df_us["ret_vola_21d"].isna().sum() / df_us["tic"].nunique()

21.0

In [None]:
# Add 63-day return volatility (63 trading days in a quarter, usually).
vola_window_63d = 63
df_us = add_other_features(
    df=df_us,
    features=["returns_volatility"],
    window_days_vola=vola_window_63d,
    min_periods_vola=vola_window_63d,
    price_colum=data_settings.MAIN_PRICE_COLUMN,
    asset_name_column=data_settings.ASSET_NAME_COLUMN,
)
df_us = df_us.rename(columns={"returns_volatility": "ret_vola_63d"})

In [None]:
# Inspect the first 100 rows now that the feature columns have been added.
df_us.head(100)

In [None]:
# Order rows by date, then by ticker within each date, and look at the last adjusted closes.
df_us = df_us.sort_values(by=["datadate", "tic"])
df_us["adjcp"].tail()

In [None]:
# Show date, ticker and adjusted close for all rows dated 20210611 or later.
### TODO: remove Nan, especially after date 14.6.2021: df[df["datadate"]>=20210614] (last load was on 13.6.2021)
from_last_fetch = df_us["datadate"] >= 20210611
df_us.loc[from_last_fetch, ["datadate", "tic", "adjcp"]]

In [None]:
# removing data that goes beyond the latest fetching date (20210611)
up_to_last_fetch = df_us["datadate"] <= 20210611
df_us = df_us.loc[up_to_last_fetch]

In [None]:
# Re-check the last adjusted closes after trimming to the latest fetching date.
df_us["adjcp"].tail()

In [None]:
# List all available columns before selecting the relevant subset.
df_us.columns

In [None]:
# Columns kept for the preprocessed data set (order is preserved in the output frame).
relevant_cols = [
    "datadate", "tic", "eps",
    "adjcp", "open", "high", "low", "volume",
    "macd", "rsi_21", "cci_21", "dx_21",
    "ret_vola_7d", "return_daily", "log_return_daily", "ret_vola_21d", "ret_vola_63d",
]

In [None]:
# Restrict the frame to the selected columns (all rows kept).
df_us = df_us.loc[:, relevant_cols]
df_us.head()

In [None]:
print(f"dataset length: {len(df_us)}")
# Missing values per column for rows from 2000 on.
# eps and open have many missing values, even from 2000 on, hence we drop them
rows_from_2000 = df_us.loc[df_us["datadate"] >= 20000101]
rows_from_2000.isna().sum()

In [None]:
# Drop the two columns with many missing values.
# Reassign instead of dropping in place: df_us was produced by a column selection, and an
# in-place drop on such a frame may be flagged by pandas with a SettingWithCopyWarning.
df_us = df_us.drop(columns=["eps", "open"])

In [None]:
########TODO
# Keep only rows dated 20000101 or later.
from_2000 = df_us["datadate"] >= 20000101
df_us = df_us.loc[from_2000]

In [None]:
# Missing values per remaining column for rows dated 20000101 or later.
df_us[df_us.datadate>=20000101].isna().sum()

In [None]:
# Rows with a missing adjusted close.
# Can be looked up in another data bank or imputed, since only one value per ticker is missing.
# stock: KO, NKE
df_us.loc[df_us["adjcp"].isna()]

In [None]:
# Rows with a missing high price.
# Can be looked up in another data bank or imputed, since only one value per ticker is missing.
# stock: KO, NKE
df_us.loc[df_us["high"].isna()]

In [None]:
# Rows with a missing low price.
# Can be looked up in another data bank or imputed, since only one value per ticker is missing.
# stock: KO, NKE
df_us.loc[df_us["low"].isna()]

In [None]:
# Rows with a missing volume.
# Can be looked up in another data bank or imputed, since only one value per ticker is missing.
# stock: KO, NKE
df_us.loc[df_us["volume"].isna()]

In [None]:
# Rows with a missing daily log return.
# Can be looked up in another data bank or imputed, since only one value per ticker is missing.
# stock: KO, NKE
df_us.loc[df_us["log_return_daily"].isna()]

In [None]:
### Let us drop KO and NKE, since they have missing values and we already have a lot of stocks in the portfolio
# anyways so it might be better to reduce the number of stocks
tickers_to_drop = ["KO", "NKE"]
keep_row = ~df_us["tic"].isin(tickers_to_drop)
df_us = df_us.loc[keep_row]
# Remaining tickers after the removal.
df_us["tic"].unique()

In [None]:
# Final per-column count of missing values for rows dated 20000101 or later.
df_us[df_us.datadate>=20000101].isna().sum() # no missing values anymore, hence we can save it

#### VIX (Volatility index)

In [None]:
# add volatility index (raw VIX file; first CSV column becomes the index)
vix_csv = os.path.join(abspath, "data", "raw", "VIX.csv")
vix = pd.read_csv(vix_csv, index_col=0)
vix

In [None]:
# Count missing values per VIX column.
# luckily, there are no missing values
vix.isna().sum()

In [None]:
# Plot the VIX adjusted close.
vix_adj_close = vix["Adj Close"]
plt.plot(vix_adj_close)
plt.show()
#plt.savefig("vix_adjclose") # in case it doesn't show, just save and open
plt.clf() # clear the current figure so subsequent plots are not appended to it

In [None]:
# Plot the one-step relative change of the VIX adjusted close.
#np.log(df.price) - np.log(df.price.shift(1))
plt.plot(vix["Adj Close"].pct_change(1))
# Save BEFORE show(): once show() has rendered the figure, a following savefig() may write
# an empty image instead of the plot.
plt.savefig("vix_change") # in case it doesn't show, just save and open
plt.show()
plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsequent plots

# we see that the change in VIX is not a good proxy for the VIX value, 
# because there are no distinct peaks during crisis in VIX change, while the VIX value peaks
# but we should still somehow bring this value down a little so it is not so large compared to the other values

In [None]:
# Candidate rescalings of the VIX adjusted close, using a rolling window of 21 rows.
adj_close = vix["Adj Close"]
rolling_21 = adj_close.rolling(window=21)
d = rolling_21.mean()   # rolling mean (name kept: `d` is inspected in a later cell)
st = rolling_21.std()   # rolling standard deviation
vix["AdjStdroll21"] = adj_close / st
vix["AdjMeanroll21"] = adj_close / d
vix["Adj100"] = adj_close / 100

In [None]:
# Note: after plotting all of the above, it becomes apparent that the best way to bring down the VIX to 
# decimals while not changing the nature of the time series is simply dividing by 100; then
# the vix is varying between 0.1 and 0.8 (latter in crisis times)

In [None]:
#plt.plot(vix["Adj100"])
#plt.show()
#plt.savefig("vix_adj100") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsequent plots

In [None]:
# let's plot it
#np.log(df.price) - np.log(df.price.shift(1))
#plt.plot(vix["AdjMeanroll21"])
#plt.show()
#plt.savefig("vix_stdadj") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

In [None]:
# Keep only the VIX / 100 series, as a one-column DataFrame with the same index.
vix = vix["Adj100"].to_frame()
vix

In [None]:
# Sanity check that vix is a DataFrame (not a Series) after the reduction.
type(vix)

In [None]:
# The VIX dates are strings in month/day/year form. The stock data set stores `datadate` as a
# number of the form YYYYMMDD (it is compared against ints such as 20210611 in this notebook),
# so convert the VIX dates to that same integer representation to keep the two compatible.
vix = vix.reset_index()
vix["Date"] = pd.to_datetime(vix["Date"], format='%m/%d/%Y')
# strftime alone would leave strings; cast to int so the date dtype matches the stock data.
vix["Date"] = vix["Date"].dt.strftime('%Y%m%d').astype(int)
vix["Date"]

In [None]:
# now we rename the columns (positional: date column first, scaled VIX second)
vix_column_names = ["datadate", "adjDiv100"]
vix.columns = vix_column_names
vix.head(3)

In [None]:
# now we want to merge this data set with our US data set

In [None]:
# Write the preprocessed US stock data to the "preprocessed" data folder.
preprocessed_csv = os.path.join(abspath, "data", "preprocessed", "US_stocks_WDB_try.csv")
df_us.to_csv(preprocessed_csv)

In [None]:
# Display the prepared VIX frame.
vix

In [None]:
# First ten entries of `d`, the 21-row rolling mean of the VIX adjusted close.
d[0:10]

In [None]:
# NOTE(review): this cell repeats an earlier cell of this notebook. In a top-to-bottom run
# `vix` has already been reduced to the columns `datadate` / `adjDiv100`, so the
# "Adj Close" lookup presumably fails here -- confirm and consider deleting the cell.
#np.log(df.price) - np.log(df.price.shift(1))
plt.plot(vix["Adj Close"].pct_change(1))
# Save BEFORE show(): once show() has rendered the figure, a following savefig() may write
# an empty image instead of the plot.
plt.savefig("vix_change") # in case it doesn't show, just save and open
plt.show()
plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsequent plots

# we see that the change in VIX is not a good proxy for the VIX value, 
# because there are no distinct peaks during crisis in VIX change, while the VIX value peaks
# but we should still somehow bring this value down a little so it is not so large compared to the other values

In [None]:
# NOTE(review): repeats an earlier cell. In a top-to-bottom run `vix` no longer has an
# "Adj Close" column at this point (it was reduced and renamed earlier), so this presumably
# raises a KeyError -- confirm and consider deleting the cell.
# Rolling mean / std over 21 rows and three candidate rescalings of the adjusted close.
d = vix["Adj Close"].rolling(window=21).mean()
st = vix["Adj Close"].rolling(window=21).std()
vix["AdjStdroll21"] = vix["Adj Close"] / st
vix["AdjMeanroll21"] = vix["Adj Close"] / d
vix["Adj100"] = vix["Adj Close"] /100

In [None]:
# Note: after plotting all of the above, it becomes apparent that the best way to bring down the VIX to 
# decimals while not changing the nature of the time series is simply dividing by 100; then
# the vix is varying between 0.1 and 0.8 (latter in crisis times)

In [None]:
#plt.plot(vix["Adj100"])
#plt.show()
#plt.savefig("vix_adj100") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsequent plots

In [None]:
# let's plot it
#np.log(df.price) - np.log(df.price.shift(1))
#plt.plot(vix["AdjMeanroll21"])
#plt.show()
#plt.savefig("vix_stdadj") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

In [None]:
# NOTE(review): repeats an earlier cell. The column was renamed to `adjDiv100` earlier in the
# notebook, so `vix["Adj100"]` presumably no longer exists in a top-to-bottom run -- confirm.
vix = pd.DataFrame(vix["Adj100"])
vix

In [None]:
# Repeated sanity check of the type of vix (same expression as in an earlier cell).
type(vix)

In [None]:
# NOTE(review): repeats an earlier cell. The `Date` column was already renamed to `datadate`
# earlier in the notebook, so `vix["Date"]` presumably fails in a top-to-bottom run -- confirm.
# need to convert string dates to datetime format to be compatible with the format we have in 
# the other data set
vix = vix.reset_index()
vix["Date"] = pd.to_datetime(vix["Date"], format='%m/%d/%Y')
vix["Date"] = vix["Date"].dt.strftime('%Y%m%d')
vix["Date"]

In [None]:
# NOTE(review): repeats an earlier cell; `vix` already carries these column names after the
# earlier rename. If the column count differs at this point the assignment raises -- verify.
# now we rename the columns
vix.columns = ["datadate", "adjDiv100"]
vix.head(3)

In [None]:
# now we want to merge this data set with our US data set

In [None]:
# NOTE(review): same call as in an earlier cell; it writes df_us to the same file again.
df_us.to_csv(os.path.join(abspath, "data", "preprocessed", "US_stocks_WDB_try.csv"))

In [None]:
# Display the VIX frame (same expression as in an earlier cell).
vix

In [None]:
# First ten entries of `d` (same expression as in an earlier cell).
d[0:10]