In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# own code library
from config.config import *
from config.dataprep_config import *
from plotting import *
from dataprep.preprocessors import *

In [2]:
abspath = r"C:\Users\Andy\PycharmProjects\finrlpaper2\MT-DRL-Pytorch"

# Preprocessing US stock dataset
Using intermediate dataset, where unfit company tics have already been removed.

In [3]:
df_us = pd.read_csv(os.path.join(abspath, "data", "intermediate", "US_stocks_WDB_a.csv"), index_col=0)
df_us.head()

Unnamed: 0_level_0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,prcld,prcod,prcstd,trfd,log_prccd
gvkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1690,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,37.875,,3.0,1.083307,3.647406
1690,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,38.625,,3.0,1.083307,3.673131
1690,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,38.75,,3.0,1.083307,3.660351
1690,1,19950106,AAPL,APPLE INC,112.0,,9613000.0,,42.0,43.125,41.125,,3.0,1.083307,3.73767
1690,1,19950109,AAPL,APPLE INC,112.0,,2447000.0,,41.203,41.875,41.0,,3.0,1.083307,3.718511


#### Calculate: adjcp (adjusted clising price), open, high, low, volume

In [4]:
# function from preprocessors.py
df_us = calculate_price_volume_WhartonData(df=df_us.copy(), 
                                   new_cols_subset=data_settings.NEW_COLS_SUBSET, 
                                   target_subset=None)
df_us.head(3)

Unnamed: 0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,prcld,prcod,prcstd,trfd,log_prccd,adjcp,open,high,low,volume
0,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,37.875,,3.0,1.083307,3.647406,0.342634,,0.347098,0.33817,927400.0
1,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,38.625,,3.0,1.083307,3.673131,0.351562,,0.353795,0.344866,1416800.0
2,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,38.75,,3.0,1.083307,3.660351,0.347098,,0.351562,0.345982,657500.0


#### Calculate: technical indicators


In [5]:
# note: this usually takes up to 2 min
df_us = add_technical_indicator_with_StockStats(df=df_us, 
                                                technical_indicators_list=["macd", "rsi_21", "cci_21", 
                                                                           "dx_21"])
df_us.head(3)

Unnamed: 0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,...,log_prccd,adjcp,open,high,low,volume,macd,rsi_21,cci_21,dx_21
0,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,...,3.647406,0.342634,,0.347098,0.33817,927400.0,0.0,,,
1,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,...,3.673131,0.351562,,0.353795,0.344866,1416800.0,0.0002,100.0,66.666667,100.0
2,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,...,3.660351,0.347098,,0.351562,0.345982,657500.0,0.000117,65.57377,28.571429,100.0


#### Calculate: other features, such as trading volume, volatility, return

In [6]:
df_us = add_other_features(df=df_us,
                       features=["returns_volatility", "return_daily", "log_return_daily", "obv"],
                       window_days_vola=7, # window is only relevant for volatility, as returns are always daily here
                       min_periods_vola=7, # min periods to be in window for calculatiom, otherwise NaN calculated
                       price_colum=data_settings.MAIN_PRICE_COLUMN,
                       asset_name_column=data_settings.ASSET_NAME_COLUMN)
df_us.rename(columns={"returns_volatility":"ret_vola_7d"}, inplace=True)

In [7]:
print(df_us["ret_vola_7d"].head(10))
df_us["ret_vola_7d"].isna().sum()/29 # for each of the 29 tickers

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7    0.045942
8    0.047910
9    0.047445
Name: ret_vola_7d, dtype: float64


7.0

In [8]:
df_us = add_other_features(df=df_us,
                       features=["returns_volatility"],
                       window_days_vola=21, # 21 trading days in a month, usually
                       min_periods_vola=21,
                       price_colum=data_settings.MAIN_PRICE_COLUMN,
                       asset_name_column=data_settings.ASSET_NAME_COLUMN)
df_us.rename(columns={"returns_volatility":"ret_vola_21d"}, inplace=True)

In [9]:
df_us["ret_vola_21d"].head(100)
df_us["ret_vola_21d"].isna().sum()/29 

21.0

In [10]:
df_us = add_other_features(df=df_us,
                       features=["returns_volatility"],
                       window_days_vola=63, # 63 trading days in a quarter, usually
                       min_periods_vola=63,
                       price_colum=data_settings.MAIN_PRICE_COLUMN,
                       asset_name_column=data_settings.ASSET_NAME_COLUMN)
df_us.rename(columns={"returns_volatility":"ret_vola_63d"}, inplace=True)

In [11]:
df_us.head(100)

Unnamed: 0,iid,datadate,tic,conm,ajexdi,cshoc,cshtrd,eps,prccd,prchd,...,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,obv,ret_vola_21d,ret_vola_63d
0,1,19950103,AAPL,APPLE INC,112.0,,927400.0,,38.375,38.875,...,0.000000,,,,,,,0.0,,
1,1,19950104,AAPL,APPLE INC,112.0,,1416800.0,,39.375,39.625,...,0.000200,100.000000,66.666667,100.000000,,0.026059,0.025725,1416800.0,,
2,1,19950105,AAPL,APPLE INC,112.0,,657500.0,,38.875,39.375,...,0.000117,65.573770,28.571429,100.000000,,-0.012698,-0.012780,759300.0,,
3,1,19950106,AAPL,APPLE INC,112.0,,9613000.0,,42.000,43.125,...,0.001083,89.437284,133.333333,100.000000,,0.080386,0.077318,10372300.0,,
4,1,19950109,AAPL,APPLE INC,112.0,,2447000.0,,41.203,41.875,...,0.001303,75.434550,65.755057,93.898833,,-0.018976,-0.019159,7925300.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,19950518,AAPL,APPLE INC,112.0,,3318000.0,,43.375,44.125,...,0.014343,63.981126,121.843410,52.221790,0.026084,-0.014205,-0.014306,25153200.0,0.021654,0.027465
96,1,19950519,AAPL,APPLE INC,112.0,,2880000.0,,42.750,43.750,...,0.013883,61.141003,92.554106,37.625478,0.027391,-0.014409,-0.014514,22273200.0,0.021371,0.027451
97,1,19950522,AAPL,APPLE INC,112.0,,3320000.0,,44.125,44.125,...,0.014344,64.755053,96.623298,37.625478,0.028153,0.032164,0.031657,25593200.0,0.020825,0.027349
98,1,19950523,AAPL,APPLE INC,112.0,,2470000.0,,43.875,44.375,...,0.014364,63.625363,98.678855,40.391891,0.015869,-0.005666,-0.005682,23123200.0,0.020886,0.027353


In [12]:
df_us["obv"]

0                 0.0
1           1416800.0
2            759300.0
3          10372300.0
4           7925300.0
             ...     
191993   -476900192.0
191994   -447960572.0
191995   -419001412.0
191996   -391526802.0
191997   -409144412.0
Name: obv, Length: 191998, dtype: float64

In [13]:
df_us.sort_values(by=["datadate", "tic"], inplace=True)
df_us["adjcp"].tail()

19974    NaN
39950    NaN
98779    NaN
172023   NaN
33291    NaN
Name: adjcp, dtype: float64

In [14]:
df_us.loc[df_us["datadate"]>=20210611, ["datadate", "tic", "adjcp"]]
### TODO: remove Nan, especially after date 14.6.2021: df[df["datadate"]>=20210614] (last load was on 13.6.2021)

Unnamed: 0,datadate,tic,adjcp
6657,20210611,AAPL,127.35
13315,20210611,AMGN,242.77
19973,20210611,AXP,164.51
26632,20210611,BA,247.28
33290,20210611,CAT,220.7
39949,20210611,CSCO,54.77
46608,20210611,CVX,107.91
53266,20210611,DIS,177.38
58830,20210611,GS,378.05
65488,20210611,HD,310.77


In [15]:
# removing data that goes beyond the latest fetching date (20210611)
df_us = df_us[df_us["datadate"]<=20210611]

In [16]:
df_us["adjcp"].tail()

165363    397.89
172022     57.33
178681     55.31
185339    140.75
191997     62.17
Name: adjcp, dtype: float64

In [17]:
df_us.columns

Index(['iid', 'datadate', 'tic', 'conm', 'ajexdi', 'cshoc', 'cshtrd', 'eps',
       'prccd', 'prchd', 'prcld', 'prcod', 'prcstd', 'trfd', 'log_prccd',
       'adjcp', 'open', 'high', 'low', 'volume', 'macd', 'rsi_21', 'cci_21',
       'dx_21', 'ret_vola_7d', 'return_daily', 'log_return_daily', 'obv',
       'ret_vola_21d', 'ret_vola_63d'],
      dtype='object')

In [18]:
relevant_cols = ['datadate', 'tic','eps','adjcp', 'open', 'high', 'low', 'volume', 'macd', 'rsi_21', 'cci_21',
                 'dx_21', 'ret_vola_7d', 'return_daily', 'log_return_daily','ret_vola_21d', 'ret_vola_63d',
                "obv"]

In [19]:
df_us = df_us[relevant_cols]
df_us.head()

Unnamed: 0,datadate,tic,eps,adjcp,open,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
0,19950103,AAPL,,0.342634,,0.347098,0.33817,927400.0,0.0,,,,,,,,,0.0
6658,19950103,AMGN,,7.28125,,7.390625,7.25,938700.0,0.0,,,,,,,,,0.0
13316,19950103,AXP,,9.75,,9.833333,9.666667,1321700.0,0.0,,,,,,,,,0.0
19975,19950103,BA,,23.375,,23.5625,23.0625,649600.0,0.0,,,,,,,,,0.0
26633,19950103,CAT,,13.75,,13.78125,13.65625,531100.0,0.0,,,,,,,,,0.0


In [20]:
print(f"dataset length: {len(df_us)}")
df_us[df_us.datadate>=20000101].isna().sum() #eps and open have many missing values, even from 2000 on
# hence we drop them

dataset length: 191990


datadate                0
tic                     0
eps                   307
adjcp                   2
open                33874
high                    2
low                     2
volume                  2
macd                    0
rsi_21                  0
cci_21                  2
dx_21                   0
ret_vola_7d             0
return_daily            0
log_return_daily        4
ret_vola_21d            0
ret_vola_63d            0
obv                     0
dtype: int64

In [21]:
df_us.drop(columns=["eps", "open"], inplace=True)

In [22]:
df_us = df_us[df_us.datadate>=20000101]

In [23]:
df_us[df_us.datadate>=20000101].isna().sum()

datadate            0
tic                 0
adjcp               2
high                2
low                 2
volume              2
macd                0
rsi_21              0
cci_21              2
dx_21               0
ret_vola_7d         0
return_daily        0
log_return_daily    4
ret_vola_21d        0
ret_vola_63d        0
obv                 0
dtype: int64

In [24]:
df_us[df_us['adjcp'].isna()] # can look up in another data bank or impute, since only one value per ticker missing
# stock: KO, NKE

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
100469,20010912,KO,,,,,0.505423,62.198522,,47.669003,0.01271,0.0,,0.013867,0.013617,18816970.0
133762,20010913,NKE,,,,,0.06956,51.964271,,9.947278,0.015348,0.0,,0.01457,0.020281,-58070000.0


In [25]:
df_us[df_us["high"].isna()] # can lok up in another data bank or impute, since only one value per ticker missing
# stock: KO, NKE

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
100469,20010912,KO,,,,,0.505423,62.198522,,47.669003,0.01271,0.0,,0.013867,0.013617,18816970.0
133762,20010913,NKE,,,,,0.06956,51.964271,,9.947278,0.015348,0.0,,0.01457,0.020281,-58070000.0


In [26]:
df_us[df_us["low"].isna()] # can lok up in another data bank or impute, since only one value per ticker missing
# stock: KO, NKE

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
100469,20010912,KO,,,,,0.505423,62.198522,,47.669003,0.01271,0.0,,0.013867,0.013617,18816970.0
133762,20010913,NKE,,,,,0.06956,51.964271,,9.947278,0.015348,0.0,,0.01457,0.020281,-58070000.0


In [27]:
df_us[df_us["volume"].isna()] # can lok up in another data bank or impute, since only one value per ticker missing
# stock: KO, NKE

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
100469,20010912,KO,,,,,0.505423,62.198522,,47.669003,0.01271,0.0,,0.013867,0.013617,18816970.0
133762,20010913,NKE,,,,,0.06956,51.964271,,9.947278,0.015348,0.0,,0.01457,0.020281,-58070000.0


In [28]:
df_us[df_us["log_return_daily"].isna()] # can lok up in another data bank or impute, since only one value per ticker missing
# stock: KO, NKE

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
100469,20010912,KO,,,,,0.505423,62.198522,,47.669003,0.01271,0.0,,0.013867,0.013617,18816970.0
133762,20010913,NKE,,,,,0.06956,51.964271,,9.947278,0.015348,0.0,,0.01457,0.020281,-58070000.0
100470,20010917,KO,25.1,25.1,24.355,10128300.0,0.520251,62.198522,86.400327,47.669003,0.012444,0.005005,,0.013863,0.013619,18816970.0
133763,20010917,NKE,5.50625,5.6875,5.49625,2371700.0,-0.003167,51.964271,-317.938509,9.947278,0.03696,-0.094553,,0.025059,0.023625,-58070000.0


In [29]:
### Let us drop KO and NKE, since they have missing values and we already have a lot of stocks in the portfolio
# anyways so it might be better to reduce the number of stocks
df_us = df_us[~df_us["tic"].isin(["KO", "NKE"])]
df_us.tic.unique()

array(['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DIS', 'GS',
       'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'MCD', 'MMM', 'MRK',
       'MSFT', 'PFE', 'PG', 'RTX', 'UNH', 'VZ', 'WBA', 'WMT', 'XOM'],
      dtype=object)

In [30]:
df_us[df_us.datadate>=20000101].isna().sum() # no missing values anymore, hence we can save it

datadate            0
tic                 0
adjcp               0
high                0
low                 0
volume              0
macd                0
rsi_21              0
cci_21              0
dx_21               0
ret_vola_7d         0
return_daily        0
log_return_daily    0
ret_vola_21d        0
ret_vola_63d        0
obv                 0
dtype: int64

#### VIX (Volatility index)

In [31]:
# add volatility index
vix = pd.read_csv(os.path.join(abspath, "data", "raw", "VIX.csv"), index_col = 0)
vix

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1/3/2000,24.360001,26.150000,23.980000,24.209999,24.209999,0
1/4/2000,24.940001,27.180000,24.799999,27.010000,27.010000,0
1/5/2000,27.980000,29.000000,25.850000,26.410000,26.410000,0
1/6/2000,26.680000,26.709999,24.700001,25.730000,25.730000,0
1/7/2000,25.139999,25.170000,21.719999,21.719999,21.719999,0
...,...,...,...,...,...,...
6/4/2021,18.090000,18.420000,16.180000,16.420000,16.420000,0
6/7/2021,17.340000,17.350000,15.780000,16.420000,16.420000,0
6/8/2021,16.580000,17.750000,15.150000,17.070000,17.070000,0
6/9/2021,17.180000,17.959999,15.550000,17.889999,17.889999,0


In [32]:
# luckily, there are no missing values
vix.isna().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [33]:
# let's plot it
plt.plot(vix["Adj Close"])
plt.show()
#plt.savefig("vix_adjclose") # in case it doesn't show, just save and open
plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
# let's plot it
#np.log(df.price) - np.log(df.price.shift(1))
plt.plot(vix["Adj Close"].pct_change(1))
plt.show()
plt.savefig("vix_change") # in case it doesn't show, just save and open
plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

# we see that the change in VIX is not a good proxy for the VIX value, 
# because there are no distinct peaks during crisis in VIX change, while the VIX value peaks
# but we should still somehow bring this value down a little so it is not so large compared to the other values

  after removing the cwd from sys.path.


In [35]:
d = vix["Adj Close"].rolling(window=21).mean()
st = vix["Adj Close"].rolling(window=21).std()
vix["AdjStdroll21"] = vix["Adj Close"] / st
vix["AdjMeanroll21"] = vix["Adj Close"] / d
vix["Adj100"] = vix["Adj Close"] /100

In [36]:
# Note: after plotting all of the above, it becomes apparent that the best way to bring down the VIX to 
# decimals while not changing the nature of the time series is simply dividing by 100; then
# the vix is varying between 0.1 and 0.8 (latter in crisis times)

In [37]:
#plt.plot(vix["Ad1001"])
#plt.show()
#plt.savefig("vix_adj100") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

In [38]:
# let's plot it
#np.log(df.price) - np.log(df.price.shift(1))
#plt.plot(vix["AdjMeanroll21"])
#plt.show()
#plt.savefig("vix_stdadj") # in case it doesn't show, just save and open
#plt.clf() # tell matplotlib we are done with this plot so it doesn't append subsqeuent plots

In [39]:
vix = pd.DataFrame(vix["Adj100"])
vix

Unnamed: 0_level_0,Adj100
Date,Unnamed: 1_level_1
1/3/2000,0.2421
1/4/2000,0.2701
1/5/2000,0.2641
1/6/2000,0.2573
1/7/2000,0.2172
...,...
6/4/2021,0.1642
6/7/2021,0.1642
6/8/2021,0.1707
6/9/2021,0.1789


In [40]:
type(vix)

pandas.core.frame.DataFrame

In [41]:
# need to convert string dates to datetime format to be compatible with the format we have in 
# the other data set
vix = vix.reset_index()
vix["Date"] = pd.to_datetime(vix["Date"], format='%m/%d/%Y')
vix["Date"] = vix["Date"].dt.strftime('%Y%m%d')
vix["Date"] 

0       20000103
1       20000104
2       20000105
3       20000106
4       20000107
          ...   
5389    20210604
5390    20210607
5391    20210608
5392    20210609
5393    20210610
Name: Date, Length: 5394, dtype: object

In [42]:
# now we renamce the columns
vix.columns = ["datadate", "adjDiv100"]
vix.head(3)

Unnamed: 0,datadate,adjDiv100
0,20000103,0.2421
1,20000104,0.2701
2,20000105,0.2641


In [43]:
vix["datadate"] = vix["datadate"].astype(int)

In [44]:
vix["datadate"].dtype
#df_us["datadate"].dtype

dtype('int32')

In [45]:
vix # note: vix is only available until 20210610, not 20210611 like us dataset, hence we
# will need to remove the last date later after merging

Unnamed: 0,datadate,adjDiv100
0,20000103,0.2421
1,20000104,0.2701
2,20000105,0.2641
3,20000106,0.2573
4,20000107,0.2172
...,...,...
5389,20210604,0.1642
5390,20210607,0.1642
5391,20210608,0.1707
5392,20210609,0.1789


In [46]:
df_us

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv
1263,20000103,AAPL,0.999442,1.004464,0.907924,4783300.0,0.020676,62.689835,71.109369,32.884996,0.041159,0.088754,0.085034,0.037706,0.039918,443302500.0
7921,20000103,AMGN,62.937500,70.000000,62.875000,22916100.0,4.433505,69.256746,162.560084,51.622364,0.055611,0.047867,0.046756,0.040388,0.033829,342177900.0
14579,20000103,AXP,52.416667,54.833333,52.000000,1887700.0,0.758946,51.210306,-13.428284,7.875716,0.027691,-0.054135,-0.055656,0.021863,0.024413,149873200.0
21238,20000103,BA,40.562500,41.687500,39.812500,2637100.0,0.091544,49.165297,67.240227,4.977704,0.020398,-0.021116,-0.021342,0.021983,0.023994,-60797600.0
27896,20000103,CAT,24.312500,24.500000,23.843750,2527500.0,-0.376621,50.137377,167.696352,16.830546,0.019601,0.033201,0.032661,0.023378,0.027229,-13391900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165363,20210611,UNH,397.890000,403.360000,394.780000,3094284.0,-0.085413,48.286149,-171.230261,25.750828,0.006814,-0.008967,-0.009007,0.005814,0.010855,359265417.0
172022,20210611,VZ,57.330000,57.550000,57.010000,12923530.0,-0.130330,50.142868,23.438693,5.985309,0.003599,-0.000174,-0.000174,0.005581,0.007572,50929261.0
178681,20210611,WBA,55.310000,55.820000,54.810000,3936324.0,0.214953,56.055460,96.937514,8.610903,0.016722,0.000000,0.000000,0.015847,0.015266,-39050502.0
185339,20210611,WMT,140.750000,140.850000,139.860000,8408446.0,0.120465,51.691318,-36.375217,6.314197,0.005872,0.006220,0.006200,0.007404,0.008832,634509393.0


In [47]:
# now we want to merge this data set with our US data set
merged = df_us.merge(vix, how='left', left_on='datadate', right_on='datadate')
merged

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv,adjDiv100
0,20000103,AAPL,0.999442,1.004464,0.907924,4783300.0,0.020676,62.689835,71.109369,32.884996,0.041159,0.088754,0.085034,0.037706,0.039918,443302500.0,0.2421
1,20000103,AMGN,62.937500,70.000000,62.875000,22916100.0,4.433505,69.256746,162.560084,51.622364,0.055611,0.047867,0.046756,0.040388,0.033829,342177900.0,0.2421
2,20000103,AXP,52.416667,54.833333,52.000000,1887700.0,0.758946,51.210306,-13.428284,7.875716,0.027691,-0.054135,-0.055656,0.021863,0.024413,149873200.0,0.2421
3,20000103,BA,40.562500,41.687500,39.812500,2637100.0,0.091544,49.165297,67.240227,4.977704,0.020398,-0.021116,-0.021342,0.021983,0.023994,-60797600.0,0.2421
4,20000103,CAT,24.312500,24.500000,23.843750,2527500.0,-0.376621,50.137377,167.696352,16.830546,0.019601,0.033201,0.032661,0.023378,0.027229,-13391900.0,0.2421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145660,20210611,UNH,397.890000,403.360000,394.780000,3094284.0,-0.085413,48.286149,-171.230261,25.750828,0.006814,-0.008967,-0.009007,0.005814,0.010855,359265417.0,
145661,20210611,VZ,57.330000,57.550000,57.010000,12923530.0,-0.130330,50.142868,23.438693,5.985309,0.003599,-0.000174,-0.000174,0.005581,0.007572,50929261.0,
145662,20210611,WBA,55.310000,55.820000,54.810000,3936324.0,0.214953,56.055460,96.937514,8.610903,0.016722,0.000000,0.000000,0.015847,0.015266,-39050502.0,
145663,20210611,WMT,140.750000,140.850000,139.860000,8408446.0,0.120465,51.691318,-36.375217,6.314197,0.005872,0.006220,0.006200,0.007404,0.008832,634509393.0,


In [48]:
# inspect if it has merged correctly
print(vix["adjDiv100"][vix["datadate"] == 20010103])
print(merged["adjDiv100"][merged["datadate"] == 20010103])

253    0.266
Name: adjDiv100, dtype: float64
6831    0.266
6832    0.266
6833    0.266
6834    0.266
6835    0.266
6836    0.266
6837    0.266
6838    0.266
6839    0.266
6840    0.266
6841    0.266
6842    0.266
6843    0.266
6844    0.266
6845    0.266
6846    0.266
6847    0.266
6848    0.266
6849    0.266
6850    0.266
6851    0.266
6852    0.266
6853    0.266
6854    0.266
6855    0.266
6856    0.266
6857    0.266
Name: adjDiv100, dtype: float64


In [49]:
merged[merged["adjDiv100"].isna()] # the nan are only on the last day that was not available for the VIX
# hence we can drop it

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv,adjDiv100
145638,20210611,AAPL,127.35,127.44,126.1,53382200.0,-0.425812,50.622933,72.870371,2.508869,0.010612,0.009833,0.009785,0.011516,0.014609,3092922000.0,
145639,20210611,AMGN,242.77,245.5,241.98,2001843.0,-2.239841,48.904432,-0.269191,11.51563,0.0094,-0.007644,-0.007673,0.01137,0.014854,410322700.0,
145640,20210611,AXP,164.51,164.54,162.51,1630111.0,3.128713,63.294175,67.150527,11.93508,0.009953,0.014054,0.013956,0.009544,0.013467,484347000.0,
145641,20210611,BA,247.28,251.12,245.78,9918690.0,3.850208,54.331793,43.57195,22.02753,0.011293,-0.004268,-0.004277,0.016503,0.018669,366122300.0,
145642,20210611,CAT,220.7,227.92,220.03,8729381.0,-1.160196,36.933626,-293.58591,51.03561,0.015502,-0.022283,-0.022535,0.014182,0.014068,222311600.0,
145643,20210611,CSCO,54.77,55.35,54.54,17132720.0,0.655393,63.08699,175.854097,28.719198,0.008905,-0.004725,-0.004736,0.008603,0.011512,3448313000.0,
145644,20210611,CVX,107.91,108.97,107.86,7730489.0,0.744114,54.077098,63.020142,19.698619,0.007531,-0.004612,-0.004623,0.016392,0.015767,766996600.0,
145645,20210611,DIS,177.38,178.49,176.81,5473003.0,-0.966735,46.808875,54.908303,0.6914,0.00396,0.004587,0.004577,0.009943,0.010358,1291741000.0,
145646,20210611,GS,378.05,378.75,375.11,2114857.0,7.529486,58.630961,31.291947,8.577473,0.013176,0.010991,0.010931,0.01357,0.01387,-233046100.0,
145647,20210611,HD,310.77,311.24,308.35,2783890.0,-2.900028,46.527719,-90.063403,28.131604,0.006475,0.007979,0.007947,0.008696,0.012087,409248900.0,


In [50]:
merged.dropna(inplace=True)
merged

Unnamed: 0,datadate,tic,adjcp,high,low,volume,macd,rsi_21,cci_21,dx_21,ret_vola_7d,return_daily,log_return_daily,ret_vola_21d,ret_vola_63d,obv,adjDiv100
0,20000103,AAPL,0.999442,1.004464,0.907924,4783300.0,0.020676,62.689835,71.109369,32.884996,0.041159,0.088754,0.085034,0.037706,0.039918,443302500.0,0.2421
1,20000103,AMGN,62.937500,70.000000,62.875000,22916100.0,4.433505,69.256746,162.560084,51.622364,0.055611,0.047867,0.046756,0.040388,0.033829,342177900.0,0.2421
2,20000103,AXP,52.416667,54.833333,52.000000,1887700.0,0.758946,51.210306,-13.428284,7.875716,0.027691,-0.054135,-0.055656,0.021863,0.024413,149873200.0,0.2421
3,20000103,BA,40.562500,41.687500,39.812500,2637100.0,0.091544,49.165297,67.240227,4.977704,0.020398,-0.021116,-0.021342,0.021983,0.023994,-60797600.0,0.2421
4,20000103,CAT,24.312500,24.500000,23.843750,2527500.0,-0.376621,50.137377,167.696352,16.830546,0.019601,0.033201,0.032661,0.023378,0.027229,-13391900.0,0.2421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145633,20210610,UNH,401.490000,402.850000,398.470000,4033604.0,0.528489,51.685914,-157.317168,13.885261,0.006250,0.000972,0.000972,0.006399,0.010795,362359701.0,0.1610
145634,20210610,VZ,57.340000,57.610000,57.220000,12011370.0,-0.163279,50.224807,24.883644,2.118092,0.003654,0.005083,0.005070,0.005586,0.007574,63852791.0,0.1610
145635,20210610,WBA,55.310000,55.580000,53.570000,6633430.0,0.134194,56.055460,61.547628,5.316343,0.017459,0.035186,0.034581,0.016707,0.015791,-39050502.0,0.1610
145636,20210610,WMT,139.880000,140.190000,139.080000,5458551.0,0.125756,49.169215,-60.801067,17.664751,0.005048,0.005752,0.005736,0.009450,0.008988,626100947.0,0.1610


In [51]:
#### SAVE DATA SET

In [52]:
merged.to_csv(os.path.join(abspath, "data", "preprocessed", "US_stocks_WDB_full.csv"))