In [1]:
%run init_notebook.py

In [2]:
import pandas as pd
import numpy as np

import os
import datetime as dt
from functools import wraps, reduce

import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
from settings import RAW_DATA_DIR, WORK_DATA_DIR
from utils.utils import *
from utils.load import save_file, load_data
from utils.cast_data import apply_datetime_format, apply_textmonth_to_nummonth

In [19]:
# load raw format
fears = load_data(file_name="fears.csv", file_path=RAW_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.csv", file_path=RAW_DATA_DIR).rename(columns={"Date": "date"})
naaim_risk = load_data(file_name="naaim_risk.xlsx", file_path=RAW_DATA_DIR).rename(columns={"Date": "date",
                                                                                  "NAAIM Number": "naaim_ind",
                                                                                 "Standard Deviation": "naaim_std"}).iloc[:-4].sort_values("date").reset_index(drop=True)

ff_factors5 = load_data(file_name="ff_factors5.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").rename(columns={"ff_Unnamed: 0" : "date",
                                                                                                                           "ff_Mkt-RF": "ff_M_RF"}).iloc[:-1]
ff_factors3 = load_data(file_name="ff_factors3.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").add_suffix("_3").rename(columns={"ff_Unnamed: 0_3" : "date",
                                                                                                                                            "ff_Mkt-RF_3": "ff_M_RF_3"}).iloc[:-1]

us_termspread = load_data(file_name="us_termspread.csv", file_path=RAW_DATA_DIR).rename(columns={"DATE": "date", "TEDRATE": "termspread"})
us_termspread["termspread"] = us_termspread["termspread"].replace({"." : np.nan})

aaii_bull = load_data(file_name="aaii_bull.xlsx", file_path=RAW_DATA_DIR, sheet_name="raw_data").iloc[:, :10].add_prefix("aaii_").rename(columns={"aaii_Reported Date": "date"})
sp500 = load_data(file_name="sp500_yf.xlsx", file_path=RAW_DATA_DIR, sheet_name="sp500").add_prefix("sp_").rename(columns=
                                                                                                                  {
                                                                                                                      "sp_Date": "date",
                                                                                                                      "sp_Volume": "sp_volume",
                                                                                                                      "sp_Close*": "sp_close",
                                                                                                                      "sp_Adj Close**": "sp_adj_close"
                                                                                                                  }
                                                                                                                 ).sort_values("date", ascending=True).reset_index(drop=True)
sp500["date"] = sp500.date.apply(lambda x: apply_textmonth_to_nummonth(x))

pc1 = load_data(file_name="cboe_putcall_total_2003-2012.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"Trade_date": "date", "P/C Ratio": "pc_ratio"})
pc2 = load_data(file_name="cboe_putcall_total_2006-2019.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"DATE": "date",
                                                                                                        "CALLS": "Call",
                                                                                                        "PUTS": "Put",
                                                                                                        "TOTAL": "Total",
                                                                                                        "P/C Ratio": "pc_ratio"})
goog_sent = load_data(file_name="gog_sent.csv", file_path=RAW_DATA_DIR)
goog_sent = goog_sent[goog_sent.country=="United States"].drop(["Unnamed: 0", "country"], axis=1).rename(columns={"sentiment_combined": "goog_sent",
                                                                                                                  "week": "date"}).reset_index(drop=True)
;

('Date',)
Unknown columns found
['Date']
('Date',)
('Standard Deviation',)
('NAAIM Number',)
Unknown columns found
['Date', 'Standard Deviation', 'NAAIM Number']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RMW',)
('CMA',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RF']
('DATE',)
('TEDRATE',)
Unknown columns found
['DATE', 'TEDRATE']
('Reported Date',)
('Bullish',)
('Neutral',)
('Bearish',)
('Bullish 8-week Mov Avg',)
('Bull-Bear Spread',)
('Bullish Average',)
('Bullish Average +St. Dev.',)
('Bullish Average - St. Dev.',)
('S&P 500 Weekly High',)
('S&P 500 Weekly Low',)
('S&P 500 Weekly Close',)
Unknown columns found
['Reported Date', 'Bullish', 'Neutral', 'Bearish', 'Bullish 8-week Mov Avg', 'Bull-Bear Spread', 'Bullish Average', 'Bullish Average +St. Dev.', 'Bullish Average - St. Dev.', 'S&P 500 Weekly High', 'S&P 500 Wee

''

In [20]:
# create cast_dict
_ = list(fears.columns)
_.extend(list(cbeo_vix.columns))
_.extend(list(naaim_risk.columns))
_.extend(list(ff_factors3.columns))
_.extend(list(ff_factors5.columns))
_.extend(list(us_termspread.columns))
_.extend(list(pc1.columns))
_.extend(list(aaii_bull.columns))
_.extend(list(sp500.columns))
_.extend(list(goog_sent.columns))

_ = list(set(_))
_.sort()
# {item: float for item in _}

In [21]:
# save files in feather format
save_file(fears, file_name="fears.feather", file_path=WORK_DATA_DIR)
save_file(cbeo_vix, file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
save_file(naaim_risk, file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors5, file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors3, file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
save_file(us_termspread, file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
save_file(pc1, file_name="put_call1.feather", file_path=WORK_DATA_DIR)
save_file(pc2, file_name="put_call2.feather", file_path=WORK_DATA_DIR)
save_file(aaii_bull, file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
save_file(sp500, file_name="sp500.feather", file_path=WORK_DATA_DIR)
save_file(goog_sent, file_name="goog_sent.feather", file_path=WORK_DATA_DIR)

Load feather files

In [22]:
fears = load_data(file_name="fears.feather", file_path=WORK_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
naaim_risk = load_data(file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
ff_factors5 = load_data(file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
ff_factors3 = load_data(file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
us_termspread = load_data(file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
put_call1 = load_data(file_name="put_call1.feather", file_path=WORK_DATA_DIR)
put_call2 = load_data(file_name="put_call2.feather", file_path=WORK_DATA_DIR)
aaii_bull = load_data(file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
sp500 = load_data(file_name="sp500.feather", file_path=WORK_DATA_DIR)
goog_sent = load_data(file_name="goog_sent.feather", file_path=WORK_DATA_DIR, europe_time_slash=True)

put_call = pd.concat([put_call1,
                      put_call2.set_index("date").loc[put_call1.iloc[-1].date:].iloc[1:].reset_index()],
                     axis=0).reset_index(drop=True)

In [23]:
data_dict = {}
data_dict.update(
    {
        "data_cols" : 
        {
        "fears": list(fears.drop("date", axis=1).columns),
        "cbeo_vix": list(cbeo_vix.drop("date", axis=1).columns),
        "naaim_risk": list(naaim_risk.drop("date", axis=1).columns),
        "ff_factors5": list(ff_factors5.drop("date", axis=1).columns),
        "termspread": list(us_termspread.drop("date", axis=1).columns),
        "put_call": list(put_call.drop("date", axis=1).columns),
        "aai_bull": list(aaii_bull.drop("date", axis=1).columns),
        "sp500": list(sp500.drop("date", axis=1).columns),
            "goog_sent": list(goog_sent.drop("date", axis=1).columns)
        }
    }
)

dict_df = dict(zip( ["fears", "cbeo_vix", "naaim_risk", "ff_factors3", "ff_factors5", "us_termspread", "put_call", "aaii_bull", "sp500", "goog_sent"],
                   [fears, cbeo_vix, naaim_risk, ff_factors3, ff_factors5, us_termspread, put_call, aaii_bull, sp500, goog_sent]))

Merge files

In [24]:
# time interval overview
output = []
for df in dict_df.values():
    output.append([df["date"].min(),
                   df["date"].max(), 
                   (df["date"].max() - df["date"].min()).days,
                   round((df["date"].max() - df["date"].min()).days / 7),
                   len(df) / (df["date"].max() - df["date"].min()).days,
                  (df["date"] - df["date"].shift(1)).apply(lambda x: x.days).mean()])
pd.DataFrame(output, columns=["min", "max", "days", "weeks", "days_perc", "lag_mean"], index=dict_df.keys())

Unnamed: 0,min,max,days,weeks,days_perc,lag_mean
fears,2004-07-01,2011-12-30,2738,391,0.69065,1.448677
cbeo_vix,1990-01-02,2021-09-30,11594,1656,0.994739,1.005376
naaim_risk,2006-07-05,2022-02-16,5705,815,0.143208,6.991422
ff_factors3,1926-07-01,2021-12-31,34882,4983,0.720945,1.387124
ff_factors5,1963-07-01,2021-12-30,21367,3052,0.68924,1.450971
us_termspread,1986-01-02,2022-01-21,13168,1881,0.714383,1.399957
put_call,2003-10-17,2019-10-04,5831,833,0.689076,1.451581
aaii_bull,1987-07-24,2022-01-05,12584,1798,0.1428,7.006682
sp500,1990-01-02,2021-12-30,11685,1669,0.69003,1.381791
goog_sent,2004-07-04,2014-12-21,3822,546,0.143119,7.0


In [25]:
aaii_bull["week"] = aaii_bull.date.apply(lambda x: apply_date_to_week(x))
naaim_risk["week"] = naaim_risk.date.apply(lambda x: apply_date_to_week(x))
goog_sent["week"] = goog_sent.date.apply(lambda x: apply_date_to_week(x))

# drop week duplicates
naaim_risk.drop_duplicates("week", inplace=True)
aaii_bull.drop_duplicates("week", inplace=True)
goog_sent.drop_duplicates("week", inplace=True)

In [26]:
# merge weekly data
data_frames = [aaii_bull.drop("date", axis=1),
               goog_sent.drop("date", axis=1),
               naaim_risk.drop("date", axis=1)]

df_sub = reduce(lambda  left,right: pd.merge(left,right,on=['week'],
                                            how='outer'), data_frames)

In [27]:
# merge daily and weekly data
df_list = [put_call,
           cbeo_vix,
           fears,
           ff_factors5,
           us_termspread,
           sp500]

df = reduce(lambda  left,right: pd.merge(left,
                                                right,
                                                on=['date'],
                                                how='outer'),
                   df_list)
df["week"] = df.date.apply(lambda x: apply_date_to_week(x))

df = pd.merge(df,
         df_sub,
         on="week",
         how="outer")

In [28]:
df_w = cut_to_weekly_data(df)

In [32]:
df = df[~df.date.isna()]

In [33]:
save_file(data_dict, "data_dict.pkl", WORK_DATA_DIR)

In [34]:
save_file(df, file_name="merged_data.csv", file_path=WORK_DATA_DIR, index=False)