In [1]:
%run init_notebook.py

In [2]:
import pandas as pd
import numpy as np

import os
import datetime as dt
from functools import wraps

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from settings import RAW_DATA_DIR, WORK_DATA_DIR
from utils.utils import *
from utils.cast_data import apply_datetime_format

In [4]:
# load raw format
fears = load_data(file_name="fears.csv", file_path=RAW_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.csv", file_path=RAW_DATA_DIR).rename(columns={"Date": "date"})
naaim_risk = load_data(file_name="naaim_risk.xlsx", file_path=RAW_DATA_DIR).rename(columns={"Date": "date",
                                                                                  "NAAIM Number": "naaim_ind",
                                                                                 "Standard Deviation": "naaim_std"}).iloc[:-4].sort_values("date").reset_index(drop=True)

ff_factors5 = load_data(file_name="ff_factors5.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").rename(columns={"ff_Unnamed: 0" : "date",
                                                                                                                           "ff_Mkt-RF": "ff_M_RF"}).iloc[:-1]
ff_factors3 = load_data(file_name="ff_factors3.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").add_suffix("_3").rename(columns={"ff_Unnamed: 0_3" : "date",
                                                                                                                                            "ff_Mkt-RF_3": "ff_M_RF_3"}).iloc[:-1]

us_termspread = load_data(file_name="us_termspread.csv", file_path=RAW_DATA_DIR).rename(columns={"DATE": "date", "TEDRATE": "termspread"})
us_termspread["termspread"] = us_termspread["termspread"].replace({"." : np.nan})

# trade_vol = load_data("trade_vol.csv", file_path=RAW_DATA_DIR)

aaii_bull = load_data(file_name="aaii_bull.xlsx", file_path=RAW_DATA_DIR, sheet_name="raw_data").iloc[:, :10].add_prefix("aaii_").rename(columns={"aaii_Reported Date": "date"})
sp500 = load_data(file_name="sp500_yf.xlsx", file_path=RAW_DATA_DIR, sheet_name="sp500").add_prefix("sp_").rename(columns=
                                                                                                                  {
                                                                                                                      "sp_Date": "date",
                                                                                                                      "sp_Volume": "sp_volume",
                                                                                                                      "sp_Close*": "sp_close",
                                                                                                                      "sp_Adj Close**": "sp_adj_close"
                                                                                                                  }
                                                                                                                 ).sort_values("date", ascending=True).reset_index(drop=True)
sp500["date"] = sp500.date.apply(lambda x: apply_textmonth_to_nummonth(x))

pc1 = load_data(file_name="cboe_putcall_total_2003-2012.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"Trade_date": "date", "P/C Ratio": "pc_ratio"})
pc2 = load_data(file_name="cboe_putcall_total_2006-2019.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"DATE": "date",
                                                                                                        "CALLS": "Call",
                                                                                                        "PUTS": "Put",
                                                                                                        "TOTAL": "Total",
                                                                                                        "P/C Ratio": "pc_ratio"})
;

('Date',)
Unknown columns found
['Date']
('Date',)
('Standard Deviation',)
('NAAIM Number',)
Unknown columns found
['Date', 'Standard Deviation', 'NAAIM Number']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RMW',)
('CMA',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RF']
('DATE',)
('TEDRATE',)
Unknown columns found
['DATE', 'TEDRATE']
('Reported Date',)
('Bullish',)
('Neutral',)
('Bearish',)
('Bullish 8-week Mov Avg',)
('Bull-Bear Spread',)
('Bullish Average',)
('Bullish Average +St. Dev.',)
('Bullish Average - St. Dev.',)
('S&P 500 Weekly High',)
('S&P 500 Weekly Low',)
('S&P 500 Weekly Close',)
Unknown columns found
['Reported Date', 'Bullish', 'Neutral', 'Bearish', 'Bullish 8-week Mov Avg', 'Bull-Bear Spread', 'Bullish Average', 'Bullish Average +St. Dev.', 'Bullish Average - St. Dev.', 'S&P 500 Weekly High', 'S&P 500 Wee

''

In [5]:
# create cast_dict
_ = list(fears.columns)
_.extend(list(cbeo_vix.columns))
_.extend(list(naaim_risk.columns))
_.extend(list(ff_factors3.columns))
_.extend(list(ff_factors5.columns))
_.extend(list(us_termspread.columns))
_.extend(list(pc1.columns))
_.extend(list(aaii_bull.columns))
_.extend(list(sp500.columns))

_ = list(set(_))
_.sort()
# {item: float for item in _}

In [6]:
# save files in feather format
save_file(fears, file_name="fears.feather", file_path=WORK_DATA_DIR)
save_file(cbeo_vix, file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
save_file(naaim_risk, file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors5, file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors3, file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
save_file(us_termspread, file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
save_file(pc1, file_name="put_call1.feather", file_path=WORK_DATA_DIR)
save_file(pc2, file_name="put_call2.feather", file_path=WORK_DATA_DIR)
save_file(aaii_bull, file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
save_file(sp500, file_name="sp500.feather", file_path=WORK_DATA_DIR)

Load feather files

In [7]:
fears = load_data(file_name="fears.feather", file_path=WORK_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
naaim_risk = load_data(file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
ff_factors5 = load_data(file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
ff_factors3 = load_data(file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
us_termspread = load_data(file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
put_call1 = load_data(file_name="put_call1.feather", file_path=WORK_DATA_DIR)
put_call2 = load_data(file_name="put_call2.feather", file_path=WORK_DATA_DIR)
aaii_bull = load_data(file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
# # trade_vol = load_data("trade_vol.csv", file_path=RAW_DATA_DIR)
sp500 = load_data(file_name="sp500.feather", file_path=WORK_DATA_DIR)

put_call = pd.concat([put_call1,
                      put_call2.set_index("date").loc[put_call1.iloc[-1].date:].iloc[1:].reset_index()],
                     axis=0).reset_index(drop=True)

In [8]:
data_dict = {}
data_dict.update(
    {
        "data_cols" : 
        {
        "fears": list(fears.drop("date", axis=1).columns),
        "cbeo_vix": list(cbeo_vix.drop("date", axis=1).columns),
        "naaim_risk": list(naaim_risk.drop("date", axis=1).columns),
        "ff_factors5": list(ff_factors5.drop("date", axis=1).columns),
        "termspread": list(us_termspread.drop("date", axis=1).columns),
        "put_call": list(put_call.drop("date", axis=1).columns),
        "aai_bull": list(aaii_bull.drop("date", axis=1).columns),
        "sp500": list(sp500.drop("date", axis=1).columns)
        }
    }
)

dict_df = dict(zip( ["fears", "cbeo_vix", "naaim_risk", "ff_factors3", "ff_factors5", "us_termspread", "put_call", "aaii_bull", "sp500"],
                   [fears, cbeo_vix, naaim_risk, ff_factors3, ff_factors5, us_termspread, put_call, aaii_bull, sp500]))

# data_dict.update({"df" : dict_df})

Merge files

In [9]:
# time overview
output = []
for df in dict_df.values():
    output.append([df["date"].min(),
                   df["date"].max(), 
                   (df["date"].max() - df["date"].min()).days,
                   round((df["date"].max() - df["date"].min()).days / 7),
                   len(df) / (df["date"].max() - df["date"].min()).days,
                  (df["date"] - df["date"].shift(1)).apply(lambda x: x.days).mean()])
pd.DataFrame(output, columns=["min", "max", "days", "weeks", "days_perc", "lag_mean"], index=dict_df.keys())

Unnamed: 0,min,max,days,weeks,days_perc,lag_mean
fears,2004-07-01,2011-12-30,2738,391,0.69065,1.448677
cbeo_vix,1990-01-02,2021-09-30,11594,1656,0.994739,1.005376
naaim_risk,2006-07-05,2022-02-16,5705,815,0.143208,6.991422
ff_factors3,1926-07-01,2021-12-31,34882,4983,0.720945,1.387124
ff_factors5,1963-07-01,2021-12-30,21367,3052,0.68924,1.450971
us_termspread,1986-01-02,2022-01-21,13168,1881,0.714383,1.399957
put_call,2003-10-17,2019-10-04,5831,833,0.689076,1.451581
aaii_bull,1987-07-24,2022-01-05,12584,1798,0.1428,7.006682
sp500,1990-01-02,2021-12-30,11685,1669,0.69003,1.381791


In [10]:
aaii_bull["week"] = aaii_bull.date.apply(lambda x: apply_date_to_week(x))
naaim_risk["week"] = naaim_risk.date.apply(lambda x: apply_date_to_week(x))
naaim_risk.drop_duplicates("week", inplace=True)
aaii_bull.drop_duplicates("week", inplace=True)

In [11]:
_ = pd.merge(naaim_risk,
              aaii_bull.drop("date", axis=1),
              on="week",
              how="outer").rename(columns={"date": "date_week"}).sort_values("week", ascending=True)

df = pd.merge(put_call,
              cbeo_vix, 
              on="date", 
              how="outer").sort_values("date")

df = pd.merge(df,
              fears,
              on="date",
              how="outer").sort_values("date")

df = pd.merge(df,
              ff_factors5,
              on="date",
              how="outer").sort_values("date")

df = pd.merge(df,
              us_termspread,
              on="date",
              how="outer").sort_values("date")

df = pd.merge(df,
              sp500,
              on="date",
              how="outer").sort_values("date")

df["week"] = df.date.apply(lambda x: apply_date_to_week(x))

In [12]:
df = pd.merge(df,
         _,
         on="week",
         how="outer")

In [13]:
df.drop(data_dict["data_cols"]["fears"], axis=1).drop_duplicates("week").dropna()

Unnamed: 0,date,Call,Put,Total,pc_ratio,vixo,vixh,vixl,vix,vxoo,...,S&P 500,aaii_Bullish,aaii_Neutral,aaii_Bearish,aaii_Total,aaii_Bullish 8-week Mov Avg,aaii_Bull-Bear Spread,aaii_Bullish Average,aaii_Bullish Average +St. Dev.,aaii_Bullish Average - St. Dev.
14529,2006-07-03,482979.0,463824.0,946803.0,0.96,13.29,13.51,12.77,13.05,12.20,...,1265.48,0.377000,0.196700,0.426200,0.999900,0.333175,-0.049200,0.379967,0.480016,0.279917
14534,2006-07-10,1003583.0,927974.0,1931557.0,0.92,14.17,14.50,13.67,14.02,13.27,...,1236.20,0.365000,0.240900,0.394200,1.000100,0.329562,-0.029200,0.379967,0.480016,0.279917
14539,2006-07-17,1313329.0,1425727.0,2739056.0,1.09,18.73,18.76,17.75,18.64,17.46,...,1240.29,0.238500,0.183500,0.578000,1.000000,0.318075,-0.339500,0.379967,0.480016,0.279917
14544,2006-07-24,1791832.0,1225369.0,3017201.0,0.68,17.08,17.08,14.89,14.98,15.73,...,1278.55,0.348800,0.220900,0.430200,0.999900,0.323213,-0.081400,0.379967,0.480016,0.279917
14549,2006-07-31,983758.0,823480.0,1807238.0,0.84,15.01,15.13,14.86,14.95,13.91,...,1279.36,0.314600,0.213500,0.471900,1.000000,0.329750,-0.157300,0.379967,0.480016,0.279917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17956,2019-08-19,2342839.0,2197016.0,4539855.0,0.94,17.93,18.22,16.52,16.88,18.44,...,2900.51,0.266355,0.336449,0.397196,1.000000,0.305428,-0.130841,0.379967,0.480016,0.279917
17971,2019-09-09,2339451.0,2296322.0,4635773.0,0.98,15.26,16.13,14.95,15.27,15.21,...,3000.93,0.331250,0.356250,0.312500,1.000000,0.286930,0.018750,0.379967,0.480016,0.279917
17976,2019-09-16,2053129.0,2452665.0,4505794.0,1.19,14.89,15.29,14.50,14.67,15.01,...,3006.73,0.353383,0.368421,0.278195,0.999999,0.291426,0.075188,0.379967,0.480016,0.279917
17981,2019-09-23,1904221.0,1962814.0,3867035.0,1.03,15.35,16.00,14.71,14.91,16.04,...,2966.60,0.293737,0.373650,0.332613,1.000000,0.280097,-0.038876,0.379967,0.480016,0.279917


In [14]:
save_file(data_dict, "data_dict.pkl", WORK_DATA_DIR)

In [15]:
save_file(df, file_name="merged_data.csv", file_path=WORK_DATA_DIR, index=False)