In [1]:
%run init_notebook.py

In [2]:
import pandas as pd
import numpy as np

import os
import datetime as dt
from functools import wraps

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from settings import RAW_DATA_DIR, WORK_DATA_DIR
from utils.utils import *
from utils.load import save_file, load_data
from utils.cast_data import apply_datetime_format, apply_textmonth_to_nummonth

In [4]:
# load raw format
fears = load_data(file_name="fears.csv", file_path=RAW_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.csv", file_path=RAW_DATA_DIR).rename(columns={"Date": "date"})
naaim_risk = load_data(file_name="naaim_risk.xlsx", file_path=RAW_DATA_DIR).rename(columns={"Date": "date",
                                                                                  "NAAIM Number": "naaim_ind",
                                                                                 "Standard Deviation": "naaim_std"}).iloc[:-4].sort_values("date").reset_index(drop=True)

ff_factors5 = load_data(file_name="ff_factors5.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").rename(columns={"ff_Unnamed: 0" : "date",
                                                                                                                           "ff_Mkt-RF": "ff_M_RF"}).iloc[:-1]
ff_factors3 = load_data(file_name="ff_factors3.csv", file_path=RAW_DATA_DIR, skiprows=3).add_prefix("ff_").add_suffix("_3").rename(columns={"ff_Unnamed: 0_3" : "date",
                                                                                                                                            "ff_Mkt-RF_3": "ff_M_RF_3"}).iloc[:-1]

us_termspread = load_data(file_name="us_termspread.csv", file_path=RAW_DATA_DIR).rename(columns={"DATE": "date", "TEDRATE": "termspread"})
us_termspread["termspread"] = us_termspread["termspread"].replace({"." : np.nan})

# trade_vol = load_data("trade_vol.csv", file_path=RAW_DATA_DIR)

aaii_bull = load_data(file_name="aaii_bull.xlsx", file_path=RAW_DATA_DIR, sheet_name="raw_data").iloc[:, :10].add_prefix("aaii_").rename(columns={"aaii_Reported Date": "date"})
sp500 = load_data(file_name="sp500_yf.xlsx", file_path=RAW_DATA_DIR, sheet_name="sp500").add_prefix("sp_").rename(columns=
                                                                                                                  {
                                                                                                                      "sp_Date": "date",
                                                                                                                      "sp_Volume": "sp_volume",
                                                                                                                      "sp_Close*": "sp_close",
                                                                                                                      "sp_Adj Close**": "sp_adj_close"
                                                                                                                  }
                                                                                                                 ).sort_values("date", ascending=True).reset_index(drop=True)
sp500["date"] = sp500.date.apply(lambda x: apply_textmonth_to_nummonth(x))

pc1 = load_data(file_name="cboe_putcall_total_2003-2012.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"Trade_date": "date", "P/C Ratio": "pc_ratio"})
pc2 = load_data(file_name="cboe_putcall_total_2006-2019.csv", file_path=RAW_DATA_DIR, skiprows=2).rename(columns={"DATE": "date",
                                                                                                        "CALLS": "Call",
                                                                                                        "PUTS": "Put",
                                                                                                        "TOTAL": "Total",
                                                                                                        "P/C Ratio": "pc_ratio"})
goog_sent = load_data(file_name="gog_sent.csv", file_path=RAW_DATA_DIR)
goog_sent = goog_sent[goog_sent.country=="United States"].drop(["Unnamed: 0", "country"], axis=1).rename(columns={"sentiment_combined": "goog_sent",
                                                                                                                  "week": "date"}).reset_index(drop=True)
;

('Date',)
Unknown columns found
['Date']
('Date',)
('Standard Deviation',)
('NAAIM Number',)
Unknown columns found
['Date', 'Standard Deviation', 'NAAIM Number']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RMW',)
('CMA',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
('Unnamed: 0',)
('Mkt-RF',)
('SMB',)
('HML',)
('RF',)
Unknown columns found
['Unnamed: 0', 'Mkt-RF', 'SMB', 'HML', 'RF']
('DATE',)
('TEDRATE',)
Unknown columns found
['DATE', 'TEDRATE']
('Reported Date',)
('Bullish',)
('Neutral',)
('Bearish',)
('Bullish 8-week Mov Avg',)
('Bull-Bear Spread',)
('Bullish Average',)
('Bullish Average +St. Dev.',)
('Bullish Average - St. Dev.',)
('S&P 500 Weekly High',)
('S&P 500 Weekly Low',)
('S&P 500 Weekly Close',)
Unknown columns found
['Reported Date', 'Bullish', 'Neutral', 'Bearish', 'Bullish 8-week Mov Avg', 'Bull-Bear Spread', 'Bullish Average', 'Bullish Average +St. Dev.', 'Bullish Average - St. Dev.', 'S&P 500 Weekly High', 'S&P 500 Wee

''

In [5]:
# create cast_dict
_ = list(fears.columns)
_.extend(list(cbeo_vix.columns))
_.extend(list(naaim_risk.columns))
_.extend(list(ff_factors3.columns))
_.extend(list(ff_factors5.columns))
_.extend(list(us_termspread.columns))
_.extend(list(pc1.columns))
_.extend(list(aaii_bull.columns))
_.extend(list(sp500.columns))
_.extend(list(goog_sent.columns))

_ = list(set(_))
_.sort()
# {item: float for item in _}

In [6]:
# save files in feather format
save_file(fears, file_name="fears.feather", file_path=WORK_DATA_DIR)
save_file(cbeo_vix, file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
save_file(naaim_risk, file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors5, file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
save_file(ff_factors3, file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
save_file(us_termspread, file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
save_file(pc1, file_name="put_call1.feather", file_path=WORK_DATA_DIR)
save_file(pc2, file_name="put_call2.feather", file_path=WORK_DATA_DIR)
save_file(aaii_bull, file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
save_file(sp500, file_name="sp500.feather", file_path=WORK_DATA_DIR)
save_file(goog_sent, file_name="goog_sent.feather", file_path=WORK_DATA_DIR)

Load feather files

In [7]:
fears = load_data(file_name="fears.feather", file_path=WORK_DATA_DIR)
cbeo_vix = load_data(file_name="cbeo_vix.feather", file_path=WORK_DATA_DIR)
naaim_risk = load_data(file_name="naaim_risk.feather", file_path=WORK_DATA_DIR)
ff_factors5 = load_data(file_name="ff_factors5.feather", file_path=WORK_DATA_DIR)
ff_factors3 = load_data(file_name="ff_factors3.feather", file_path=WORK_DATA_DIR)
us_termspread = load_data(file_name="us_termspread.feather", file_path=WORK_DATA_DIR)
put_call1 = load_data(file_name="put_call1.feather", file_path=WORK_DATA_DIR)
put_call2 = load_data(file_name="put_call2.feather", file_path=WORK_DATA_DIR)
aaii_bull = load_data(file_name="aaii_bull.feather", file_path=WORK_DATA_DIR)
sp500 = load_data(file_name="sp500.feather", file_path=WORK_DATA_DIR)
goog_sent = load_data(file_name="goog_sent.feather", file_path=WORK_DATA_DIR, europe_time_slash=True)

put_call = pd.concat([put_call1,
                      put_call2.set_index("date").loc[put_call1.iloc[-1].date:].iloc[1:].reset_index()],
                     axis=0).reset_index(drop=True)

In [8]:
data_dict = {}
data_dict.update(
    {
        "data_cols" : 
        {
        "fears": list(fears.drop("date", axis=1).columns),
        "cbeo_vix": list(cbeo_vix.drop("date", axis=1).columns),
        "naaim_risk": list(naaim_risk.drop("date", axis=1).columns),
        "ff_factors5": list(ff_factors5.drop("date", axis=1).columns),
        "termspread": list(us_termspread.drop("date", axis=1).columns),
        "put_call": list(put_call.drop("date", axis=1).columns),
        "aai_bull": list(aaii_bull.drop("date", axis=1).columns),
        "sp500": list(sp500.drop("date", axis=1).columns),
            "goog_sent": list(goog_sent.drop("date", axis=1).columns)
        }
    }
)

dict_df = dict(zip( ["fears", "cbeo_vix", "naaim_risk", "ff_factors3", "ff_factors5", "us_termspread", "put_call", "aaii_bull", "sp500", "goog_sent"],
                   [fears, cbeo_vix, naaim_risk, ff_factors3, ff_factors5, us_termspread, put_call, aaii_bull, sp500, goog_sent]))

Merge files

In [9]:
# time overview
output = []
for df in dict_df.values():
    output.append([df["date"].min(),
                   df["date"].max(), 
                   (df["date"].max() - df["date"].min()).days,
                   round((df["date"].max() - df["date"].min()).days / 7),
                   len(df) / (df["date"].max() - df["date"].min()).days,
                  (df["date"] - df["date"].shift(1)).apply(lambda x: x.days).mean()])
pd.DataFrame(output, columns=["min", "max", "days", "weeks", "days_perc", "lag_mean"], index=dict_df.keys())

Unnamed: 0,min,max,days,weeks,days_perc,lag_mean
fears,2004-07-01,2011-12-30,2738,391,0.69065,1.448677
cbeo_vix,1990-01-02,2021-09-30,11594,1656,0.994739,1.005376
naaim_risk,2006-07-05,2022-02-16,5705,815,0.143208,6.991422
ff_factors3,1926-07-01,2021-12-31,34882,4983,0.720945,1.387124
ff_factors5,1963-07-01,2021-12-30,21367,3052,0.68924,1.450971
us_termspread,1986-01-02,2022-01-21,13168,1881,0.714383,1.399957
put_call,2003-10-17,2019-10-04,5831,833,0.689076,1.451581
aaii_bull,1987-07-24,2022-01-05,12584,1798,0.1428,7.006682
sp500,1990-01-02,2021-12-30,11685,1669,0.69003,1.381791
goog_sent,2004-07-04,2014-12-21,3822,546,0.143119,7.0


In [47]:
aaii_bull["week"] = aaii_bull.date.apply(lambda x: apply_date_to_week(x))
naaim_risk["week"] = naaim_risk.date.apply(lambda x: apply_date_to_week(x))
goog_sent["week"] = goog_sent.date.apply(lambda x: apply_date_to_week(x))

naaim_risk.drop_duplicates("week", inplace=True)
aaii_bull.drop_duplicates("week", inplace=True)
goog_sent.drop_duplicates("week", inplace=True)

In [80]:
sub = pd.merge(naaim_risk,
              aaii_bull.drop("date", axis=1),
              on="week",
              how="outer").rename(columns={"date": "date_week"}).sort_values("week", ascending=True)

# sub = pd.merge(sub,
#                goog_sent,
#               on="week",
#               how="outer").rename(columns={"date": "date_week"}).sort_values("week", ascending=True)

In [81]:
cut_to_weekly_data(naaim_risk)

Unnamed: 0,date,Mean/Average,Most Bearish Response,Quart 1 (25% at/below),Quart 2 (median),Quart 3 (25% at/above),Most Bullish Response,naaim_std,naaim_ind,S&P 500,week
0,2006-07-05,19.44,-100.0,0.00,20.0,50.00,100.00,55.55,19.44,1265.48,200627
1,2006-07-12,31.20,-50.0,0.00,25.0,50.00,150.00,47.84,31.20,1236.20,200628
2,2006-07-19,18.76,-100.0,0.00,25.0,50.00,100.00,38.17,18.76,1240.29,200629
3,2006-07-26,17.38,-50.0,0.00,15.0,50.00,100.00,33.78,17.38,1278.55,200630
4,2006-08-02,44.23,-50.0,21.25,50.0,53.75,175.00,43.69,44.23,1279.36,200631
...,...,...,...,...,...,...,...,...,...,...,...
812,2022-01-19,56.73,-125.0,20.00,75.0,91.25,200.00,76.22,56.73,4577.11,20223
813,2022-01-26,53.39,-100.0,0.00,70.0,100.00,200.00,66.22,53.39,4349.93,20224
814,2022-02-02,62.54,0.0,20.00,60.0,90.00,200.00,49.17,62.54,4589.38,20225
815,2022-02-09,66.80,0.0,37.50,85.0,100.00,121.25,40.53,66.80,4587.18,20226


In [77]:
cut_to_weekly_data(sub.drop("goog_sent", axis=1))

Unnamed: 0,date_week,Mean/Average,Most Bearish Response,Quart 1 (25% at/below),Quart 2 (median),Quart 3 (25% at/above),Most Bullish Response,naaim_std,naaim_ind,S&P 500,...,aaii_Bullish,aaii_Neutral,aaii_Bearish,aaii_Total,aaii_Bullish 8-week Mov Avg,aaii_Bull-Bear Spread,aaii_Bullish Average,aaii_Bullish Average +St. Dev.,aaii_Bullish Average - St. Dev.,date_week.1
981,2006-07-05,19.44,-100.0,0.00,20.0,50.00,100.0,55.55,19.44,1265.48,...,0.377000,0.196700,0.426200,0.999900,0.333175,-0.049200,0.379967,0.480016,0.279917,2006-07-09
982,2006-07-12,31.20,-50.0,0.00,25.0,50.00,150.0,47.84,31.20,1236.20,...,0.365000,0.240900,0.394200,1.000100,0.329562,-0.029200,0.379967,0.480016,0.279917,2006-07-16
983,2006-07-19,18.76,-100.0,0.00,25.0,50.00,100.0,38.17,18.76,1240.29,...,0.238500,0.183500,0.578000,1.000000,0.318075,-0.339500,0.379967,0.480016,0.279917,2006-07-23
985,2006-07-26,17.38,-50.0,0.00,15.0,50.00,100.0,33.78,17.38,1278.55,...,0.348800,0.220900,0.430200,0.999900,0.323213,-0.081400,0.379967,0.480016,0.279917,2006-07-30
986,2006-08-02,44.23,-50.0,21.25,50.0,53.75,175.0,43.69,44.23,1279.36,...,0.314600,0.213500,0.471900,1.000000,0.329750,-0.157300,0.379967,0.480016,0.279917,2006-08-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1425,2014-12-17,65.12,-125.0,50.00,80.0,99.50,200.0,61.82,65.12,2012.89,...,0.387352,0.343874,0.268775,1.000001,0.484604,0.118577,0.379967,0.480016,0.279917,2014-12-21
1427,2014-02-05,50.97,-125.0,31.25,55.0,80.00,135.0,48.90,50.97,1751.64,...,0.278960,0.356974,0.364066,1.000000,0.408015,-0.085106,0.379967,0.480016,0.279917,2014-02-09
1428,2014-02-12,73.26,-125.0,60.00,80.0,100.00,150.0,50.46,73.26,1819.26,...,0.401478,0.325123,0.273399,1.000000,0.398878,0.128079,0.379967,0.480016,0.279917,2014-02-16
1429,2014-02-19,75.95,-125.0,60.00,80.0,100.00,200.0,53.89,75.95,1840.76,...,0.422000,0.350000,0.228000,1.000000,0.382808,0.194000,0.379967,0.480016,0.279917,2014-02-23


In [54]:
df = pd.merge(put_call,
              cbeo_vix, 
              on="date", 
              how="outer").sort_values("date")

# df = pd.merge(df,
#               fears,
#               on="date",
#               how="outer").sort_values("date")

df = pd.merge(df,
              ff_factors5,
              on="date",
              how="outer").sort_values("date")

df = pd.merge(df,
              us_termspread,
              on="date",
              how="outer").sort_values("date")

df = pd.merge(df,
              sp500,
              on="date",
              how="outer").sort_values("date")

df["week"] = df.date.apply(lambda x: apply_date_to_week(x))

In [55]:
df = pd.merge(df,
         sub,
         on="week",
         how="outer")

In [56]:
df_w = cut_to_weekly_data(df)

In [57]:
df_w

Unnamed: 0,date,Call,Put,Total,pc_ratio,vixo,vixh,vixl,vix,vxoo,...,aaii_Neutral,aaii_Bearish,aaii_Total,aaii_Bullish 8-week Mov Avg,aaii_Bull-Bear Spread,aaii_Bullish Average,aaii_Bullish Average +St. Dev.,aaii_Bullish Average - St. Dev.,date_week,goog_sent
14529,2006-07-03,482979.0,463824.0,946803.0,0.96,13.29,13.51,12.77,13.05,12.20,...,0.196700,0.426200,0.999900,0.333175,-0.049200,0.379967,0.480016,0.279917,2006-07-09,-0.445017
14534,2006-07-10,1003583.0,927974.0,1931557.0,0.92,14.17,14.50,13.67,14.02,13.27,...,0.240900,0.394200,1.000100,0.329562,-0.029200,0.379967,0.480016,0.279917,2006-07-16,-0.019745
14539,2006-07-17,1313329.0,1425727.0,2739056.0,1.09,18.73,18.76,17.75,18.64,17.46,...,0.183500,0.578000,1.000000,0.318075,-0.339500,0.379967,0.480016,0.279917,2006-07-23,0.457586
14544,2006-07-24,1791832.0,1225369.0,3017201.0,0.68,17.08,17.08,14.89,14.98,15.73,...,0.220900,0.430200,0.999900,0.323213,-0.081400,0.379967,0.480016,0.279917,2006-07-30,0.025680
14549,2006-07-31,983758.0,823480.0,1807238.0,0.84,15.01,15.13,14.86,14.95,13.91,...,0.213500,0.471900,1.000000,0.329750,-0.157300,0.379967,0.480016,0.279917,2006-08-06,-0.142872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16716,2014-11-17,1898194.0,1603553.0,3501747.0,0.84,14.70,14.73,13.84,13.99,12.86,...,0.270588,0.238235,0.999999,0.470940,0.252941,0.379967,0.480016,0.279917,2014-11-23,-0.047662
16721,2014-11-24,1965029.0,1544569.0,3509598.0,0.79,12.92,13.02,12.43,12.62,11.18,...,0.270627,0.207921,1.000000,0.491851,0.313531,0.379967,0.480016,0.279917,2014-11-30,0.334989
16726,2014-12-01,2223201.0,2197144.0,4420345.0,0.99,14.16,14.75,13.94,14.29,13.00,...,0.313808,0.259414,1.000000,0.495352,0.167364,0.379967,0.480016,0.279917,2014-12-07,-0.155669
16731,2014-12-08,2217307.0,2308289.0,4525596.0,1.04,13.05,14.67,12.55,14.21,11.69,...,0.326460,0.223368,1.000000,0.498294,0.226804,0.379967,0.480016,0.279917,2014-12-14,0.140924


In [59]:
df_w = cut_to_weekly_data(df.drop(data_dict["data_cols"]["goog_sent"], axis=1)).reset_index(drop=True).copy()

In [60]:
df_w

Unnamed: 0,date,Call,Put,Total,pc_ratio,vixo,vixh,vixl,vix,vxoo,...,aaii_Bullish,aaii_Neutral,aaii_Bearish,aaii_Total,aaii_Bullish 8-week Mov Avg,aaii_Bull-Bear Spread,aaii_Bullish Average,aaii_Bullish Average +St. Dev.,aaii_Bullish Average - St. Dev.,date_week
0,2006-07-03,482979.0,463824.0,946803.0,0.96,13.29,13.51,12.77,13.05,12.20,...,0.377000,0.196700,0.426200,0.999900,0.333175,-0.049200,0.379967,0.480016,0.279917,2006-07-09
1,2006-07-10,1003583.0,927974.0,1931557.0,0.92,14.17,14.50,13.67,14.02,13.27,...,0.365000,0.240900,0.394200,1.000100,0.329562,-0.029200,0.379967,0.480016,0.279917,2006-07-16
2,2006-07-17,1313329.0,1425727.0,2739056.0,1.09,18.73,18.76,17.75,18.64,17.46,...,0.238500,0.183500,0.578000,1.000000,0.318075,-0.339500,0.379967,0.480016,0.279917,2006-07-23
3,2006-07-24,1791832.0,1225369.0,3017201.0,0.68,17.08,17.08,14.89,14.98,15.73,...,0.348800,0.220900,0.430200,0.999900,0.323213,-0.081400,0.379967,0.480016,0.279917,2006-07-30
4,2006-07-31,983758.0,823480.0,1807238.0,0.84,15.01,15.13,14.86,14.95,13.91,...,0.314600,0.213500,0.471900,1.000000,0.329750,-0.157300,0.379967,0.480016,0.279917,2006-08-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,2014-11-17,1898194.0,1603553.0,3501747.0,0.84,14.70,14.73,13.84,13.99,12.86,...,0.491176,0.270588,0.238235,0.999999,0.470940,0.252941,0.379967,0.480016,0.279917,2014-11-23
438,2014-11-24,1965029.0,1544569.0,3509598.0,0.79,12.92,13.02,12.43,12.62,11.18,...,0.521452,0.270627,0.207921,1.000000,0.491851,0.313531,0.379967,0.480016,0.279917,2014-11-30
439,2014-12-01,2223201.0,2197144.0,4420345.0,0.99,14.16,14.75,13.94,14.29,13.00,...,0.426778,0.313808,0.259414,1.000000,0.495352,0.167364,0.379967,0.480016,0.279917,2014-12-07
440,2014-12-08,2217307.0,2308289.0,4525596.0,1.04,13.05,14.67,12.55,14.21,11.69,...,0.450172,0.326460,0.223368,1.000000,0.498294,0.226804,0.379967,0.480016,0.279917,2014-12-14


In [75]:
(df_w.date - df_w.date.shift(1)).apply(lambda x: x.days).describe()

count    690.000000
mean       7.010145
std        0.690942
min        5.000000
25%        7.000000
50%        7.000000
75%        7.000000
max       16.000000
Name: date, dtype: float64

In [14]:
save_file(data_dict, "data_dict.pkl", WORK_DATA_DIR)

In [15]:
save_file(df, file_name="merged_data.csv", file_path=WORK_DATA_DIR, index=False)