In [1]:
# Execute init_notebook.py inside this kernel before anything else runs.
# NOTE(review): the script is not shown here; presumably it prepares paths/settings
# needed by the project imports below — confirm.
%run init_notebook.py

In [2]:
import pandas as pd
import numpy as np

In [3]:
from settings import RAW_DATA_DIR, WORK_DATA_DIR, OUT_DATA_DIR, OUT_MODEL_DIR, random_state
from utils.utils import *
from utils.load import load_data, save_file, save_model
from utils.cast_data import apply_datetime_format, check_datetime_sanity
from utils.plotting import *
from src.src import *

In [4]:
# Load the merged data set and its accompanying metadata dictionary from the
# working-data directory.
# NOTE(review): the dictionary is a .pkl file — only load pickles from a trusted source.
df_all = load_data(file_path=WORK_DATA_DIR, file_name="merged_data.csv")
data_dict = load_data(file_path=WORK_DATA_DIR, file_name="merged_data_dict.pkl")

In [5]:
# select relevant data

## SELECT COLS INCLUDED HERE
# `drop_groups` names the column groups (keys of data_dict["data_cols"]) that are
# removed from the merged frame. Alternatives, with the approximate resulting
# length noted for each:
#   ("fears", "goog_sent", "put_call", "naaim_risk")  -> len 1500
#   ("fears", "goog_sent", "put_call")                -> len 760
#   ("fears", "goog_sent")                            -> len 660
#   ("goog_sent",)                                    -> len 440
#   ()                                                -> len 270 (drop nothing)
drop_groups = ("fears", "goog_sent", "put_call", "naaim_risk")

# Flatten in group order so `drop_cols` is exactly the list the former "+"
# concatenation produced; the save cell compares `drop_cols` against literal lists.
# An empty `drop_groups` yields [] (instead of None, which DataFrame.drop rejects).
drop_cols = [col for group in drop_groups for col in data_dict["data_cols"][group]]

# Always derive df_w from df_all so that re-running this cell gives the same result.
# DataFrame.drop returns a new frame, so df_all itself is left untouched.
df_w = df_all.drop(columns=drop_cols)

# get data on weekly basis
df_w = cut_to_weekly_data(df_w, "is_thu").reset_index(drop=True)

# preserve actual prices: later cells transform sp_close, this column keeps the raw values
df_w["sp_true_vals"] = df_w["sp_close"]

In [6]:
# NOTE(review): cut_to_weekly_data(df_w, "is_thu") was already applied in the previous
# cell; this repeats the call (without reset_index) — confirm it is intentional.
df_w = cut_to_weekly_data(df_w, "is_thu")
# Snapshot of the weekly frame taken before the cells below add sp_agg1 and overwrite
# df_w with the log-return result; it is what the save cell writes as "no_trans".
df_w_notrans = df_w.copy()

In [7]:
# Aggregation variable: element-wise product of the first four lagged closes.
# This cell runs before the log-return cell, so it multiplies the values as loaded.
df_w["sp_agg1"] = (
    df_w["sp_close_lag1"]
    .mul(df_w["sp_close_lag2"])
    .mul(df_w["sp_close_lag3"])
    .mul(df_w["sp_close_lag4"])
)

In [12]:
# Register column metadata in data_dict.

# Name of the column holding the untransformed S&P prices.
data_dict["sp_true_vals"] = "sp_true_vals"

# Every column whose name contains "sp_close": the close itself plus its lags & leads.
data_dict["sp_cols"] = [col for col in df_w.columns if "sp_close" in col]

# Selected feature columns followed by all sp_close columns (which include the
# target), i.e. every column the later cells work with.
# Candidates currently left out: fears35, fears25, fears30, naaim_ind, naaim_max,
# naaim_q1, ff_RF, pc_ratio, goog_sent.
relevant_cols = [
    'vixo',
    'ff_M_RF',
    'ff_SMB',
    'ff_HML',
    'ff_RMW',
    'ff_CMA',
    'termspread',
    'aaii_Bullish',
    'aaii_Bearish',
    'aaii_Neutral',
] + data_dict["sp_cols"]

data_dict["relevant_cols"] = relevant_cols

In [13]:
# Sanity check: every column registered as relevant must exist in df_w.
# A named variable is used rather than `_`, because IPython uses `_` for the last
# cell output and stops updating it once user code assigns to it.
missing_cols = [col for col in data_dict["relevant_cols"] if col not in df_w.columns]
assert not missing_cols, f"Relevant cols has columns not pertaining to df: {missing_cols}"

In [14]:
# Log returns for every relevant column except the Fama-French factors,
# which are returns already.
# NOTE: df_w is overwritten with the transformed frame, so re-running this cell
# would apply df_log_return to already-transformed data.
ff_factor_cols = data_dict["data_cols"]["ff_factors5"]
log_return_cols = [col for col in data_dict["relevant_cols"] if col not in ff_factor_cols]

df_w, dist, log = df_log_return(df_w, cols=log_return_cols)

# Store the two extra return values of df_log_return alongside the other metadata.
data_dict = update_dict(
    data_dict,
    update_keys=["dist_translation", "log_return"],
    update_vals=[dist, log],
)

In [15]:
# Run the stationarity and normality tests on all relevant columns (plots disabled)
# and keep both results in data_dict.
stest = StatsTest(plot=False)

stationarity = stest.df_test_stationarity(df_w, data_dict["relevant_cols"])
normality = stest.df_test_normality(df_w, cols=data_dict["relevant_cols"])

data_dict.update(stationarity=stationarity, normality=normality)

# show the per-column stationarity result
stationarity

{'vixo': True,
 'ff_M_RF': True,
 'ff_SMB': True,
 'ff_HML': True,
 'ff_RMW': True,
 'ff_CMA': True,
 'termspread': True,
 'aaii_Bullish': True,
 'aaii_Bearish': True,
 'aaii_Neutral': True,
 'sp_close': True,
 'sp_close_lead14': True,
 'sp_close_lead13': True,
 'sp_close_lead12': True,
 'sp_close_lead11': True,
 'sp_close_lead10': True,
 'sp_close_lead9': True,
 'sp_close_lead8': True,
 'sp_close_lead7': True,
 'sp_close_lead6': True,
 'sp_close_lead5': True,
 'sp_close_lead4': True,
 'sp_close_lead3': True,
 'sp_close_lead2': True,
 'sp_close_lead1': True,
 'sp_close_lag1': True,
 'sp_close_lag2': True,
 'sp_close_lag3': True,
 'sp_close_lag4': True,
 'sp_close_lag5': True,
 'sp_close_lag6': True,
 'sp_close_lag7': True}

# Saving

In [16]:
# Display the transformed weekly frame for a visual check before it is saved.
df_w

Unnamed: 0,date,sp_Open,sp_High,sp_Low,sp_close,sp_adj_close,sp_volume,vixo,vixh,vixl,...,sp_close_lead1,sp_close_lag1,sp_close_lag2,sp_close_lag3,sp_close_lag4,sp_close_lag5,sp_close_lag6,sp_close_lag7,sp_true_vals,sp_agg1
1,1990-01-18,337.40,338.38,333.98,-0.030116,338.19,1.785900e+08,0.193892,24.34,24.34,...,-0.002297,-0.028949,-0.025698,-0.048621,-0.035459,-0.020279,-0.032436,-0.028396,338.19,1.317044e+10
2,1990-01-25,330.26,332.33,325.33,-0.036465,326.08,1.722700e+08,0.051642,25.63,25.63,...,-0.040159,-0.021389,-0.027189,-0.019839,-0.002297,-0.030116,-0.028949,-0.025698,326.08,1.227126e+10
3,1990-02-01,329.08,329.86,327.76,0.008276,328.79,1.545800e+08,-0.030101,24.87,24.87,...,0.015593,-0.003579,-0.026369,-0.015803,-0.040159,-0.036465,-0.021389,-0.027189,328.79,1.126105e+10
4,1990-02-08,333.75,336.09,332.00,0.012603,332.96,1.762400e+08,-0.045238,23.77,23.77,...,0.008126,0.014091,0.020471,0.020243,0.015593,0.008276,-0.003579,-0.026369,332.96,1.208238e+10
5,1990-02-15,332.01,335.21,331.61,0.005780,334.89,1.746200e+08,-0.187298,19.71,19.71,...,-0.002701,-0.005227,0.004117,-0.005348,0.008126,0.012603,0.014091,0.020471,334.89,1.210254e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,2021-09-02,4534.48,4545.85,4524.66,0.014867,4536.95,2.897010e+09,-0.070590,16.98,15.73,...,0.005762,0.006186,0.008092,0.010937,0.015127,0.014467,0.021564,0.008540,4536.95,4.178546e+14
1587,2021-09-09,4513.02,4529.90,4492.07,-0.009672,4493.28,3.035300e+09,0.178010,19.54,17.17,...,-0.017090,-0.002217,-0.000586,0.001465,0.006098,0.012028,0.005874,0.009442,4493.28,4.198480e+14
1588,2021-09-16,4477.09,4485.87,4443.80,-0.004356,4473.75,3.321030e+09,-0.054439,19.76,17.65,...,-0.005756,-0.007420,-0.017178,-0.014816,-0.017425,-0.006834,-0.001906,-0.001936,4473.75,3.966503e+14
1589,2021-09-23,4406.75,4465.40,4406.75,-0.005552,4448.98,2.833290e+09,0.078328,20.21,18.42,...,0.005060,-0.019166,-0.020202,-0.025153,-0.005756,-0.004356,-0.007420,-0.017178,4448.98,3.697316e+14


In [18]:
# Save the transformed frame, the untransformed snapshot and the metadata dict.
# The file suffix is chosen by matching `drop_cols` against the known column sets
# below; each suffix maps to the exact list of dropped columns it stands for.
save_variants = {
    "660": ['fears25', 'fears30', 'fears35', 'goog_sent'],
    "760": ['fears25',
            'fears30',
            'fears35',
            'goog_sent',
            'Call',
            'Put',
            'Total',
            'pc_ratio'],
    "1560": ['fears25',
             'fears30',
             'fears35',
             'goog_sent',
             'Call',
             'Put',
             'Total',
             'pc_ratio',
             'Mean/Average',
             'naaim_max',
             'naaim_q1',
             'Quart 2 (median)',
             'Quart 3 (25% at/above)',
             'Most Bullish Response',
             'naaim_std',
             'naaim_ind',
             'S&P 500'],
}

# Same list-equality test as before, so only an exact match selects a suffix.
save_suffix = next(
    (suffix for suffix, cols in save_variants.items() if drop_cols == cols),
    None,
)

if save_suffix is None:
    # Previously this case fell through silently; make it visible that nothing was written.
    print(f"Nothing saved: drop_cols matches no known variant: {drop_cols}")
else:
    print(f"Saving as df_weekly_{save_suffix}")
    save_file(df_w, file_name=f"df_weekly_{save_suffix}.csv", file_path=WORK_DATA_DIR, index=False)
    save_file(df_w_notrans, file_name=f"df_weekly_no_trans_{save_suffix}.csv", file_path=WORK_DATA_DIR, index=False)
    save_file(data_dict, file_name=f"dict_weekly_{save_suffix}.pkl", file_path=WORK_DATA_DIR, index=False)

Saving as df_weekly_1560
