In [1]:
# Execute the project's notebook bootstrap script in this kernel's namespace.
# NOTE(review): its contents are not visible here; presumably it sets up paths so the
# `settings`, `utils` and `src` imports below resolve — confirm.
%run init_notebook.py

In [2]:
import pandas as pd
import numpy as np

In [3]:
from settings import RAW_DATA_DIR, WORK_DATA_DIR, OUT_DATA_DIR, OUT_MODEL_DIR, random_state
from utils.utils import *
from utils.load import load_data, save_file, save_model
from utils.cast_data import apply_datetime_format, check_datetime_sanity
from utils.plotting import *
from src.src import *

In [4]:
# Read the merged frame and its companion metadata dict from the working-data directory.
df_all = load_data(file_path=WORK_DATA_DIR, file_name="merged_data.csv")
data_dict = load_data(file_path=WORK_DATA_DIR, file_name="merged_data_dict.pkl")

In [33]:
# Select relevant data. Always start from a fresh copy of the loaded frame so that
# re-running this cell never compounds on an already-trimmed df_w.
df_w = df_all.copy()

# Column groups registered in the metadata dict (group name -> list of column names).
col_groups = data_dict["data"]["df_cols"]

## SELECT COLS INCLUDED HERE
# Leave exactly ONE `drop_cols` assignment active. Several used to be live at once and
# only the last one took effect, which hid the variant that was actually being built.

# # len 1500
# NOTE(review): the save cell writes this variant as "*_1560" — confirm which label is right.
# drop_cols = col_groups["fears"] + col_groups["goog_sent"] + col_groups["put_call"] + col_groups["naaim_risk"]

# # len 760
# drop_cols = col_groups["fears"] + col_groups["goog_sent"] + col_groups["put_call"]

# # len 660
# drop_cols = col_groups["fears"] + col_groups["goog_sent"]

# # len 440
drop_cols = col_groups["fears"]

# # len 270
# drop_cols = None

# `drop_cols = None` means "drop nothing"; DataFrame.drop raises when given no labels,
# so fall back to an empty list in that case.
df_w = df_w.drop(drop_cols or [], axis=1).copy()

# # get data on weekly basis
df_w = cut_to_weekly_data(df_w, "is_thu").reset_index(drop=True)

# actual sp prices: register the name of the column that holds the untransformed prices
data_dict["data"].update(sp_true_vals="sp_true_vals")

# # preserve actual prices (later cells transform sp_close itself)
df_w[data_dict["data"]["sp_true_vals"]] = df_w["sp_close"]

In [34]:
# NOTE(review): df_w was already passed through cut_to_weekly_data(..., "is_thu") at the end
# of the previous cell; this second call looks redundant — confirm before removing.
df_w = cut_to_weekly_data(df_w, "is_thu")
# Snapshot of the weekly frame taken before the following cells add columns to / transform df_w.
df_w_notrans = df_w.copy()

In [35]:
# Aggregation variable: running product of the first four sp_close lags,
# multiplied left to right (lag1 * lag2 * lag3 * lag4).
lag_names = [f"sp_close_lag{k}" for k in range(1, 5)]
lag_product = df_w[lag_names[0]]
for lag_name in lag_names[1:]:
    lag_product = lag_product * df_w[lag_name]
df_w["sp_agg1"] = lag_product

In [36]:
# edit data_dict

# all sp cols, lags & leads: every column whose name contains "sp_close"
sp_cols = [col for col in df_w.columns if "sp_close" in col]
data_dict["data"].update(sp_cols=sp_cols)

# Candidate (non-sp) features. Entries left commented out are deliberately excluded.
candidate_features = [
    # 'fears35',
    # 'fears25',
    # 'fears30',
    'vixo',
    'naaim_ind',
    'naaim_max',
    'naaim_q1',
    'ff_M_RF',
    'ff_SMB',
    'ff_HML',
    'ff_RMW',
    'ff_CMA',
    # 'ff_RF',
    'termspread',
    'pc_ratio',
    'aaii_Bullish',
    'aaii_Bearish',
    'aaii_Neutral',
    # 'goog_sent'
]

# Some drop_cols variants remove candidates listed above (e.g. 'pc_ratio', 'naaim_*').
# Exclude only those deliberately dropped columns, so the other variants can run end to end,
# while a misspelled or otherwise missing name is still caught by the sanity-check cell.
dropped = set(drop_cols or [])
feature_cols = [col for col in candidate_features if col not in dropped]
skipped_features = [col for col in candidate_features if col in dropped]
if skipped_features:
    print(f"Features excluded because they are in drop_cols: {skipped_features}")

# relevant cols are features + sp_cols + target col, thus all relevant columns
relevant_cols = feature_cols + sp_cols

data_dict["data"].update(relevant_cols=relevant_cols)

In [37]:
# sanity check: every column registered as relevant must exist in the working frame.
# Use a named variable instead of `_`, which IPython reserves for the last cell output;
# assigning to it stops IPython from updating it.
missing_cols = [col for col in data_dict["data"]["relevant_cols"] if col not in df_w.columns]
assert not missing_cols, f"Relevant cols has columns not pertaining to df: {missing_cols}"

In [38]:
# log returns
# Transform every relevant column except the ff factors, as these are returns already.
ff_factor_cols = data_dict["data"]["df_cols"]["ff_factors5"]
log_return_cols = [col for col in data_dict["data"]["relevant_cols"] if col not in ff_factor_cols]

df_w, dist, log = df_log_return(df_w, cols=log_return_cols)

# Keep both transformation records returned by df_log_return alongside the data.
data_dict["data_transformation"].update(dist_translation=dist, log_returns=log)

In [39]:
# test normality & stationarity on all relevant columns, storing each result in data_dict
tested_cols = data_dict["data"]["relevant_cols"]
stest = StatsTest(plot=False)

stationarity = stest.df_test_stationarity(df_w, tested_cols)
data_dict["data_stests"]["stationarity"] = stationarity

normality = stest.df_test_normality(df_w, cols=tested_cols)
data_dict["data_stests"]["normality"] = normality

# show the per-column stationarity result as the cell output
stationarity

{'vixo': True,
 'naaim_ind': True,
 'naaim_max': True,
 'naaim_q1': True,
 'ff_M_RF': True,
 'ff_SMB': True,
 'ff_HML': True,
 'ff_RMW': True,
 'ff_CMA': True,
 'termspread': True,
 'pc_ratio': True,
 'aaii_Bullish': True,
 'aaii_Bearish': True,
 'aaii_Neutral': True,
 'sp_close': True,
 'sp_close_lead14': True,
 'sp_close_lead13': True,
 'sp_close_lead12': True,
 'sp_close_lead11': True,
 'sp_close_lead10': True,
 'sp_close_lead9': True,
 'sp_close_lead8': True,
 'sp_close_lead7': True,
 'sp_close_lead6': True,
 'sp_close_lead5': True,
 'sp_close_lead4': True,
 'sp_close_lead3': True,
 'sp_close_lead2': True,
 'sp_close_lead1': True,
 'sp_close_lag1': True,
 'sp_close_lag2': True,
 'sp_close_lag3': True,
 'sp_close_lag4': True,
 'sp_close_lag5': True,
 'sp_close_lag6': True,
 'sp_close_lag7': True}

# Saving

In [40]:
# Display the active drop list; the save cell below chooses its output file names by comparing against it.
drop_cols

['fears25', 'fears30', 'fears35']

In [41]:
# Persist the transformed frame (df_w), the untransformed weekly snapshot (df_w_notrans)
# and the metadata dict under a file-name tag determined by which columns were dropped.
# The lookup table replaces four copy-pasted if/elif branches; tags and file names are unchanged.
SAVE_TAG_BY_DROP_COLS = {
    ('fears25', 'fears30', 'fears35', 'goog_sent'): "660",
    ('fears25',
     'fears30',
     'fears35',
     'goog_sent',
     'Call',
     'Put',
     'Total',
     'pc_ratio'): "760",
    ('fears25', 'fears30', 'fears35'): "440",
    ('fears25',
     'fears30',
     'fears35',
     'goog_sent',
     'Call',
     'Put',
     'Total',
     'pc_ratio',
     'Mean/Average',
     'naaim_max',
     'naaim_q1',
     'Quart 2 (median)',
     'Quart 3 (25% at/above)',
     'Most Bullish Response',
     'naaim_std',
     'naaim_ind',
     'S&P 500'): "1560",
}

# Lists are unhashable, so key on a tuple; `or ()` also covers drop_cols being None.
save_tag = SAVE_TAG_BY_DROP_COLS.get(tuple(drop_cols or ()))

if save_tag is None:
    # Previously an unmatched drop_cols fell through every branch and nothing was written,
    # without any message. Make the skipped save visible.
    print(f"WARNING: nothing saved - drop_cols {drop_cols!r} matches no known dataset variant")
else:
    print(f"Saving as df_weekly_{save_tag}")
    save_file(df_w, file_name=f"df_weekly_{save_tag}.csv", file_path=WORK_DATA_DIR, index=False)
    save_file(df_w_notrans, file_name=f"df_weekly_no_trans_{save_tag}.csv", file_path=WORK_DATA_DIR, index=False)
    save_file(data_dict, file_name=f"dict_weekly_{save_tag}.pkl", file_path=WORK_DATA_DIR, index=False)

Saving as df_weekly_440
