# notebooks/02_feature_engineering.ipynb

In [4]:
import sys, os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Make the sibling ../scripts directory importable (resolved to an absolute path
# from the current working directory) so the project-local data_loader module
# can be imported below.
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
from data_loader import create_panel_data

## Load Panel Data

In [5]:
# Build the panel via the project loader, requesting the '5_min' frequency.
panel_df = create_panel_data(frequency='5_min')

# Preview both ends of the panel; each label is followed by the frame on the next line.
print('Data Head: ', panel_df.head(), sep='\n')
print('\nData Tail: ', panel_df.tail(), sep='\n')

  stacked = df.stack(dropna=False)
  stacked = df.stack(dropna=False)
  stacked = df.stack(dropna=False)
  stacked = df.stack(dropna=False)


Data Head: 
                        rv       bpv      good       bad          rq
Date       Stock                                                    
2003-01-02 AAPL   6.493909  3.771960  5.102315  1.391595  152.729402
           AMGN   5.177506  3.635898  3.362921  1.814585   46.168213
           AMZN   9.886836  8.018926  6.074276  3.812559  200.751309
           AXP    4.448244  4.122573  3.337253  1.110991  114.695371
           BA     7.469396  7.016747  5.107386  2.362010  150.112379

Data Tail: 
                        rv       bpv      good       bad        rq
Date       Stock                                                  
2024-03-28 TRV    0.501900  0.450509  0.225921  0.275979  0.517683
           UNH    0.774552  0.762729  0.406755  0.367797  0.696658
           V      0.627872  0.445852  0.329977  0.297895  1.160977
           VZ     0.783853  0.706211  0.493097  0.290755  1.847938
           WMT    0.359440  0.336123  0.106481  0.252960  0.150975


  stacked = df.stack(dropna=False)


## Integrate VIX Data

In [9]:
# Date window (inclusive on both ends) used to trim the VIX history before joining.
START_DATE = '2003-01-02'
END_DATE = '2024-03-28'

vix_path = os.path.join('..', 'data', 'VIX_History.csv')
vix_data = pd.read_csv(vix_path)

# Parse the CSV's DATE strings (MM/DD/YYYY) into a datetime index named 'Date'
# so it can be matched against the 'Date' level of the panel's index.
vix_data['Date'] = pd.to_datetime(vix_data['DATE'], format='%m/%d/%Y')
vix_data = vix_data.set_index('Date')
vix = vix_data[['CLOSE']].rename(columns={'CLOSE': 'vix'})

# Sort before label-slicing: string-based date slicing is only reliable on a
# monotonic DatetimeIndex, and the file's row order is not guaranteed here.
vix = vix.sort_index().loc[START_DATE:END_DATE]

# A repeated date in the VIX file would duplicate panel rows in the join below
# and silently corrupt the per-stock shifts computed later, so fail loudly.
assert vix.index.is_unique, 'VIX_History.csv contains duplicate dates'

# Left join on the panel's 'Date' index level: every (Date, Stock) row receives
# that day's VIX close; the panel's own rows and order are preserved.
panel_vix = panel_df.join(vix, on='Date')

# Panel dates with no VIX observation come through as NaN; report them rather
# than letting the gaps propagate unnoticed into downstream features.
n_missing_vix = int(panel_vix['vix'].isna().sum())
if n_missing_vix:
    print(f'Warning: {n_missing_vix} panel rows have no matching VIX value.')

print('Panel with VIX: ')
print(panel_vix.head())

Panel with VIX: 
                        rv       bpv      good       bad          rq    vix
Date       Stock                                                           
2003-01-02 AAPL   6.493909  3.771960  5.102315  1.391595  152.729402  25.39
           AMGN   5.177506  3.635898  3.362921  1.814585   46.168213  25.39
           AMZN   9.886836  8.018926  6.074276  3.812559  200.751309  25.39
           AXP    4.448244  4.122573  3.337253  1.110991  114.695371  25.39
           BA     7.469396  7.016747  5.107386  2.362010  150.112379  25.39


## Define Target Variable (Y_reg)

In [11]:
# Work on an index-sorted copy of the VIX-augmented panel so that within each
# stock the rows are in index order before shifting.
df = panel_vix.sort_index()

# Regression target: for every stock, the rv value of that stock's next row.
# shift(-1) leaves the final row of each stock without a target (NaN).
next_row_rv = df.groupby('Stock')['rv'].shift(-1)
df['Y_reg'] = next_row_rv

print('Example for AAPL: ')
aapl_preview = df.loc[pd.IndexSlice[:, 'AAPL'], ['rv', 'Y_reg']]
display(aapl_preview.tail())

Example for AAPL: 


Unnamed: 0_level_0,Unnamed: 1_level_0,rv,Y_reg
Date,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-03-22,AAPL,1.216548,0.682342
2024-03-25,AAPL,0.682342,0.42599
2024-03-26,AAPL,0.42599,0.959378
2024-03-27,AAPL,0.959378,0.563307
2024-03-28,AAPL,0.563307,


## Engineer Features (X)

In [None]:
# Engineering HAR Features
#
# Every feature is derived from values shifted by one row within each stock, so
# a row's features never include that row's own rv.

# One-row lag of rv within each stock.
df['rv_lag_1'] = df.groupby('Stock')['rv'].shift(1)

# Rolling means of the lagged rv over 5 and 22 rows, computed per stock.
# min_periods=1 means early rows average over however many lagged values exist.
# NOTE(review): confirm partial windows at the start of each stock are intended.
for window in (5, 22):
    rolled_mean = df.groupby('Stock')['rv_lag_1'].rolling(window=window, min_periods=1).mean()
    # groupby-rolling prepends the group key to the index; drop it so the result
    # aligns with df's own index on assignment.
    df[f'rv_rolling_{window}'] = rolled_mean.reset_index(level=0, drop=True)

# One-row lags of the remaining measures, again within each stock.
for source_col in ('bpv', 'good', 'bad', 'rq', 'vix'):
    df[f'{source_col}_lag_1'] = df.groupby('Stock')[source_col].shift(1)

# Small constant added to denominators to avoid division by exactly zero.
# NOTE(review): a zero denominator then yields a very large ratio instead of NaN.
epsilon = 1e-10
df['bad_good_ratio_lag_1'] = df['bad_lag_1'].div(df['good_lag_1'] + epsilon)
rv_minus_bpv = df['rv_lag_1'] - df['bpv_lag_1']
df['jump_ratio_lag_1'] = rv_minus_bpv / (df['rv_lag_1'] + epsilon)
display(df.tail())