# Exploring OLS models before pipelining

 Wrap preprocessing + estimator in an sklearn Pipeline and dump with joblib.

 Store model files under models/ but track them with DVC or MLflow.

 Use Papermill or nbconvert to transform exploratory notebooks into parameterised, non-interactive steps.

 Orchestrate regular re-training with Prefect or similar.

 Preserve Python environment specs next to the artefact.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from pathlib import Path     
import models 

file_path = Path("/home/user/data/sector_10.parquet")

file_path.name  # → "sector_10.parquet"

file_path.stem  # → "sector_10"

`Path.glob(pattern)`: yields the paths inside the directory that match the given pattern (e.g. `sector_*.parquet`).

In [21]:
# Sector files are read from the "data" directory that sits next to the
# current working directory.
data_dir = Path.cwd().parent / "data"

# Map sector code -> DataFrame, e.g. "sector_10.parquet" is stored under "10".
# - sorted(): Path.glob yields matches in a filesystem-dependent order, so
#   sorting keeps the dict's insertion order reproducible across runs.
# - The slice drops only the leading "sector_" prefix (guaranteed by the glob
#   pattern); str.replace would also strip any later "sector_" in the stem.
df_dict = {
    file.stem[len("sector_"):]: pd.read_parquet(file)
    for file in sorted(data_dir.glob("sector_*.parquet"))
}

In [19]:
# Inspect the frame stored under key '10' (the notebook renders the bare expression).
df_dict['10']

Unnamed: 0_level_0,vol,ret,shrout,prc,askhi,bidlo,put_volume,call_volume,put_call_ratio,vix_close,...,cma,rf,enhanced_baker,news_sent,mktcap,turn_sd,sect_mktcap,mvel1,dolvol,daily_illq
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-01-02,1.420280e+06,0.003287,1.013942e+06,62.745296,63.068083,61.909911,517.261250,334.857022,1.847545,23.420000,...,-0.0014,0.00021,2.396,0.27,6.448867e+07,3.072879,6.362010e+07,17.982000,8.911586e+07,0.000037
1998-01-05,2.406067e+06,-0.021214,1.019566e+06,61.345891,62.921670,60.701696,530.978253,660.178316,1.212341,24.360001,...,-0.0001,0.00021,2.396,0.25,6.375211e+07,3.072879,6.254618e+07,17.970513,1.476023e+08,0.000144
1998-01-06,3.054767e+06,-0.034761,1.018568e+06,59.217327,60.627729,58.551701,1616.712030,1889.734468,1.173952,25.660000,...,0.0013,0.00021,2.396,0.25,6.141981e+07,3.072879,6.031685e+07,17.933243,1.808951e+08,0.000192
1998-01-07,3.424121e+06,0.024678,1.023829e+06,60.827941,61.007635,58.900709,698.726996,1089.622229,0.860878,25.070000,...,0.0031,0.00021,2.396,0.26,6.361143e+07,3.072879,6.227738e+07,17.968304,2.082822e+08,0.000118
1998-01-08,2.111215e+06,-0.026138,1.026307e+06,59.194500,60.581523,58.992218,803.304360,1392.336539,0.953283,26.010000,...,0.0029,0.00021,2.396,0.22,6.219968e+07,3.072879,6.075174e+07,17.945860,1.249723e+08,0.000209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-21,2.347036e+07,-0.010369,1.883786e+06,66.738612,68.679728,66.089400,11076.968965,11395.992407,1.101672,30.110001,...,0.0064,0.00011,2.409,-0.04,1.307736e+08,3.072879,1.257212e+08,18.688979,1.566379e+09,0.000007
2018-12-24,7.596090e+06,-0.040239,1.888364e+06,64.208265,66.537325,64.152997,8901.756413,12131.554525,0.944079,36.070000,...,-0.0034,0.00011,2.409,-0.04,1.263795e+08,3.072879,1.212486e+08,18.654799,4.877318e+08,0.000083
2018-12-26,1.292724e+07,0.062713,1.874340e+06,68.235958,68.260304,63.652624,8887.402809,17054.673252,0.715864,30.410000,...,-0.0108,0.00011,2.409,-0.08,1.324008e+08,3.072879,1.278974e+08,18.701344,8.821027e+08,0.000071
2018-12-27,1.189542e+07,0.006472,1.873945e+06,68.997290,69.016614,66.134674,5811.118599,12469.599469,0.656975,29.959999,...,0.0010,0.00011,2.409,-0.07,1.337375e+08,3.072879,1.292971e+08,18.711390,8.207516e+08,0.000008


# OLS baseline

## Assumption 1: Linearity of the response–predictor relationships

Check with:
- Residuals vs. fitted values plot
- Ramsey RESET test


In [22]:
# List the column names of the frame stored under key '10'.
df_dict['10'].columns

Index(['vol', 'ret', 'shrout', 'prc', 'askhi', 'bidlo', 'put_volume',
       'call_volume', 'put_call_ratio', 'vix_close', 'turn', 'baspread',
       'mktrf', 'smb', 'hml', 'rmw', 'umd', 'cma', 'rf', 'enhanced_baker',
       'news_sent', 'mktcap', 'turn_sd', 'sect_mktcap', 'mvel1', 'dolvol',
       'daily_illq', 'excess_ret', 'excess_mkt_ret'],
      dtype='object')

In [None]:
# Candidate regressors for the OLS baseline on the sector '10' frame.
# NOTE(review): 'ret' is used as a predictor while the target below is
# 'excess_ret' — presumably derived from 'ret'; confirm this does not leak
# the target into the regressors.
features = [
    "vol",
    "ret",
    "shrout",
    "prc",
    "askhi",
    "bidlo",
    "put_volume",
    "call_volume",
]

# Response variable: the 'excess_ret' column of the same frame.
y = df_dict["10"]["excess_ret"]

In [None]:
# NOTE(review): `ols_sm` is not bound by any import visible in this notebook
# (only `import models`) — presumably this should be `models.ols_sm`; confirm.
# The call passes only the DataFrame and not the `features` / `y` defined in
# the previous cell — TODO complete the arguments once the signature is settled.
ols_sm(df_dict['10'],)