In [31]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path

In [32]:
# Adjust directories (sources and outputs)

load_dotenv()

raw_data_path = os.getenv("RAW_DATA_PATH")
processed_data_path = os.getenv("PROCESSED_DATA_PATH")
prepared_data_path = os.getenv("PREPARED_DATA_PATH")
out_object_path = os.getenv("OUT_OBJECTS_PATH")

raw_data_path = Path(raw_data_path)
processed_data_path = Path(processed_data_path)
prepared_data_path = Path(prepared_data_path)
out_object_path = Path(out_object_path)

In [33]:
# Load SPY (Close Price)

spy = pd.read_csv(raw_data_path / 'SPY_raw_data.csv', header = 0)

spy = spy.iloc[2:].reset_index(drop = True)
spy = spy.rename(columns = {spy.columns[0]: 'Date'})
spy['Date'] = pd.to_datetime(spy['Date'])
spy = spy.set_index('Date')
spy = spy.apply(pd.to_numeric, errors = 'coerce')

# Close Price

spy_close = spy['Close']

print(spy_close.info())
print("--" * 30)

spy_close.head()

<class 'pandas.core.series.Series'>
DatetimeIndex: 5256 entries, 2005-01-03 to 2025-11-20
Series name: Close
Non-Null Count  Dtype  
--------------  -----  
5256 non-null   float64
dtypes: float64(1)
memory usage: 82.1 KB
None
------------------------------------------------------------


Date
2005-01-03    81.847115
2005-01-04    80.847000
2005-01-05    80.289101
2005-01-06    80.697327
2005-01-07    80.581680
Name: Close, dtype: float64

In [34]:
# Load the Features Data

complete_features = pd.read_csv(processed_data_path / 'features_processed_data.csv')

print(complete_features.info())
print("--" * 30)

complete_features.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5266 entries, 0 to 5265
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             5266 non-null   object 
 1   vix              5256 non-null   float64
 2   gold             5251 non-null   float64
 3   oil              5255 non-null   float64
 4   tlt              5256 non-null   float64
 5   rsp              5256 non-null   float64
 6   tnx              5251 non-null   float64
 7   iwm              5256 non-null   float64
 8   dxy              5262 non-null   float64
 9   BAA10Y           5217 non-null   float64
 10  BAMLC4A0C710YEY  5264 non-null   float64
 11  NFCI             5262 non-null   float64
 12  STLFSI4          5262 non-null   float64
 13  T5YIE            5219 non-null   float64
 14  T10Y2Y           5219 non-null   float64
 15  T10Y3M           5219 non-null   float64
 16  EFFR             5217 non-null   float64
 17  BAMLH0A0HYM2  

Unnamed: 0,Date,vix,gold,oil,tlt,rsp,tnx,iwm,dxy,BAA10Y,BAMLC4A0C710YEY,NFCI,STLFSI4,T5YIE,T10Y2Y,T10Y3M,EFFR,BAMLH0A0HYM2
0,2005-01-03,14.08,428.700012,42.119999,45.395805,27.763653,4.22,48.436901,81.300003,1.86,4.94,,,2.62,1.13,1.91,2.31,3.06
1,2005-01-04,13.98,428.5,43.91,44.920033,27.376051,4.283,47.398598,82.57,1.85,5.01,,,2.62,1.09,1.96,2.25,2.97
2,2005-01-05,14.09,426.600006,43.389999,45.1605,27.165127,4.277,46.451237,82.540001,1.83,5.0,,,2.6,1.07,1.96,2.25,3.04
3,2005-01-06,13.58,421.0,45.560001,45.191257,27.246248,4.272,46.697548,83.150002,1.84,5.0,,,2.59,1.11,1.98,2.25,3.09
4,2005-01-07,13.49,418.899994,45.43,45.293507,27.174141,4.285,46.178391,83.610001,1.83,5.02,-0.69875,-0.7361,2.57,1.09,1.97,2.24,3.07


In [35]:
# Load the Technical Indicators data

complete_technicals = pd.read_csv(processed_data_path / 'technical_indicators_processed_data.csv')

print(complete_technicals.info())
print("--" * 30)

complete_technicals.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5256 entries, 0 to 5255
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    5256 non-null   object 
 1   High                    5256 non-null   float64
 2   Low                     5256 non-null   float64
 3   Open                    5256 non-null   float64
 4   Volume                  5256 non-null   int64  
 5   Scaled_SMA50            5207 non-null   float64
 6   Scaled_SMA200           5057 non-null   float64
 7   Scaled_EMA50            5256 non-null   float64
 8   Scaled_EMA200           5256 non-null   float64
 9   Scaled_HMA50            5201 non-null   float64
 10  Scaled_HMA200           5044 non-null   float64
 11  Momentum_20p            5236 non-null   float64
 12  Momentum_100p           5156 non-null   float64
 13  RSI                     5242 non-null   float64
 14  Stoch_K                 5243 non-null   

Unnamed: 0,Date,High,Low,Open,Volume,Scaled_SMA50,Scaled_SMA200,Scaled_EMA50,Scaled_EMA200,Scaled_HMA50,...,Stoch_D,WilliamsR,Norm_ATR,Scaled_Upper_Bollinger,Scaled_Lower_Bollinger,Scaled_Upper_Keltner,Scaled_Lower_Keltner,OBV,Anchored_VWAP,ILV
0,2005-01-03,82.840437,81.57497,82.704362,55748000,,,0.0,0.0,,...,,,,,,,,0.0,,0.015394
1,2005-01-04,82.010413,80.581661,81.955983,69167600,,,-0.960894,-0.990163,,...,,,,,,,,-69167600.0,81.146358,0.017575
2,2005-01-05,81.132744,80.282296,80.785759,65667300,,,-1.459233,-1.532659,,...,,,,,,,,-134834900.0,80.864709,0.010538
3,2005-01-06,81.064721,80.459202,80.581667,47814700,,,-1.009791,-1.113245,,...,,,,,,,,-87020200.0,80.832171,0.007498
4,2005-01-07,81.119164,80.370766,80.94227,55847700,,,-1.081303,-1.216663,,...,,,,,,,,-142867900.0,80.799005,0.009269


In [36]:
# Merge function --> Join the 3 dataframes

def merge_time_series(spy_close, complete_features, complete_technicals):

    # Make sure the index are date_time format

    spy_close.index = pd.to_datetime(spy_close.index)
    spy_close = pd.DataFrame(spy_close)

    complete_features = complete_features.copy()
    complete_technicals = complete_technicals.copy()

    if 'Date' in complete_features.columns:
        complete_features['Date'] = pd.to_datetime(complete_features['Date'])
        complete_features = complete_features.set_index('Date')

    if 'Date' in complete_technicals.columns:
        complete_technicals['Date'] = pd.to_datetime(complete_technicals['Date'])
        complete_technicals = complete_technicals.set_index('Date')

    # Rename Close columns

    spy_close = spy_close.rename(columns = {'Close': 'Close_SPY'})

    # Merge the 3 dataframes using left join

    df_final = (
        complete_features
        .join(spy_close, how = "left")
        .join(complete_technicals, how = "left")
    )

    # Order by data and make sure the index

    df_final = df_final.sort_index()
    df_final.index.name = "Date"

    return df_final

In [37]:
# Applying the merge function

df_final = merge_time_series(spy_close, complete_features, complete_technicals)

In [38]:
# Validate infromation

df_final.info()
print("--" * 30)
df_final.head(10)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5266 entries, 2005-01-03 to 2025-11-20
Data columns (total 42 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   vix                     5256 non-null   float64
 1   gold                    5251 non-null   float64
 2   oil                     5255 non-null   float64
 3   tlt                     5256 non-null   float64
 4   rsp                     5256 non-null   float64
 5   tnx                     5251 non-null   float64
 6   iwm                     5256 non-null   float64
 7   dxy                     5262 non-null   float64
 8   BAA10Y                  5217 non-null   float64
 9   BAMLC4A0C710YEY         5264 non-null   float64
 10  NFCI                    5262 non-null   float64
 11  STLFSI4                 5262 non-null   float64
 12  T5YIE                   5219 non-null   float64
 13  T10Y2Y                  5219 non-null   float64
 14  T10Y3M                

Unnamed: 0_level_0,vix,gold,oil,tlt,rsp,tnx,iwm,dxy,BAA10Y,BAMLC4A0C710YEY,...,Stoch_D,WilliamsR,Norm_ATR,Scaled_Upper_Bollinger,Scaled_Lower_Bollinger,Scaled_Upper_Keltner,Scaled_Lower_Keltner,OBV,Anchored_VWAP,ILV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,14.08,428.700012,42.119999,45.395805,27.763653,4.22,48.436901,81.300003,1.86,4.94,...,,,,,,,,0.0,,0.015394
2005-01-04,13.98,428.5,43.91,44.920033,27.376051,4.283,47.398598,82.57,1.85,5.01,...,,,,,,,,-69167600.0,81.146358,0.017575
2005-01-05,14.09,426.600006,43.389999,45.1605,27.165127,4.277,46.451237,82.540001,1.83,5.0,...,,,,,,,,-134834900.0,80.864709,0.010538
2005-01-06,13.58,421.0,45.560001,45.191257,27.246248,4.272,46.697548,83.150002,1.84,5.0,...,,,,,,,,-87020200.0,80.832171,0.007498
2005-01-07,13.49,418.899994,45.43,45.293507,27.174141,4.285,46.178391,83.610001,1.83,5.02,...,,,,,,,,-142867900.0,80.799005,0.009269
2005-01-10,13.23,419.100006,45.330002,45.36515,27.228224,4.278,46.648277,83.290001,1.82,5.01,...,,,,,,,,-86304600.0,80.821683,0.00942
2005-01-11,13.19,421.899994,45.68,45.631142,27.087603,4.244,46.079861,83.019997,1.81,4.98,...,,,,,,,,-149404300.0,80.763009,0.006336
2005-01-12,12.56,426.100006,46.369999,45.692535,27.222818,4.236,46.269344,82.139999,1.81,4.98,...,,,,,,,,-76683800.0,80.71748,0.01117
2005-01-13,12.84,424.5,48.040001,46.132523,27.037125,4.187,46.129131,82.510002,1.8,4.93,...,,,,,,,,-132221300.0,80.663909,0.010414
2005-01-14,12.43,422.700012,48.380001,46.127331,27.271496,4.216,46.568707,83.059998,1.79,4.96,...,,,,,,,,-90188800.0,80.643119,0.006517


In [39]:
print(df_final.isnull().sum())

vix                        10
gold                       15
oil                        11
tlt                        10
rsp                        10
tnx                        15
iwm                        10
dxy                         4
BAA10Y                     49
BAMLC4A0C710YEY             2
NFCI                        4
STLFSI4                     4
T5YIE                      47
T10Y2Y                     47
T10Y3M                     47
EFFR                       49
BAMLH0A0HYM2                2
Close_SPY                  10
High                       10
Low                        10
Open                       10
Volume                     10
Scaled_SMA50               59
Scaled_SMA200             209
Scaled_EMA50               10
Scaled_EMA200              10
Scaled_HMA50               65
Scaled_HMA200             222
Momentum_20p               30
Momentum_100p             110
RSI                        24
Stoch_K                    23
Stoch_D                    25
WilliamsR 

In [41]:
df_final

Unnamed: 0_level_0,vix,gold,oil,tlt,rsp,tnx,iwm,dxy,BAA10Y,BAMLC4A0C710YEY,...,Stoch_D,WilliamsR,Norm_ATR,Scaled_Upper_Bollinger,Scaled_Lower_Bollinger,Scaled_Upper_Keltner,Scaled_Lower_Keltner,OBV,Anchored_VWAP,ILV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-14,12.180000,468.200012,57.689999,47.143139,29.532074,4.604,50.660538,92.129997,1.83,5.60,...,96.669282,-4.782519,0.009621,-0.615643,4.789819,-0.329518,3.410509,4.200970e+08,81.999075,0.005174
2005-11-15,12.230000,468.100006,56.980000,47.450993,29.394024,4.557,49.925789,92.089996,1.83,5.54,...,93.143113,-12.195012,0.009586,-1.120789,4.404296,-0.778174,3.015637,3.505045e+08,82.014176,0.009962
2005-11-16,12.260000,478.299988,57.880001,47.949867,29.468491,4.484,49.941105,92.330002,1.84,5.49,...,90.255562,-8.607848,0.009064,-1.068296,4.324288,-0.674793,3.021933,4.016375e+08,82.025031,0.004624
2005-11-17,11.250000,486.200012,56.330002,48.066677,29.837170,4.459,50.805973,91.879997,1.85,5.46,...,91.988001,-0.136084,0.008785,-0.596432,5.117632,0.015253,3.558497,4.573550e+08,82.038991,0.012188
2005-11-18,11.120000,485.600006,56.139999,47.843758,29.911619,4.502,51.196293,91.930000,1.86,5.51,...,95.616617,-1.930534,0.008437,-0.469945,5.176719,0.261896,3.600495,5.297922e+08,82.059730,0.007612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-14,19.830000,4087.600098,60.090000,88.870003,187.339996,4.148,237.479996,99.269997,1.77,4.99,...,51.152363,-56.305516,0.011657,-18.506590,8.595149,-18.260199,11.591236,1.588454e+10,175.251449,0.018508
2025-11-17,22.379999,4068.300049,59.910000,89.089996,184.880005,4.133,232.759995,99.589996,1.77,4.99,...,30.431609,-84.345516,0.012567,-24.601113,2.047751,-24.021319,6.576790,1.579408e+10,175.320130,0.017277
2025-11-18,24.690001,4061.300049,60.740002,89.059998,184.990005,4.123,233.470001,99.550003,1.79,4.98,...,22.436995,-87.529467,0.012996,-31.394279,-1.270446,-28.499696,2.477459,1.567961e+10,175.405808,0.014020
2025-11-19,23.660000,4077.699951,59.439999,88.879997,184.429993,4.133,233.429993,100.230003,1.79,4.99,...,17.431930,-79.994048,0.013057,-29.460056,2.720066,-25.530127,6.509889,1.577432e+10,175.477043,0.012956


In [42]:
# Data Imputation - Missing Values Handling


def impute_data(df):
    
    df = df.copy()

    # RegEx detectors (easy for the number of columns)

    market_keywords = [
        'vix','gold','oil','tlt','rsp','tnx','iwm','dxy',
        'BAA','BAMLC','NFCI','STLFSI','T5Y','T10Y','EFFR',
        'Close','High','Low','Open','Volume','OBV','VWAP','ILV'
    ]

    indicator_keywords = [
        'SMA','EMA','HMA','Momentum','RSI','Stoch',
        'Williams','ATR','Bollinger','Keltner'
    ]

    market_cols = [c for c in df.columns if any(k in c for k in market_keywords)]
    indicator_cols = [c for c in df.columns if any(k in c for k in indicator_keywords)]

    #  Market time series imputation (first with ffill and the bfill)

    df[market_cols] = df[market_cols].ffill().bfill()


    # Technical indicators imputation

    # We only impute isolated internal NA (not the initial natural NA)

    for c in indicator_cols:

        na_positions = df[c].isna()
        
        if na_positions.sum() == 0:

            continue
        
        # If there are internal NAs, ffill

        first_valid = df[c].first_valid_index()
        last_valid = df[c].last_valid_index()
        
        # Select internal NAs

        internal_nas = df.loc[first_valid:last_valid, c].isna()
        
        if internal_nas.sum() > 0:

            df[c] = df[c].ffill().bfill()

    return df, market_cols, indicator_cols

In [43]:
df_clean, market_cols, indicator_cols = impute_data(df_final)

print("Market series imputed:")
print(market_cols)

print("\nTechnical indicators impited:")
print(indicator_cols)

print("\nMissing value left:")
print(df_clean.isnull().sum())

Market series imputed:
['vix', 'gold', 'oil', 'tlt', 'rsp', 'tnx', 'iwm', 'dxy', 'BAA10Y', 'BAMLC4A0C710YEY', 'NFCI', 'STLFSI4', 'T5YIE', 'T10Y2Y', 'T10Y3M', 'EFFR', 'Close_SPY', 'High', 'Low', 'Open', 'Volume', 'Scaled_Lower_Bollinger', 'Scaled_Lower_Keltner', 'OBV', 'Anchored_VWAP', 'ILV']

Technical indicators impited:
['Scaled_SMA50', 'Scaled_SMA200', 'Scaled_EMA50', 'Scaled_EMA200', 'Scaled_HMA50', 'Scaled_HMA200', 'Momentum_20p', 'Momentum_100p', 'RSI', 'Stoch_K', 'Stoch_D', 'WilliamsR', 'Norm_ATR', 'Scaled_Upper_Bollinger', 'Scaled_Lower_Bollinger', 'Scaled_Upper_Keltner', 'Scaled_Lower_Keltner']

Missing value left:
vix                       0
gold                      0
oil                       0
tlt                       0
rsp                       0
tnx                       0
iwm                       0
dxy                       0
BAA10Y                    0
BAMLC4A0C710YEY           0
NFCI                      0
STLFSI4                   0
T5YIE                     0
T10Y