# **Testing the following process: Loading, Merging, Wrangling and Consolidating Data**

- Load the datasets (SPY, features from yf and FRED)
- Drop the first 2 rows of the SPY file and of each feature file loaded from yf
- Rename the first column to 'Date' and convert it to datetime
- Set 'Date' as index
- Convert every column to a numeric dtype (`pd.to_numeric`, with unparseable values coerced to NaN)
- Drop the unnecessary columns for FRED datasets 
- Keep only the 'Close' price columns for assets (not SPY data) 

In [20]:
# Necessary libraries

import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from pathlib import Path

In [21]:
# Adjust directories (sources and outputs)

load_dotenv()

# Fail fast with a message that names the missing variable(s): os.getenv()
# returns None for an unset variable, and Path(None) would otherwise raise an
# opaque TypeError that does not say which setting is absent.
_missing_env = [
    var for var in ("RAW_DATA_PATH", "PROCESSED_DATA_PATH") if os.getenv(var) is None
]
if _missing_env:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the .env file or in the shell before running this notebook."
    )

# Directory the raw CSV files are read from.
raw_data_path = Path(os.getenv("RAW_DATA_PATH"))
# Output directory (not used in the cells visible here).
processed_data_path = Path(os.getenv("PROCESSED_DATA_PATH"))

# SPY Data

In [22]:
# Main feature - S&P 500 ETF (SPY)

spy = pd.read_csv(raw_data_path / 'SPY_raw_data.csv', header = 0)

# Drop the first two rows below the header line (presumably leftover header
# rows from the download - verify against the raw CSV), then treat the first
# column as the date index.
spy = spy.iloc[2:].reset_index(drop = True)
spy = spy.rename(columns = {spy.columns[0]: 'Date'})
spy['Date'] = pd.to_datetime(spy['Date'])
spy = spy.set_index('Date')

# Convert every remaining column to a numeric dtype; values that cannot be
# parsed become NaN instead of raising.
spy = spy.apply(pd.to_numeric, errors = 'coerce')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
spy.info()
print("--" * 30)
spy.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5256 entries, 2005-01-03 to 2025-11-20
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   5256 non-null   float64
 1   High    5256 non-null   float64
 2   Low     5256 non-null   float64
 3   Open    5256 non-null   float64
 4   Volume  5256 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 246.4 KB
None
------------------------------------------------------------


Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-01-03,81.847115,82.840437,81.57497,82.704362,55748000
2005-01-04,80.847,82.010413,80.581661,81.955983,69167600
2005-01-05,80.289101,81.132744,80.282296,80.785759,65667300
2005-01-06,80.697327,81.064721,80.459202,80.581667,47814700
2005-01-07,80.58168,81.119164,80.370766,80.94227,55847700


# Financial features (other assets) Function

In [23]:
# Function to load and clean CSV files 

def load_and_clean_tickers(path):
    """Load one asset CSV and return its 'Close' series as a date-indexed frame.

    The file is read with its first line as the header. The first two rows
    below the header are discarded, the first column is renamed to 'Date',
    parsed as datetimes and used as the index, and every remaining column is
    converted with ``pd.to_numeric`` (unparseable values become NaN).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named 'Close', indexed by 'Date'.
    """
    raw = pd.read_csv(path, header=0)

    # Captured before chaining so the rename targets the file's first column.
    first_col = raw.columns[0]

    prices = (
        raw.iloc[2:]
        .reset_index(drop=True)
        .rename(columns={first_col: 'Date'})
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
        .apply(pd.to_numeric, errors='coerce')
    )

    return prices['Close'].to_frame()

In [24]:
# Invoke the function

# Short asset name -> raw CSV file name inside raw_data_path.
tickers = dict(
    vix="VIX_raw_data.csv",
    gold="Gold_raw_data.csv",
    oil="CrudeOil_raw_data.csv",
    tlt="TLT_raw_data.csv",
    rsp="RSP_raw_data.csv",
    tnx="TNX_raw_data.csv",
    iwm="IWM_raw_data.csv",
    dxy="DXY_raw_data.csv",
)

# One cleaned single-column ('Close') DataFrame per asset, keyed by asset name.
data_fin = {
    asset: load_and_clean_tickers(raw_data_path / csv_name)
    for asset, csv_name in tickers.items()
}

In [25]:
# Print a dtype / non-null overview and show the first rows of every asset frame.
for name, frame in data_fin.items():

    print(f"{name} info:")
    # info() prints its report itself and returns None; wrapping it in print()
    # only adds a stray "None" line to the output.
    frame.info()
    print("--" * 30)

    # A bare expression inside a loop is never rendered by the notebook, so
    # the preview has to be displayed explicitly.
    display(frame.head())

vix info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5256 entries, 2005-01-03 to 2025-11-20
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   5256 non-null   float64
dtypes: float64(1)
memory usage: 82.1 KB
None
------------------------------------------------------------
gold info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5251 entries, 2005-01-03 to 2025-11-20
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   5251 non-null   float64
dtypes: float64(1)
memory usage: 82.0 KB
None
------------------------------------------------------------
oil info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5255 entries, 2005-01-03 to 2025-11-20
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   5255 non-null   float64
dtypes: float64(1)
memory usage: 82.1 KB
None
----

# FRED Data Function

In [26]:
# Function to load and clean CSV files from FRED sources

def load_csv_FRED(path, column):
    """Read a FRED CSV and keep a single series, indexed by date.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; it must contain a 'Date' column.
    column : str
        Name of the series column to keep.

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding ``column``, indexed by the parsed 'Date'.
    """
    fred_frame = pd.read_csv(path, parse_dates=['Date'], index_col='Date')

    # List selector keeps the result a DataFrame rather than a Series.
    return fred_frame.loc[:, [column]]

In [27]:
# Invoke the function

# Short series name -> (raw CSV file name, column to keep from that file).
datasets_FRED = dict(
    baa10yc=("Baa_Corporate_to_10_Yield.csv", "BAA10Y"),
    corp710y=("Corporate_Bond_710_raw_data.csv", "BAMLC4A0C710YEY"),
    nfci=("NFCI_fin_condition_raw_data.csv", "NFCI"),
    str_index=("STLFSI4_Stress_raw_data.csv", "STLFSI4"),
    t5yie=("T5YIE_Breakeven_raw_data.csv", "T5YIE"),
    t10y2y=("T10Y_minus_2Y_raw_data.csv", "T10Y2Y"),
    t10y3m=("T10Y_minus_3M_raw_data.csv", "T10Y3M"),
    effr=("EFFR_funds_rates_raw_data.csv", "EFFR"),
    high_yield=("High_Yield_raw_data.csv", "BAMLH0A0HYM2"),
)

# One single-column DataFrame per FRED series, keyed by the short name.
data_FRED = {
    series: load_csv_FRED(raw_data_path / csv_name, fred_column)
    for series, (csv_name, fred_column) in datasets_FRED.items()
}

In [28]:
# Print a dtype / non-null overview and show the first rows of every FRED frame.
for name, frame in data_FRED.items():

    print(f"{name} info:")
    # info() prints its report itself and returns None; wrapping it in print()
    # only adds a stray "None" line to the output.
    frame.info()
    print("--" * 30)

    # A bare expression inside a loop is never rendered by the notebook, so
    # the preview has to be displayed explicitly.
    display(frame.head())

baa10yc info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5450 entries, 2005-01-03 to 2025-11-21
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BAA10Y  5223 non-null   float64
dtypes: float64(1)
memory usage: 85.2 KB
None
------------------------------------------------------------
corp710y info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5519 entries, 2005-01-03 to 2025-11-21
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   BAMLC4A0C710YEY  5455 non-null   float64
dtypes: float64(1)
memory usage: 86.2 KB
None
------------------------------------------------------------
nfci info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1090 entries, 2005-01-07 to 2025-11-21
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   NFCI    1090 non-null   float64
dtypes: float6