In [None]:
# 02. Data Cleaning & Feature Engineering

## Purpose
This notebook implements Phase 2 of the yield curve forecasting project: transforming raw daily yield curve and macroeconomic data into a clean, consistent, and well-engineered dataset suitable for supervised learning.

## Objectives
1. **Data Frequency Alignment** - Align all datasets to a common daily (business-day) frequency
2. **Gap Handling** - Forward-fill missing values and remove non-trading days  
3. **Yield Transformation** - Transform yields to continuously compounded rates if needed
4. **Yield Curve Features** - Generate slope, curvature, and PCA-based features
5. **Macro Feature Engineering** - Create lagged macro indicators without look-ahead bias
6. **Standardization** - Apply consistent z-score normalization
7. **Model-Ready Data** - Construct final feature matrix (X) and target matrix (Y)

## Expected Outputs
- Clean feature matrix (X) with engineered features
- Target matrix (Y) with future yield values
- Processed data saved to `../data/processed/` (relative to the notebook's directory)
- Documentation of all transformations applied

## Dependencies
- pandas, numpy, and scikit-learn for data manipulation and transformation; matplotlib and seaborn for plots
- Raw yield curve and macro data (generated if not available)
- Configuration parameters from config files

---


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
from pathlib import Path
import logging

# Machine learning libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# --- Notebook-wide configuration --------------------------------------------
# NOTE(review): this silences every warning category, including library
# deprecation notices raised by later cells.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Logging: INFO and above, with a module-level logger used by the generators.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# pandas display limits (no cap on columns, at most 100 rows).
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Environment banner so the run can be identified from its output.
run_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("✅ Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Date: {run_started_at}")

# Output folders are created up front so later save steps cannot fail on a
# missing directory.
for output_dir in ("../data/processed", "../data/features"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)


In [None]:
## 1. Data Loading and Generation

Since the raw data sources require API keys and proper configuration, we'll generate realistic sample data that represents:
- **Treasury Yield Curves**: Daily yields for tenors 1M, 3M, 6M, 1Y, 2Y, 3Y, 5Y, 7Y, 10Y, 20Y, 30Y
- **Macroeconomic Indicators**: Fed Funds Rate, CPI, PMI, Unemployment Rate, VIX, and other key indicators

This sample data will demonstrate the complete feature engineering pipeline that can be applied to real data.


In [None]:
def generate_realistic_yield_data(start_date='2000-01-01', end_date='2024-12-01', seed=42):
    """
    Generate a synthetic daily Treasury yield panel driven by three latent factors.

    For every tenor the yield is
    ``base_rate + level + tenor * slope + tenor * (30 - tenor) / 100 * curvature
    + noise``, floored at 0.01. Level, slope and curvature are each a sinusoid
    plus a Gaussian random walk over the sample.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds of the business-day calendar (``pd.bdate_range``).
    seed : int, default 42
        Seed of the private random generator. The default reproduces the
        values previously obtained via ``np.random.seed(42)``.

    Returns
    -------
    pandas.DataFrame
        One row per business day: a ``date`` column followed by the tenor
        columns 1M, 3M, 6M, 1Y, 2Y, 3Y, 5Y, 7Y, 10Y, 20Y, 30Y (in percent).

    Notes
    -----
    The macro generator in this notebook is also seeded with 42 and draws its
    fed-funds, VIX and TED-spread noise first, in that order. With equal seeds
    and equal date ranges those shocks are therefore exact scalar multiples of
    the level, slope and curvature shocks produced here. Pass a different
    ``seed`` when independent series are required.
    """
    # Business-day calendar
    date_range = pd.bdate_range(start=start_date, end=end_date, freq='B')
    n_days = len(date_range)

    # Tenors in years and their column labels
    tenors = [1/12, 3/12, 6/12, 1, 2, 3, 5, 7, 10, 20, 30]
    tenor_names = ['1M', '3M', '6M', '1Y', '2Y', '3Y', '5Y', '7Y', '10Y', '20Y', '30Y']

    # A private legacy generator yields the same stream as seeding the global
    # one, but leaves NumPy's global random state untouched for other code.
    rng = np.random.RandomState(seed)

    # Upward-sloping base curve, one value per tenor
    base_rates = np.array([0.5, 0.8, 1.2, 1.8, 2.5, 2.8, 3.2, 3.5, 3.8, 4.2, 4.5])

    # Normalised time in [0, 1] drives the deterministic cycles
    time_factor = np.linspace(0, 1, n_days)

    # Draw order matters for reproducibility: level, slope, curvature, then
    # one idiosyncratic noise vector per tenor.
    level_factor = 2.0 + 1.5 * np.sin(2 * np.pi * time_factor * 3) + \
                   np.cumsum(rng.normal(0, 0.01, n_days))

    slope_factor = 0.3 + 0.2 * np.sin(2 * np.pi * time_factor * 2) + \
                   np.cumsum(rng.normal(0, 0.005, n_days))

    curvature_factor = 0.1 * np.sin(2 * np.pi * time_factor * 4) + \
                       np.cumsum(rng.normal(0, 0.003, n_days))

    yields_data = {}

    for tenor, name, base_yield in zip(tenors, tenor_names, base_rates):
        level_loading = 1.0
        slope_loading = tenor  # loading grows linearly with maturity
        curvature_loading = tenor * (30 - tenor) / 100  # zero at 0 and 30 years

        yields = (base_yield +
                  level_loading * level_factor +
                  slope_loading * slope_factor +
                  curvature_loading * curvature_factor +
                  rng.normal(0, 0.05, n_days))  # idiosyncratic noise

        # Floor at 0.01 so no yield is zero or negative
        yields_data[name] = np.maximum(yields, 0.01)

    df_yields = pd.DataFrame(yields_data, index=date_range)
    df_yields.index.name = 'date'
    df_yields = df_yields.reset_index()

    logger.info(f"Generated {len(df_yields)} days of yield curve data")
    return df_yields

def generate_realistic_macro_data(start_date='2000-01-01', end_date='2024-12-01', seed=42):
    """
    Generate a synthetic panel of daily and monthly macroeconomic indicators.

    Daily series (fed funds rate, VIX, TED spread, 5Y/10Y breakevens) are a
    sinusoid plus a Gaussian random walk. Monthly series (CPI, core CPI,
    unemployment, ISM PMI, industrial production, payrolls change, consumer
    sentiment) are generated on month-start dates and forward-filled onto the
    business-day calendar, so each monthly value is repeated until the next
    month start.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds of the business-day (``'B'``) and month-start
        (``'MS'``) calendars.
    seed : int, default 42
        Seed of the private random generator. The default reproduces the
        values previously obtained via ``np.random.seed(42)``.

    Returns
    -------
    pandas.DataFrame
        One row per business day with a ``date`` column followed by the
        indicator columns. Business days that precede the first generated
        month start carry NaN in the monthly columns.

    Notes
    -----
    The yield generator in this notebook is seeded with 42 as well and draws
    its level, slope and curvature noise first. With equal seeds and equal
    date ranges the fed-funds, VIX and TED shocks here are exact scalar
    multiples of those factor shocks. Pass a different ``seed`` when
    independent series are required.
    """
    daily_range = pd.bdate_range(start=start_date, end=end_date, freq='B')
    monthly_range = pd.date_range(start=start_date, end=end_date, freq='MS')

    n_daily = len(daily_range)
    n_monthly = len(monthly_range)

    # A private legacy generator yields the same stream as seeding the global
    # one, but leaves NumPy's global random state untouched for other code.
    rng = np.random.RandomState(seed)

    # ---- Daily indicators (draw order is part of the reproducible output) ----
    daily_data = {}

    # Federal Funds Rate, floored at zero
    fed_funds = 2.0 + 3.0 * np.sin(np.linspace(0, 4*np.pi, n_daily)) + \
                np.cumsum(rng.normal(0, 0.02, n_daily))
    daily_data['fed_funds_rate'] = np.maximum(fed_funds, 0.0)

    # VIX, floored at 5
    vix = 20 + 10 * np.sin(np.linspace(0, 6*np.pi, n_daily)) + \
          np.cumsum(rng.normal(0, 0.5, n_daily))
    daily_data['vix'] = np.maximum(vix, 5.0)

    # TED spread, floored at zero
    ted_spread = 0.3 + 0.5 * np.sin(np.linspace(0, 5*np.pi, n_daily)) + \
                 np.cumsum(rng.normal(0, 0.01, n_daily))
    daily_data['ted_spread'] = np.maximum(ted_spread, 0.0)

    # 5Y and 10Y breakeven inflation (no floor applied)
    infl_5y = 2.0 + 0.5 * np.sin(np.linspace(0, 3*np.pi, n_daily)) + \
              np.cumsum(rng.normal(0, 0.01, n_daily))
    infl_10y = 2.2 + 0.4 * np.sin(np.linspace(0, 3*np.pi, n_daily)) + \
               np.cumsum(rng.normal(0, 0.008, n_daily))
    daily_data['breakeven_5y'] = infl_5y
    daily_data['breakeven_10y'] = infl_10y

    df_daily = pd.DataFrame(daily_data, index=daily_range)

    # ---- Monthly indicators --------------------------------------------------
    monthly_data = {}

    # CPI: exponential of cumulated monthly log growth (mean 0.2% per month)
    cpi_base = 250
    cpi_growth = np.cumsum(rng.normal(0.002, 0.001, n_monthly))
    cpi = cpi_base * np.exp(cpi_growth)
    monthly_data['cpi'] = cpi

    # Core CPI: CPI with a small multiplicative perturbation
    monthly_data['core_cpi'] = cpi * (1 + rng.normal(0, 0.001, n_monthly))

    # Unemployment rate, clipped to [2, 15]
    unemployment = 5.0 + 2.0 * np.sin(np.linspace(0, 2*np.pi, n_monthly)) + \
                   np.cumsum(rng.normal(0, 0.05, n_monthly))
    monthly_data['unemployment_rate'] = np.clip(unemployment, 2.0, 15.0)

    # ISM Manufacturing PMI, clipped to [30, 70]
    pmi = 52 + 5 * np.sin(np.linspace(0, 3*np.pi, n_monthly)) + \
          rng.normal(0, 2, n_monthly)
    monthly_data['ism_pmi'] = np.clip(pmi, 30, 70)

    # Industrial production index (mean log growth 0.15% per month)
    ip_growth = np.cumsum(rng.normal(0.0015, 0.003, n_monthly))
    monthly_data['industrial_production'] = 100 * np.exp(ip_growth)

    # Nonfarm payrolls: monthly change in thousands, mean 150
    monthly_data['payrolls_change'] = rng.normal(150, 50, n_monthly)

    # Consumer sentiment, clipped to [50, 120]
    sentiment = 85 + 15 * np.sin(np.linspace(0, 2*np.pi, n_monthly)) + \
                rng.normal(0, 5, n_monthly)
    monthly_data['consumer_sentiment'] = np.clip(sentiment, 50, 120)

    df_monthly = pd.DataFrame(monthly_data, index=monthly_range)

    # Forward-fill (not interpolate) onto business days: each day carries the
    # most recent month-start value.
    df_monthly_daily = df_monthly.reindex(daily_range, method='ffill')

    df_macro = pd.concat([df_daily, df_monthly_daily], axis=1)
    df_macro.index.name = 'date'
    df_macro = df_macro.reset_index()

    logger.info(f"Generated {len(df_macro)} days of macroeconomic data")
    return df_macro

# Build both synthetic panels using the generators' default date range.
print("🔄 Generating realistic yield curve data...")
df_yields = generate_realistic_yield_data()

print("🔄 Generating realistic macroeconomic data...")
df_macro = generate_realistic_macro_data()

print("✅ Data generation completed successfully!")
for frame_label, generated_frame in (("Yield", df_yields), ("Macro", df_macro)):
    print(f"{frame_label} data shape: {generated_frame.shape}")


In [None]:
## 2. Data Frequency Alignment & Gap Handling

This section aligns all datasets to a common daily frequency and handles missing values appropriately.


In [None]:
# Align datasets on common date index
print("🔄 Aligning yield curve and macro datasets...")

# Inner join keeps only the dates present in both panels. The explicit sort
# guarantees chronological order, which the forward-fill below depends on.
df_combined = (
    pd.merge(df_yields, df_macro, on='date', how='inner')
    .sort_values('date')
    .reset_index(drop=True)
)

print(f"Combined dataset shape: {df_combined.shape}")
print(f"Date range: {df_combined['date'].min()} to {df_combined['date'].max()}")

# Check for missing values
missing_values = df_combined.isnull().sum()
print("\n📊 Missing Values by Column:")
print(missing_values[missing_values > 0])

# Forward fill missing values (common for financial time series).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# Leading NaNs (no earlier observation) are left as NaN by design.
print("\n🔄 Forward-filling missing values...")
df_combined = df_combined.ffill()

# Check for any remaining missing values
remaining_missing = df_combined.isnull().sum().sum()
print(f"Remaining missing values: {remaining_missing}")

# Display basic statistics
print("\n📈 Basic Statistics:")
print(df_combined.describe().round(3))

# Display first few rows to verify data structure
print("\n📋 First 5 rows of combined dataset:")
print(df_combined.head())


In [None]:
## 3. Yield Transformation

Convert nominal yield values to continuously compounded rates for consistency in growth-based forecasting applications.


In [None]:
# Tenor columns to transform, and the names of their transformed counterparts
yield_columns = ['1M', '3M', '6M', '1Y', '2Y', '3Y', '5Y', '7Y', '10Y', '20Y', '30Y']
yield_cc_columns = [f'{col}_cc' for col in yield_columns]

# Snapshot of the untransformed yields for the comparison table below
df_original_yields = df_combined[yield_columns].copy()

print("🔄 Converting yields to continuously compounded rates...")

# Annual percentage yield -> continuously compounded rate:
#   r_cc = ln(1 + r_annual / 100)
for raw_name, cc_name in zip(yield_columns, yield_cc_columns):
    df_combined[cc_name] = np.log(1 + df_combined[raw_name] / 100)

# Stand-alone frame holding only the date and the transformed yields
df_yields_cc = df_combined[['date'] + yield_cc_columns].copy()

print("✅ Yield transformation completed")

# Visual check: top row shows the original series, bottom row the transformed
# one, with one column per plotted tenor.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

tenors_to_plot = ['2Y', '10Y']
for panel_col, tenor in enumerate(tenors_to_plot):
    ax_original = axes[0, panel_col]
    ax_transformed = axes[1, panel_col]

    ax_original.plot(df_combined['date'], df_combined[tenor], label='Original (%)', alpha=0.7)
    ax_original.set_title(f'{tenor} Treasury Yield - Original')
    ax_original.set_ylabel('Yield (%)')

    ax_transformed.plot(df_combined['date'], df_combined[f'{tenor}_cc'],
                        label='Continuously Compounded', color='red', alpha=0.7)
    ax_transformed.set_title(f'{tenor} Treasury Yield - Continuously Compounded')
    ax_transformed.set_ylabel('Continuously Compounded Rate')
    ax_transformed.set_xlabel('Date')

    # Shared cosmetics for both panels of this tenor
    for panel in (ax_original, ax_transformed):
        panel.grid(True, alpha=0.3)
        panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Side-by-side summary statistics for the five shortest tenors
leading_raw = yield_columns[:5]
leading_cc = yield_cc_columns[:5]

print("\n📊 Transformation Statistics:")
print("Original Yields (first 5 tenors):")
print(df_original_yields[leading_raw].describe().round(4))
print("\nContinuously Compounded Yields (first 5 tenors):")
print(df_combined[leading_cc].describe().round(4))


In [None]:
## 4. Derived Yield Curve Features

Generate structural indicators of yield curve shape that capture key market dynamics:
- **Slope**: 10Y - 2Y spread (most common measure of yield curve steepness)
- **Curvature**: (2Y + 30Y) - 2×(10Y) (captures mid-curve flattening/steepening)  
- **PCA Scores**: Principal components capturing level, slope, and curvature factors


In [None]:
print("🔄 Creating derived yield curve features...")

# 1. SLOPE: 10Y - 2Y spread
df_combined['yield_slope_10y2y'] = df_combined['10Y'] - df_combined['2Y']
df_combined['yield_slope_10y2y_cc'] = df_combined['10Y_cc'] - df_combined['2Y_cc']

# Additional slope measures
df_combined['yield_slope_30y2y'] = df_combined['30Y'] - df_combined['2Y']
df_combined['yield_slope_10y3m'] = df_combined['10Y'] - df_combined['3M']

print("✅ Slope features created")

# 2. CURVATURE: (2Y + 30Y) - 2*(10Y)
df_combined['yield_curvature'] = (df_combined['2Y'] + df_combined['30Y']) - 2 * df_combined['10Y']
df_combined['yield_curvature_cc'] = (df_combined['2Y_cc'] + df_combined['30Y_cc']) - 2 * df_combined['10Y_cc']

# Alternative curvature measure: butterfly spread
df_combined['yield_butterfly_5y'] = (df_combined['3Y'] + df_combined['7Y']) - 2 * df_combined['5Y']

print("✅ Curvature features created")

# 3. PCA ANALYSIS: Extract principal components from yield curve
print("🔄 Performing PCA analysis on yield curve...")

# Share of the sample (earliest dates) used to estimate the scaler and the
# PCA loadings, and the number of components retained.
PCA_TRAIN_FRACTION = 0.8
N_PCA_COMPONENTS = 5

# Prepare data for PCA (using continuously compounded yields)
pca_data = df_combined[yield_cc_columns].copy()

# Estimate means, variances and loadings on the earliest observations only.
# Fitting on the full sample would let the held-out tail shape features that
# are later used to forecast it (look-ahead bias). All rows are then
# projected with the fitted transformers.
# NOTE(review): keep PCA_TRAIN_FRACTION in sync with the train/test split
# used downstream.
pca_fit_cutoff = df_combined['date'].quantile(PCA_TRAIN_FRACTION)
pca_fit_mask = df_combined['date'] <= pca_fit_cutoff

scaler_pca = StandardScaler()
scaler_pca.fit(pca_data.loc[pca_fit_mask])
pca_data_scaled = scaler_pca.transform(pca_data)

pca = PCA(n_components=N_PCA_COMPONENTS)  # First 5 components capture most variance
pca.fit(pca_data_scaled[pca_fit_mask.to_numpy()])
pca_scores = pca.transform(pca_data_scaled)

# Add PCA scores to dataframe
for i in range(N_PCA_COMPONENTS):
    df_combined[f'pca_factor_{i+1}'] = pca_scores[:, i]

print("✅ PCA analysis completed")
print(f"PCA fitted on observations up to {pca_fit_cutoff} "
      f"({int(pca_fit_mask.sum())} of {len(pca_data)} rows)")

# Display PCA results (variance shares refer to the fitting window)
print(f"\n📊 PCA Results:")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.round(4)}")
print(f"Cumulative variance explained: {np.cumsum(pca.explained_variance_ratio_).round(4)}")

# Analyze PCA loadings. The level/slope/curvature labels follow the usual
# ordering; component signs are arbitrary.
pca_loadings = pd.DataFrame(
    pca.components_[:3].T,  # First 3 components
    columns=['PC1 (Level)', 'PC2 (Slope)', 'PC3 (Curvature)'],
    index=yield_columns
)

print("\n📋 PCA Loadings (First 3 Components):")
print(pca_loadings.round(4))

# Visualize PCA loadings
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for i, (col, title) in enumerate([('PC1 (Level)', 'Level Factor'),
                                 ('PC2 (Slope)', 'Slope Factor'),
                                 ('PC3 (Curvature)', 'Curvature Factor')]):
    axes[i].bar(pca_loadings.index, pca_loadings[col], alpha=0.7)
    axes[i].set_title(f'{title} - PCA Loadings')
    axes[i].set_xlabel('Tenor')
    axes[i].set_ylabel('Loading')
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 4. ADDITIONAL YIELD CURVE FEATURES

# Term structure level (average of all yields)
df_combined['yield_level'] = df_combined[yield_columns].mean(axis=1)

# Short-end vs long-end average
short_tenors = ['1M', '3M', '6M', '1Y']
long_tenors = ['10Y', '20Y', '30Y']
df_combined['yield_short_avg'] = df_combined[short_tenors].mean(axis=1)
df_combined['yield_long_avg'] = df_combined[long_tenors].mean(axis=1)
df_combined['yield_short_long_spread'] = df_combined['yield_long_avg'] - df_combined['yield_short_avg']

# Yield curve range (max - min)
df_combined['yield_range'] = df_combined[yield_columns].max(axis=1) - df_combined[yield_columns].min(axis=1)

print("✅ Additional yield curve features created")

# Create summary of yield curve features
yield_curve_features = [
    'yield_slope_10y2y', 'yield_slope_10y2y_cc', 'yield_slope_30y2y', 'yield_slope_10y3m',
    'yield_curvature', 'yield_curvature_cc', 'yield_butterfly_5y',
    'pca_factor_1', 'pca_factor_2', 'pca_factor_3', 'pca_factor_4', 'pca_factor_5',
    'yield_level', 'yield_short_avg', 'yield_long_avg', 'yield_short_long_spread', 'yield_range'
]

print(f"\n📈 Created {len(yield_curve_features)} yield curve features:")
for feature in yield_curve_features:
    print(f"  • {feature}")

# Display statistics for key features
print("\n📊 Key Yield Curve Features Statistics:")
key_features = ['yield_slope_10y2y', 'yield_curvature', 'pca_factor_1', 'pca_factor_2', 'pca_factor_3']
print(df_combined[key_features].describe().round(4))


In [None]:
## 5. Macroeconomic Feature Engineering

Create lagged versions of macroeconomic indicators to avoid look-ahead bias. This ensures we only use past information available at each prediction point.


In [None]:
print("🔄 Creating macroeconomic features with proper lags...")

# Lag horizons in rows (business days).
_DAILY_LAGS = (1, 5, 22)        # 1 day, 1 week, 1 month
_MONTHLY_LAGS = (1, 22, 66)     # 1 day, 1 month, 3 months

# Transform sets shared by several indicators.
_LEVEL_AND_CHANGE = ('level', 'change')
_PCT_CHANGE_ONLY = ('pct_change',)

# (column, lags, transforms) in the order features should be generated.
_macro_spec = [
    # Policy and rates
    ('fed_funds_rate', _DAILY_LAGS, _LEVEL_AND_CHANGE),
    ('ted_spread', _DAILY_LAGS, _LEVEL_AND_CHANGE),
    # Inflation expectations
    ('breakeven_5y', _DAILY_LAGS, _LEVEL_AND_CHANGE),
    ('breakeven_10y', _DAILY_LAGS, _LEVEL_AND_CHANGE),
    # Financial markets
    ('vix', _DAILY_LAGS, ('level', 'change', 'log')),
    # Economic indicators (monthly, already forward-filled)
    ('cpi', _MONTHLY_LAGS, _PCT_CHANGE_ONLY),
    ('core_cpi', _MONTHLY_LAGS, _PCT_CHANGE_ONLY),
    ('unemployment_rate', _MONTHLY_LAGS, _LEVEL_AND_CHANGE),
    ('ism_pmi', _MONTHLY_LAGS, _LEVEL_AND_CHANGE),
    ('industrial_production', _MONTHLY_LAGS, _PCT_CHANGE_ONLY),
    ('consumer_sentiment', _MONTHLY_LAGS, _LEVEL_AND_CHANGE),
]

# Every entry gets its own list objects, so editing one variable's settings
# never affects another.
macro_variables = {
    column: {'lags': list(lags), 'transform': list(transforms)}
    for column, lags, transforms in _macro_spec
}

# Function to create lagged features
def create_lagged_features(df, variable, lags, transforms):
    """
    Build lagged, optionally transformed, copies of one column.

    For every ``lag`` the column is shifted down by ``lag`` rows, and each
    requested transform is applied to the shifted series:

    * ``'level'``      -> ``{variable}_lag{lag}``: the shifted values.
    * ``'change'``     -> ``{variable}_change_lag{lag}``: one-row difference.
    * ``'pct_change'`` -> ``{variable}_pctchg_lag{lag}``: one-row relative change.
    * ``'log'``        -> ``{variable}_log_lag{lag}``: ``log(x + 1e-8)``.

    Differences are taken over a single row. For a step-wise input (for
    example a monthly value repeated on every day) they are zero on every row
    except the one where the value steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``variable``; rows are assumed to be in time order.
    variable : str
        Name of the column to lag.
    lags : iterable of int
        Row offsets passed to ``Series.shift``.
    transforms : iterable of str
        Any of ``'level'``, ``'change'``, ``'pct_change'``, ``'log'``.

    Returns
    -------
    dict
        Feature name -> ``pandas.Series`` aligned with ``df.index``, in
        (lag, transform) iteration order.

    Raises
    ------
    ValueError
        If ``transforms`` contains an unsupported name. Previously such names
        were skipped silently, so a typo produced no feature and no error.
    """
    supported = ('level', 'change', 'pct_change', 'log')
    unsupported = [name for name in transforms if name not in supported]
    if unsupported:
        raise ValueError(
            f"Unsupported transform(s) {unsupported} for '{variable}'; "
            f"expected any of {list(supported)}"
        )

    features = {}

    for lag in lags:
        # Shift first so every transform only sees information at least
        # `lag` rows old.
        lagged_var = df[variable].shift(lag)

        for transform in transforms:
            if transform == 'level':
                features[f'{variable}_lag{lag}'] = lagged_var

            elif transform == 'change':
                features[f'{variable}_change_lag{lag}'] = lagged_var.diff()

            elif transform == 'pct_change':
                # fill_method=None: never pad gaps implicitly, so the result
                # is the same across pandas versions and missing inputs stay
                # missing instead of turning into a 0% change.
                features[f'{variable}_pctchg_lag{lag}'] = lagged_var.pct_change(fill_method=None)

            elif transform == 'log':
                # Small constant avoids log(0) for zero-valued inputs.
                features[f'{variable}_log_lag{lag}'] = np.log(lagged_var + 1e-8)

    return features

# Create all macro features
macro_features = {}

for variable, config in macro_variables.items():
    if variable in df_combined.columns:
        var_features = create_lagged_features(
            df_combined,
            variable,
            config['lags'],
            config['transform']
        )
        macro_features.update(var_features)
        print(f"✅ Created {len(var_features)} features for {variable}")
    else:
        print(f"⚠️  Variable {variable} not found in dataset")

# Add macro features to main dataframe
for feature_name, feature_data in macro_features.items():
    df_combined[feature_name] = feature_data

print(f"\n📈 Total macro features created: {len(macro_features)}")

# Create additional derived macro features
print("\n🔄 Creating derived macro features...")


def _columns_present(*names):
    """Return True when every named column exists in df_combined."""
    return set(names).issubset(df_combined.columns)


# Each derived feature is guarded by exactly the columns it reads, so a
# missing input skips that feature instead of raising KeyError.

# Interest rate momentum
if _columns_present('fed_funds_rate_lag1', 'fed_funds_rate_lag5'):
    df_combined['fed_funds_momentum_5d'] = (
        df_combined['fed_funds_rate_lag1'] - df_combined['fed_funds_rate_lag5']
    )
if _columns_present('fed_funds_rate_lag1', 'fed_funds_rate_lag22'):
    df_combined['fed_funds_momentum_22d'] = (
        df_combined['fed_funds_rate_lag1'] - df_combined['fed_funds_rate_lag22']
    )

# Inflation expectations spread
if _columns_present('breakeven_10y_lag1', 'breakeven_5y_lag1'):
    df_combined['inflation_term_spread'] = (
        df_combined['breakeven_10y_lag1'] - df_combined['breakeven_5y_lag1']
    )

# Risk sentiment indicators: rank of the latest value within its trailing
# 22-row window, scaled to (0, 1]
if _columns_present('vix_log_lag1'):
    df_combined['vix_percentile_22d'] = (
        df_combined['vix_log_lag1'].rolling(22).rank() / 22
    )

# Economic activity momentum
if _columns_present('ism_pmi_lag1', 'ism_pmi_change_lag1',
                    'unemployment_rate_lag22', 'unemployment_rate_lag1'):
    df_combined['economic_momentum'] = (
        0.5 * (df_combined['ism_pmi_lag1'] - 50) +  # PMI relative to neutral
        0.3 * df_combined['ism_pmi_change_lag1'] +   # Recent PMI change
        0.2 * (df_combined['unemployment_rate_lag22'] - df_combined['unemployment_rate_lag1'])  # Employment improvement
    )

print("✅ Derived macro features created")

# Remove rows with NaN values created by lagging (keep sufficient history).
# This drops every row with a NaN in *any* column, not only the lag columns.
initial_rows = len(df_combined)
df_combined = df_combined.dropna()
final_rows = len(df_combined)

print(f"\n📊 Data summary after feature engineering:")
print(f"Rows before NaN removal: {initial_rows}")
print(f"Rows after NaN removal: {final_rows}")
print(f"Rows removed: {initial_rows - final_rows}")

# Get list of all macro feature columns: anything whose name starts with a
# configured macro variable, excluding the raw variables themselves.
all_macro_features = [col for col in df_combined.columns if any(
    col.startswith(var) for var in macro_variables.keys()
) and col not in macro_variables.keys()]

# Add derived features. 'vix_percentile_22d' already matches the 'vix' prefix
# above, so names already collected are skipped; otherwise the column would
# be listed (and later selected) twice.
derived_features = [
    'fed_funds_momentum_5d', 'fed_funds_momentum_22d',
    'inflation_term_spread', 'vix_percentile_22d', 'economic_momentum'
]
all_macro_features.extend(
    f for f in derived_features
    if f in df_combined.columns and f not in all_macro_features
)

print(f"\n📈 Total engineered macro features: {len(all_macro_features)}")

# Display statistics for key macro features
key_macro_features = [f for f in all_macro_features[:10] if f in df_combined.columns]
if key_macro_features:
    print(f"\n📊 Key Macro Features Statistics (showing first 10):")
    print(df_combined[key_macro_features].describe().round(4))


In [None]:
## 6. Standardization and Normalization

Apply z-score standardization to all continuous variables while maintaining train/test consistency. This ensures that scaling statistics are computed only from the training window.


In [None]:
print("🔄 Preparing data for standardization...")

# Define feature groups for standardization
feature_groups = {
    'yield_levels': yield_columns,  # Original yield levels
    'yield_continuous': yield_cc_columns,  # Continuously compounded yields
    'yield_curve_features': yield_curve_features,  # Derived yield curve features
    'macro_features': all_macro_features,  # All macro features
}

# Columns actually present, per group, with repeated names removed (order
# preserved). A name listed twice would otherwise be fitted and written twice
# and would inflate the counts reported below.
available_by_group = {
    group_name: list(dict.fromkeys(
        f for f in group_features if f in df_combined.columns
    ))
    for group_name, group_features in feature_groups.items()
}

# Combine all features to be standardized (unique, order preserved)
all_features = list(dict.fromkeys(
    f for group_cols in available_by_group.values() for f in group_cols
))

print(f"📊 Total features to standardize: {len(all_features)}")
print(f"Feature breakdown:")
for group_name, available_features in available_by_group.items():
    print(f"  • {group_name}: {len(available_features)} features")

# Create train/test split (80/20) while preserving time series order
# This is important for avoiding look-ahead bias in standardization
split_date = df_combined['date'].quantile(0.8)
train_mask = df_combined['date'] <= split_date
test_mask = df_combined['date'] > split_date

print(f"\n📅 Data split:")
print(f"Train period: {df_combined.loc[train_mask, 'date'].min()} to {df_combined.loc[train_mask, 'date'].max()}")
print(f"Test period: {df_combined.loc[test_mask, 'date'].min()} to {df_combined.loc[test_mask, 'date'].max()}")
print(f"Train samples: {train_mask.sum()}")
print(f"Test samples: {test_mask.sum()}")

# Initialize scalers for different feature groups
scalers = {}
df_scaled = df_combined.copy()

print(f"\n🔄 Applying z-score standardization by feature group...")

for group_name, available_features in available_by_group.items():
    if not available_features:
        continue

    print(f"Processing {group_name} ({len(available_features)} features)...")

    # Initialize scaler for this group
    scaler = StandardScaler()

    # Fit scaler only on training data so test-period statistics never
    # influence the scaling
    train_data = df_combined.loc[train_mask, available_features]
    scaler.fit(train_data)

    # Transform both train and test data with the train-period statistics
    df_scaled.loc[:, available_features] = scaler.transform(df_combined[available_features])

    # Store scaler for later use
    scalers[group_name] = scaler

    # Display scaling statistics
    print(f"  ✅ {group_name} standardized")
    print(f"     Mean (train): {train_data.mean().mean():.4f}")
    print(f"     Std (train): {train_data.std().mean():.4f}")
    print(f"     Mean (scaled): {df_scaled.loc[train_mask, available_features].mean().mean():.4f}")
    print(f"     Std (scaled): {df_scaled.loc[train_mask, available_features].std().mean():.4f}")

print("✅ Standardization completed")

# Verify standardization worked correctly
print(f"\n🔍 Standardization verification:")
sample_features = all_features[:5]  # Check first 5 features
train_scaled_stats = df_scaled.loc[train_mask, sample_features].describe()
print("Train set statistics for sample features (should have mean≈0, std≈1):")
print(train_scaled_stats.loc[['mean', 'std']].round(4))

# Create metadata for reproducibility
scaling_metadata = {
    'scaling_date': datetime.now().isoformat(),
    'train_split_date': split_date.isoformat() if hasattr(split_date, 'isoformat') else str(split_date),
    'train_samples': int(train_mask.sum()),
    'test_samples': int(test_mask.sum()),
    'feature_groups': {
        group: len(group_cols)
        for group, group_cols in available_by_group.items()
    },
    'total_features_scaled': len(all_features)
}

print(f"\n📋 Scaling metadata:")
for key, value in scaling_metadata.items():
    print(f"  • {key}: {value}")

# Save scaling metadata to be included with processed data
import json
with open('../data/processed/scaling_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(scaling_metadata, f, indent=2)

print("✅ Scaling metadata saved")


In [None]:
## 7. Model-Ready Data Construction

Construct the final feature matrix (X) and target matrix (Y) for supervised learning. This includes creating forward-looking targets while ensuring no look-ahead bias in features.


In [None]:
print("🔄 Constructing model-ready feature matrix (X) and target matrix (Y)...")

# FEATURE MATRIX (X) CONSTRUCTION
# ================================

# Select features for modeling (excluding raw macro variables to avoid
# redundancy): yield curve features first, then engineered macro features.
# Only columns present in the scaled frame are kept, and repeated names are
# dropped so that no column is selected twice.
feature_columns = list(dict.fromkeys(
    col for col in [*yield_curve_features, *all_macro_features]
    if col in df_scaled.columns
))

print(f"📊 Feature matrix dimensions:")
print(f"  • Total features selected: {len(feature_columns)}")

# Create feature matrix X
X = df_scaled[['date'] + feature_columns].copy()

print(f"  • Feature matrix shape: {X.shape}")

# TARGET MATRIX (Y) CONSTRUCTION
# ===============================

# Define target variables (future yield values)
# We'll predict 1-day, 5-day, and 22-day ahead yields for key tenors
target_tenors = ['2Y', '5Y', '10Y', '30Y']  # Key tenors for policy analysis
forecast_horizons = [1, 5, 22]  # 1 day, 1 week, 1 month ahead

print(f"\n🎯 Creating target variables:")
print(f"  • Target tenors: {target_tenors}")
print(f"  • Forecast horizons: {forecast_horizons} days")

# Create target variables.
# Targets are read from df_scaled, so they are expressed in the same
# standardized units as the scaled yield columns, not in raw rates.
target_columns = []
Y_data = {'date': df_scaled['date'].copy()}

for tenor in target_tenors:
    for horizon in forecast_horizons:
        target_col = f'{tenor}_target_{horizon}d'

        # Use continuously compounded yields for targets
        if f'{tenor}_cc' in df_scaled.columns:
            Y_data[target_col] = df_scaled[f'{tenor}_cc'].shift(-horizon)  # Negative shift for future values
        else:
            # Fallback to original yields if CC not available
            Y_data[target_col] = df_scaled[tenor].shift(-horizon)

        target_columns.append(target_col)

# Create target matrix Y
Y = pd.DataFrame(Y_data)

print(f"  • Target matrix shape: {Y.shape}")
print(f"  • Target variables created: {len(target_columns)}")

# Remove rows with NaN targets (due to forward-looking nature)
initial_samples = len(X)
valid_mask = Y[target_columns].notna().all(axis=1)

X_clean = X[valid_mask].reset_index(drop=True)
Y_clean = Y[valid_mask].reset_index(drop=True)

final_samples = len(X_clean)
print(f"\n📊 Final dataset dimensions:")
print(f"  • Samples before target cleaning: {initial_samples}")
print(f"  • Samples after target cleaning: {final_samples}")
print(f"  • Samples removed: {initial_samples - final_samples}")

# TRAIN/TEST SPLIT FOR FINAL DATA
# ================================

# Reuse the cutoff the scalers were fitted with. Recomputing the 80% quantile
# on the trimmed frame (its last rows were removed above) would move the
# cutoff earlier, putting rows that informed the scaling statistics into the
# test set.
# NOTE(review): training rows close to the cutoff still have targets that
# fall up to max(forecast_horizons) rows into the test period; consider
# dropping those rows from the training set if strict separation is needed.
split_date_clean = split_date
train_mask_clean = X_clean['date'] <= split_date_clean
test_mask_clean = X_clean['date'] > split_date_clean

print(f"\n📅 Final train/test split:")
print(f"  • Train samples: {train_mask_clean.sum()}")
print(f"  • Test samples: {test_mask_clean.sum()}")
print(f"  • Train period: {X_clean.loc[train_mask_clean, 'date'].min()} to {X_clean.loc[train_mask_clean, 'date'].max()}")
print(f"  • Test period: {X_clean.loc[test_mask_clean, 'date'].min()} to {X_clean.loc[test_mask_clean, 'date'].max()}")

# Separate features and targets for modeling
X_features = X_clean.drop('date', axis=1)
Y_targets = Y_clean.drop('date', axis=1)

# Create train/test splits
X_train = X_features[train_mask_clean]
X_test = X_features[test_mask_clean]
Y_train = Y_targets[train_mask_clean]
Y_test = Y_targets[test_mask_clean]

print(f"\n🎯 Final modeling datasets:")
print(f"  • X_train shape: {X_train.shape}")
print(f"  • X_test shape: {X_test.shape}")
print(f"  • Y_train shape: {Y_train.shape}")
print(f"  • Y_test shape: {Y_test.shape}")

# FEATURE IMPORTANCE ANALYSIS
# ============================

print(f"\n🔍 Feature importance analysis...")

# Group features by category for analysis (unique names only)
feature_analysis = {
    'Yield Curve Features': list(dict.fromkeys(
        f for f in yield_curve_features if f in X_features.columns)),
    'Macro Features': list(dict.fromkeys(
        f for f in all_macro_features if f in X_features.columns)),
}

print(f"\nFeature categories in final dataset:")
for category, features in feature_analysis.items():
    print(f"  • {category}: {len(features)} features")

# Calculate feature correlations with first target (2Y_target_1d).
# Exploratory only: the correlation uses all rows, train and test alike.
if len(Y_targets.columns) > 0:
    target_sample = Y_targets.columns[0]
    correlations = X_features.corrwith(Y_targets[target_sample])

    print(f"\n📊 Top 10 features correlated with {target_sample}:")
    top_correlations = correlations.abs().sort_values(ascending=False).head(10)
    for feature, corr in top_correlations.items():
        print(f"  • {feature}: {corr:.4f}")

print("✅ Model-ready data construction completed")


In [None]:
## 8. Data Export and Documentation

Save all processed datasets to `../data/processed/` (relative to the notebook's directory) with comprehensive documentation for reproducibility.


In [None]:
print("🔄 Saving processed datasets...")

# Every artefact written by this cell shares one timestamp suffix
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# 1. SAVE MAIN DATASETS
# =====================
# (file stem, frame, label used in the confirmation message)
main_exports = (
    ('complete_dataset', df_scaled, 'Complete dataset'),  # full engineered frame
    ('feature_matrix', X_clean, 'Feature matrix'),        # X, with date
    ('target_matrix', Y_clean, 'Target matrix'),          # Y, with date
)
for file_stem, export_frame, export_label in main_exports:
    export_frame.to_csv(f'../data/processed/{file_stem}_{timestamp}.csv', index=False)
    print(f"✅ {export_label} saved: {file_stem}_{timestamp}.csv")

# 2. SAVE TRAIN/TEST SPLITS
# =========================
# Written without the date column or index; training sets first, then test.
split_exports = (
    ('X_train', X_train),
    ('Y_train', Y_train),
    ('X_test', X_test),
    ('Y_test', Y_test),
)
for split_name, split_frame in split_exports:
    split_frame.to_csv(f'../data/processed/{split_name}_{timestamp}.csv', index=False)

print(f"✅ Train/test splits saved with timestamp: {timestamp}")

# 3. SAVE FEATURE METADATA
# ========================

# Feature names per group, restricted to columns present in X
yield_features_in_X = [f for f in yield_curve_features if f in X_features.columns]
macro_features_in_X = [f for f in all_macro_features if f in X_features.columns]

dataset_info = {
    'total_samples': len(X_clean),
    'total_features': len(X_features.columns),
    'total_targets': len(Y_targets.columns),
    'train_samples': len(X_train),
    'test_samples': len(X_test),
    'date_range': {
        'start': str(X_clean['date'].min()),
        'end': str(X_clean['date'].max())
    }
}

feature_metadata = {
    'processing_date': timestamp,
    'dataset_info': dataset_info,
    'feature_groups': {
        'yield_curve_features': {
            'count': len(yield_features_in_X),
            'features': yield_features_in_X
        },
        'macro_features': {
            'count': len(macro_features_in_X),
            'features': macro_features_in_X
        }
    },
    'target_variables': {
        'tenors': target_tenors,
        'horizons': forecast_horizons,
        'columns': target_columns
    },
    'transformations_applied': {
        'yield_transformation': 'continuously_compounded',
        'standardization': 'z_score_by_feature_group',
        'lag_creation': 'macro_variables_lagged',
        'pca_components': 5
    }
}

with open(f'../data/processed/feature_metadata_{timestamp}.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print(f"✅ Feature metadata saved: feature_metadata_{timestamp}.json")

# 4. SAVE SCALERS FOR FUTURE USE
# ==============================

import pickle

# Fitted per-group scalers, then the PCA model with the scaler it was fitted
# on. Only load these pickle files from trusted locations.
pca_bundle = {'pca': pca, 'scaler': scaler_pca}

with open(f'../data/processed/scalers_{timestamp}.pkl', 'wb') as f:
    pickle.dump(scalers, f)

with open(f'../data/processed/pca_transformer_{timestamp}.pkl', 'wb') as f:
    pickle.dump(pca_bundle, f)

print(f"✅ Scalers and transformers saved")

# 5. CREATE DATA DICTIONARY
# =========================

print("\n📝 Creating comprehensive data dictionary...")

# Human-readable descriptions for a selection of engineered features
feature_descriptions = {
    # Yield curve features
    'yield_slope_10y2y': '10-Year minus 2-Year Treasury yield spread',
    'yield_curvature': '(2Y + 30Y) - 2*(10Y) - captures mid-curve behavior',
    'pca_factor_1': 'First principal component (typically level factor)',
    'pca_factor_2': 'Second principal component (typically slope factor)',
    'pca_factor_3': 'Third principal component (typically curvature factor)',
    'yield_level': 'Average of all yield tenors',
    'yield_range': 'Range (max - min) of yield curve',

    # Macro features examples
    'fed_funds_rate_lag1': 'Federal Funds Rate lagged 1 day',
    'vix_log_lag1': 'Log-transformed VIX lagged 1 day',
    'inflation_term_spread': '10Y - 5Y breakeven inflation spread',
    'economic_momentum': 'Composite economic activity indicator'
}

# One entry per (tenor, horizon) pair, tenors in the outer loop
target_descriptions = {
    f'{tenor}_target_{horizon}d': f'{tenor} Treasury yield {horizon} days ahead'
    for tenor in target_tenors
    for horizon in forecast_horizons
}

first_date = X_clean['date'].min()
last_date = X_clean['date'].max()

# Assemble the dictionary in a single pass; key order matches the saved JSON layout
data_dictionary = {
    'dataset_overview': {
        'purpose': 'Yield curve forecasting and monetary policy scenario analysis',
        'frequency': 'Daily business days',
        'date_range': f"{first_date} to {last_date}",
        'total_observations': len(X_clean)
    },
    'feature_descriptions': feature_descriptions,
    'target_descriptions': target_descriptions,
    'data_sources': {
        'yield_curves': 'US Treasury (FRED API simulation)',
        'macro_indicators': 'Federal Reserve Economic Data (FRED API simulation)',
        'transformations': 'Custom feature engineering pipeline'
    }
}

# Save data dictionary
dictionary_path = f'../data/processed/data_dictionary_{timestamp}.json'
with open(dictionary_path, 'w') as dictionary_file:
    json.dump(data_dictionary, dictionary_file, indent=2)

print(f"✅ Data dictionary saved: data_dictionary_{timestamp}.json")

# 6. FINAL SUMMARY REPORT
# =======================

# Console-only recap of the run: nothing below writes files or changes data.
banner = "=" * 80

print("\n" + banner)
print("🎉 FEATURE ENGINEERING PHASE 2 COMPLETED SUCCESSFULLY")
print(banner)

print("\n📊 FINAL DATASET SUMMARY:")
print(f"  • Total observations: {len(X_clean):,}")
print(f"  • Total features: {len(X_features.columns):,}")
print(f"  • Total targets: {len(Y_targets.columns):,}")
print(f"  • Training samples: {len(X_train):,}")
print(f"  • Test samples: {len(X_test):,}")
print(f"  • Date range: {X_clean['date'].min()} to {X_clean['date'].max()}")

print("\n🎯 TARGET VARIABLES:")
for position, target_name in enumerate(target_columns, start=1):
    print(f"  {position:2d}. {target_name}")

# The directory shown here matches the relative path the save steps write to
# (../data/processed/), so the listing can be used from this notebook's
# working directory.
# NOTE(review): complete_dataset, feature_matrix, target_matrix and
# scaling_metadata.json are written outside this section -- keep the list in sync.
print("\n💾 FILES SAVED TO ../data/processed/:")
saved_files = [
    f"complete_dataset_{timestamp}.csv",
    f"feature_matrix_{timestamp}.csv",
    f"target_matrix_{timestamp}.csv",
    f"X_train_{timestamp}.csv / Y_train_{timestamp}.csv",
    f"X_test_{timestamp}.csv / Y_test_{timestamp}.csv",
    f"feature_metadata_{timestamp}.json",
    f"data_dictionary_{timestamp}.json",
    f"scalers_{timestamp}.pkl",
    f"pca_transformer_{timestamp}.pkl",
    "scaling_metadata.json",
]
for saved_file in saved_files:
    print(f"  • {saved_file}")

print("\n🔧 TRANSFORMATIONS APPLIED:")
transformations = [
    "Data alignment and gap filling",
    "Yield transformation to continuously compounded rates",
    "Yield curve feature engineering (slope, curvature, PCA)",
    "Macroeconomic feature engineering with proper lags",
    "Z-score standardization by feature groups",
    "Model-ready train/test splits",
]
for transformation in transformations:
    print(f"  ✅ {transformation}")

print("\n🚀 READY FOR PHASE 3: MODEL DEVELOPMENT")
print("   The processed datasets are now ready for:")
phase_3_uses = [
    "Baseline model training",
    "Advanced ML model development",
    "Model comparison and evaluation",
    "Explainability analysis",
    "Policy scenario simulation",
]
for phase_3_use in phase_3_uses:
    print(f"   • {phase_3_use}")

print("\n📝 NEXT STEPS:")
# The load hint uses the same relative path the CSVs were written to, so it
# can be copied as-is from this notebook's working directory.
print(f"   1. Load processed data: pd.read_csv('../data/processed/feature_matrix_{timestamp}.csv')")
print("   2. Train baseline models (Random Walk, VAR, Linear Regression)")
print("   3. Develop ML models (Random Forest, XGBoost, LSTM)")
print("   4. Evaluate and compare model performance")
print("   5. Conduct explainability analysis")

print("\n" + banner)


In [None]:
# 02. Feature Engineering for Yield Curve Modeling

## Purpose
This notebook transforms raw yield curve data and macroeconomic indicators into engineered features suitable for machine learning models. It creates both traditional financial features and advanced technical indicators.

## Objectives
1. **Create yield curve factors** - level, slope, curvature, twist, butterfly
2. **Generate technical indicators** - moving averages, RSI, Bollinger Bands, MACD
3. **Engineer macroeconomic features** - growth rates, lags, volatility measures
4. **Create lag features** for time series modeling
5. **Apply feature scaling and normalization**
6. **Perform feature selection** to identify most informative variables
7. **Generate interaction features** between yield curve and macro variables

## Expected Outputs
- Engineered feature dataset ready for modeling
- Feature importance rankings
- Feature correlation analysis
- Scaled and normalized features
- Feature metadata and documentation

## Dependencies
- Cleaned yield curve data from notebook 01
- Processed macroeconomic data
- Feature engineering configuration parameters
