# Feature Engineering Demo

This notebook demonstrates the feature engineering capabilities developed in Sprint 4:
- Pattern-based features from wavelets and DTW
- Technical indicators
- Transition matrix features
- Feature selection and importance analysis

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up paths
import sys
sys.path.append('..')

# Import feature engineering components
from src.features import (
    PatternFeatureExtractor,
    TechnicalIndicators,
    TransitionMatrixBuilder,
    FeaturePipeline,
    FeatureSelector
)

# Set style
# 'seaborn-v0_8-darkgrid' only exists in matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn-darkgrid'. Pick whichever one this
# installation provides so the setup cell does not raise on an unknown style name.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

## 1. Generate Sample Financial Data

In [None]:
# Generate synthetic financial data
# Seeds NumPy's global RNG; later cells that call np.random continue from this state.
np.random.seed(42)
n_days = 1000
trading_days_per_year = 252

# Use a business-day calendar: the seasonal component below has a period of 252
# steps, which is only "annual" when each step is a trading day. With a
# calendar-day index (freq='D') the same cycle would repeat roughly every 8 months.
dates = pd.date_range(start='2020-01-01', periods=n_days, freq='B')

# Generate price data with trends and patterns
trend = np.linspace(100, 150, n_days)
seasonal = 10 * np.sin(2 * np.pi * np.arange(n_days) / trading_days_per_year)  # Annual cycle
noise = np.random.normal(0, 2, n_days)
price = trend + seasonal + noise

# Add regime-dependent noise: each day independently gets a low (0.5) or high (2.0)
# noise scale with probability 0.8 / 0.2. The regimes are drawn i.i.d., so this
# produces occasional high-volatility days rather than persistent clusters.
volatility_regime = np.random.choice([0.5, 2.0], size=n_days, p=[0.8, 0.2])
price += np.random.normal(0, volatility_regime)

# Create OHLCV data; 'close' is the simulated price, the other columns are
# random offsets around it.
df = pd.DataFrame({
    'open': price + np.random.uniform(-1, 1, n_days),
    'high': price + np.abs(np.random.normal(0, 1, n_days)),
    'low': price - np.abs(np.random.normal(0, 1, n_days)),
    'close': price,
    'volume': np.random.lognormal(15, 0.5, n_days)
}, index=dates)

# Ensure OHLC consistency: 'open' may land outside the drawn high/low range, so
# widen high/low to contain open and close.
df['high'] = df[['open', 'high', 'close']].max(axis=1)
df['low'] = df[['open', 'low', 'close']].min(axis=1)

# Sanity checks on the invariants the rest of the notebook relies on.
assert (df['high'] >= df[['open', 'close']].max(axis=1)).all()
assert (df['low'] <= df[['open', 'close']].min(axis=1)).all()

print(f"Generated {len(df)} days of financial data")
df.head()

In [None]:
# Two stacked panels sharing the date axis: price on top, volume underneath
fig, (price_ax, volume_ax) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8), sharex=True)

# Top panel: closing price drawn together with the shaded low-to-high band
price_ax.plot(df.index, df['close'], label='Close Price', linewidth=1)
price_ax.fill_between(df.index, df['low'], df['high'], alpha=0.3, label='High-Low Range')
price_ax.set(ylabel='Price', title='Synthetic Financial Time Series')
price_ax.legend()

# Bottom panel: one volume bar per row of df
volume_ax.bar(df.index, df['volume'], alpha=0.7, width=1)
volume_ax.set(ylabel='Volume', xlabel='Date')

# Same light grid on both panels
for panel in (price_ax, volume_ax):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Technical Indicators

In [None]:
# Run the technical-indicator calculator over the OHLCV frame
tech_calculator = TechnicalIndicators()
indicators = tech_calculator.compute_all_indicators(df)

# Report how many indicator columns came back and list the first 20 names
indicator_names = indicators.columns
print(f"Computed {len(indicator_names)} technical indicators")
print("\nSample indicators:")
print(indicator_names[:20].tolist())

In [None]:
# Visualize some key indicators in four stacked panels that share the date axis


def _decorate_panel(panel, ylabel, title, xlabel=None):
    """Apply the axis labels, title, legend and light grid common to every panel."""
    panel.set_ylabel(ylabel)
    if xlabel is not None:
        panel.set_xlabel(xlabel)
    panel.set_title(title)
    panel.legend()
    panel.grid(True, alpha=0.3)


fig, axes = plt.subplots(4, 1, figsize=(14, 12), sharex=True)
ma_ax, rsi_ax, macd_ax, bb_ax = axes
dates_axis = df.index

# Panel 1: close price with the 20-period SMA and 50-period EMA columns
ma_ax.plot(dates_axis, df['close'], label='Close', linewidth=1, alpha=0.8)
ma_ax.plot(dates_axis, indicators['sma_20'], label='SMA 20', linewidth=2)
ma_ax.plot(dates_axis, indicators['ema_50'], label='EMA 50', linewidth=2)
_decorate_panel(ma_ax, 'Price', 'Price with Moving Averages')

# Panel 2: RSI with the 70 / 30 reference lines, y-axis pinned to 0-100
rsi_ax.plot(dates_axis, indicators['rsi_14'], label='RSI(14)', color='purple')
for level, line_color, line_label in ((70, 'r', 'Overbought'), (30, 'g', 'Oversold')):
    rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5, label=line_label)
rsi_ax.set_ylim(0, 100)
_decorate_panel(rsi_ax, 'RSI', 'Relative Strength Index')

# Panel 3: MACD line, signal line and histogram bars
macd_ax.plot(dates_axis, indicators['macd'], label='MACD', linewidth=2)
macd_ax.plot(dates_axis, indicators['macd_signal'], label='Signal', linewidth=2)
macd_ax.bar(dates_axis, indicators['macd_histogram'], label='Histogram', alpha=0.3)
_decorate_panel(macd_ax, 'MACD', 'MACD Indicator')

# Panel 4: close price inside the Bollinger band columns (drawn upper, middle, lower)
bb_ax.plot(dates_axis, df['close'], label='Close', linewidth=1, color='black')
band_specs = (
    ('bb_upper_20', 'Upper Band', '--'),
    ('bb_middle_20', 'Middle Band', '-'),
    ('bb_lower_20', 'Lower Band', '--'),
)
for column, band_label, band_style in band_specs:
    bb_ax.plot(dates_axis, indicators[column], label=band_label, linestyle=band_style, alpha=0.7)
bb_ax.fill_between(dates_axis, indicators['bb_lower_20'], indicators['bb_upper_20'], alpha=0.1)
_decorate_panel(bb_ax, 'Price', 'Bollinger Bands', xlabel='Date')

plt.tight_layout()
plt.show()

## 3. Pattern-Based Features

In [None]:
# Configure the pattern feature extractor
pattern_extractor = PatternFeatureExtractor(wavelet='morl', n_patterns=10, pattern_length=20)

# Build overlapping windows of consecutive closing prices. Window k covers rows
# k .. k + window_size - 1; the start index stops at len(df) - window_size - 1, so
# the final close in df is not part of any window.
window_size = 100
close_values = df['close'].values
windows = np.array([
    close_values[start:start + window_size]
    for start in range(len(df) - window_size)
])
print(f"Created {len(windows)} windows of size {window_size}")

# Fit the extractor on the windows and compute their features in one step
pattern_features = pattern_extractor.fit_transform(windows)
print(f"\nExtracted {len(pattern_features.columns)} pattern-based features")
pattern_features.head()

In [None]:
# Visualize some pattern features on a 2x2 grid, one feature per panel
fig, axes_grid = plt.subplots(2, 2, figsize=(14, 10))
axes = axes_grid.ravel()

# Candidate feature columns; a panel is left empty when its column is absent
features_to_plot = [
    'wavelet_energy',
    'wavelet_entropy',
    'dtw_min_distance',
    'pattern_transitions'
]

for ax, feature in zip(axes, features_to_plot):
    if feature not in pattern_features.columns:
        continue
    ax.plot(pattern_features[feature].values, linewidth=1)
    ax.set(title=feature, xlabel='Window Index', ylabel='Feature Value')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Transition Matrix Features

In [None]:
# Create pattern sequences for transition analysis
# These are simulated stand-ins (in real use, they would come from pattern matching)
n_sequences = 100
sequence_length = 50
n_patterns = 5


def _simulate_sticky_sequence(length, n_states, switch_prob=0.3):
    """Return `length` integer states that tend to repeat the previous state.

    The first state is drawn uniformly from 0..n_states-1. After each recorded
    step there is a `switch_prob` chance of redrawing the state uniformly
    (the redraw may pick the same state again).
    """
    state = np.random.randint(0, n_states)
    visited = []
    for _step in range(length):
        visited.append(state)
        if np.random.random() < switch_prob:
            state = np.random.randint(0, n_states)
    return visited


# Generate sequences with some structure
sequences = [
    _simulate_sticky_sequence(sequence_length, n_patterns)
    for _seq in range(n_sequences)
]

# Build transition matrices from the simulated state sequences
builder_options = {
    'n_patterns': n_patterns,
    'pattern_length': 20,
    'max_order': 2,
}
transition_builder = TransitionMatrixBuilder(**builder_options)

# Fit on the sequences, then derive features from those same sequences
transition_builder.fit(sequences)
transition_features = transition_builder.transform(sequences)

n_transition_features = len(transition_features.columns)
print(f"Extracted {n_transition_features} transition features")
transition_features.head()

In [None]:
# Visualize the first-order transition matrix as an annotated heatmap
trans_matrix = transition_builder.get_transition_matrix(order=1)
pattern_ids = list(range(n_patterns))
short_labels = [f'P{i}' for i in pattern_ids]

fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    trans_matrix,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    xticklabels=short_labels,
    yticklabels=short_labels,
    ax=heat_ax,
)
heat_ax.set_title('Pattern Transition Matrix (Order 1)')
heat_ax.set_xlabel('Next Pattern')
heat_ax.set_ylabel('Current Pattern')
plt.tight_layout()
plt.show()

# Plot stationary distribution as one bar per pattern
stationary = transition_builder.get_stationary_distribution()
fig, bar_ax = plt.subplots(figsize=(8, 5))
bar_ax.bar(pattern_ids, stationary)
bar_ax.set_xlabel('Pattern')
bar_ax.set_ylabel('Stationary Probability')
bar_ax.set_title('Stationary Distribution of Patterns')
bar_ax.set_xticks(pattern_ids)
bar_ax.set_xticklabels([f'Pattern {i}' for i in pattern_ids])
bar_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Complete Feature Pipeline

In [None]:
# Create complete feature pipeline with every feature family switched on
pipeline_options = {
    'use_pattern_features': True,
    'use_technical_indicators': True,
    'use_transition_features': True,
    'wavelet': 'morl',
    'n_patterns': 10,
    'pattern_length': 20,
    'feature_window': 100,
    'prediction_horizon': 5,
    'scaler_type': 'standard',
}
pipeline = FeaturePipeline(**pipeline_options)

# Fit on the OHLCV frame ...
print("Fitting feature pipeline...")
pipeline.fit(df)

# ... then transform that same frame into the feature matrix
print("\nTransforming data...")
all_features = pipeline.transform(df)

print(f"\nTotal features extracted: {len(all_features.columns)}")
print(f"Feature matrix shape: {all_features.shape}")

# Static list of the feature families this demo enables (not derived from the output)
print("\nFeature categories:")
for family in (
    'Technical indicators',
    'Pattern-based features',
    'Transition features',
    'Price features',
    'Time features',
):
    print(f"- {family}")

In [None]:
# Create target variable
target = pipeline.create_target(df, target_type='classification')
print(f"Target variable: {target.name}")
print("Class distribution:")
print(target.value_counts())

# Align features and target by index label.
# `mask` is a boolean array with one entry per row of all_features, so it is only
# valid for indexing all_features. Applying it positionally to `target` raises (or
# silently misaligns) whenever the two objects have different lengths, e.g. if the
# pipeline drops warm-up rows. Select the target rows by label instead.
mask = all_features.index.isin(target.dropna().index)
X = all_features[mask]
y = target.loc[X.index]

# Every feature row must be paired with the target for the same timestamp.
assert X.index.equals(y.index), "features and target are not aligned on the same index"

print(f"\nAligned data shape: X={X.shape}, y={y.shape}")

## 6. Feature Selection and Importance Analysis

In [None]:
# Feature selection settings: random-forest importance ('rf'), a target of 30
# features, and a 0.95 correlation threshold
selector_options = {
    'task_type': 'classification',
    'selection_method': 'rf',
    'n_features': 30,
    'correlation_threshold': 0.95,
}
selector = FeatureSelector(**selector_options)

# Fit the selector; remaining NaNs in X are replaced with 0 before fitting
print("Performing feature selection...")
X_filled = X.fillna(0)
X_selected = selector.fit_transform(X_filled, y)

n_selected = len(selector.selected_features_)
n_available = X.shape[1]
print(f"\nSelected {n_selected} features from {n_available} total features")
n_correlated_removed = len(selector.removed_correlated_)
print(f"Removed {n_correlated_removed} highly correlated features")

In [None]:
# Draw the selector's importance chart for the 20 highest-ranked features
fig = selector.plot_feature_importance(figsize=(10, 8), top_n=20)
plt.show()

# Print the first 10 rows of the selector's feature report, limited to these columns
feature_report = selector.get_feature_report()
report_columns = ['feature', 'importance_score', 'rank', 'selected']
print("\nTop 10 most important features:")
print(feature_report.head(10)[report_columns])

In [None]:
# Correlation matrix plot, restricted to the selected features via selected_only=True
correlation_plot_options = {'selected_only': True, 'figsize': (12, 10)}
fig = selector.plot_correlation_matrix(**correlation_plot_options)
plt.show()

In [None]:
# Analyze feature types in selected features
selected_features = selector.selected_features_

# Substring keywords used to bucket a feature name. Categories are tried in this
# order and the first match wins (e.g. a name containing both 'pattern' and
# 'trans' is counted as Pattern).
# NOTE(review): matching is by plain substring, so short keywords can hit inside
# unrelated words (e.g. 'atr' occurs in 'matrix') - verify against the real names.
category_keywords = {
    'Technical': ['sma', 'ema', 'rsi', 'macd', 'bb', 'atr'],
    'Pattern': ['wavelet', 'dtw', 'cluster', 'pattern'],
    'Transition': ['trans', 'entropy', 'stability'],
    'Price': ['returns', 'volatility', 'ratio'],
    'Time': ['day', 'month', 'quarter'],
}


def _categorize_feature(feature_name):
    """Return the first category whose keywords occur in `feature_name`, else 'Other'."""
    for category, keywords in category_keywords.items():
        if any(keyword in feature_name for keyword in keywords):
            return category
    return 'Other'


# Features that match no keyword used to be dropped silently, which made the chart
# undercount the selection. They are now collected under 'Other'.
feature_types = {category: [] for category in category_keywords}
feature_types['Other'] = []
for feature in selected_features:
    feature_types[_categorize_feature(feature)].append(feature)

# Every selected feature must land in exactly one bucket.
assert sum(len(features) for features in feature_types.values()) == len(selected_features)

# Plot distribution
plt.figure(figsize=(8, 6))
categories = list(feature_types.keys())
counts = [len(features) for features in feature_types.values()]

plt.bar(categories, counts, color='skyblue', edgecolor='navy')
plt.xlabel('Feature Category')
plt.ylabel('Number of Selected Features')
plt.title('Distribution of Selected Features by Category')
plt.grid(True, alpha=0.3)

# Add value labels just above each bar
for i, count in enumerate(counts):
    plt.text(i, count + 0.5, str(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# List up to five example names per non-empty category
print("Selected features by category:")
for category, features in feature_types.items():
    if features:
        print(f"\n{category} ({len(features)}):")
        print(f"  {', '.join(features[:5])}{'...' if len(features) > 5 else ''}")

## 7. Save Processed Features

In [None]:
# Create output directory (including parents; no error if it already exists)
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Resolve all four artifact paths up front
features_file, target_file, pipeline_file, selector_file = (
    output_dir / filename
    for filename in (
        'features.csv',
        'target.csv',
        'feature_pipeline.pkl',
        'feature_selector.pkl',
    )
)

# Selected feature matrix -> CSV
X_selected.to_csv(features_file)
print(f"Saved selected features to {features_file}")

# Aligned target -> CSV
y.to_csv(target_file)
print(f"Saved target to {target_file}")

# Fitted pipeline, written through its own save() method (path passed as str)
pipeline.save(str(pipeline_file))
print(f"Saved feature pipeline to {pipeline_file}")

# Fitted selector, written through its own save() method (path passed as str)
selector.save(str(selector_file))
print(f"Saved feature selector to {selector_file}")

print("\nAll feature engineering artifacts saved successfully!")

## Summary

This notebook demonstrated the complete feature engineering pipeline:

1. **Technical Indicators**: Computed 60+ indicators, including moving averages, momentum indicators, and volatility measures
2. **Pattern Features**: Extracted wavelet-based features, DTW similarities, and cluster memberships
3. **Transition Features**: Built transition matrices and extracted temporal pattern features
4. **Feature Pipeline**: Combined all feature types with proper scaling and missing value handling
5. **Feature Selection**: Selected most important features using Random Forest importance and correlation filtering

The pipeline is now ready for use in machine learning models for financial prediction tasks.