# Feature Engineering

This notebook demonstrates feature engineering for volatility prediction, including technical indicators, sentiment analysis, and volatility targets.

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from src.utils.config import load_config
from src.features.technical import add_technical_indicators
from src.features.volatility import realized_volatility, rolling_std, garch_volatility
from src.features.sentiment import vader_scores, finbert_scores, aggregate_daily

## Load Raw Data

In [None]:
# Project configuration; its raw_dir attribute is the folder the raw CSVs are read from.
cfg = load_config()

# Read the price table, parse 'Date' as datetimes and promote it to the index.
prices_path = os.path.join(cfg.raw_dir, 'prices.csv')
prices = (
    pd.read_csv(prices_path, parse_dates=['Date'])
    .set_index('Date')
)

# Report the size and the covered date span, then preview the first rows.
first_date, last_date = prices.index.min(), prices.index.max()
print(f"Loaded {len(prices)} rows of price data")
print(f"Date range: {first_date} to {last_date}")
prices.head()

## 1. Technical Indicators

In [None]:
# Build `enhanced_prices`: a copy of `prices` plus the indicator columns that
# add_technical_indicators produces for each ticker.
tickers = ['AAPL', 'MSFT', 'NVDA', 'SPY']
enhanced_prices = prices.copy()

for ticker in tickers:
    close_col, high_col, low_col, vol_col = (
        f'{ticker}_{field}' for field in ('close', 'high', 'low', 'volume')
    )
    source_cols = [close_col, high_col, low_col, vol_col]

    # A ticker is processed only when all four of its input columns are present.
    absent = [name for name in source_cols if name not in prices.columns]
    if absent:
        continue

    ticker_data = prices[source_cols].dropna()
    enhanced = add_technical_indicators(ticker_data, close_col, high_col, low_col, vol_col)

    # Copy over everything except the four input columns. Assignment aligns on
    # the index, so rows removed by dropna() end up as NaN in enhanced_prices.
    indicator_cols = [name for name in enhanced.columns if name not in source_cols]
    for col in indicator_cols:
        enhanced_prices[col] = enhanced[col]

# Indicator columns are recognised by these substrings in the column name.
indicator_tags = ['sma', 'ema', 'rsi', 'macd', 'bb']
new_indicator_cols = [
    c for c in enhanced_prices.columns
    if any(tag in c for tag in indicator_tags)
]
print(f"Enhanced dataset shape: {enhanced_prices.shape}")
print(f"New technical indicator columns: {new_indicator_cols}")

In [None]:
# 2x2 overview of the AAPL indicators: moving averages, RSI, MACD, Bollinger Bands.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ticker = 'AAPL'
close_col = f'{ticker}_close'

dates = enhanced_prices.index
(ax_price, ax_rsi), (ax_macd, ax_bands) = axes

# Top-left: close price with its 10- and 20-period simple moving averages.
price_series = [
    (close_col, 'Close'),
    (f'{ticker}_sma_10', 'SMA 10'),
    (f'{ticker}_sma_20', 'SMA 20'),
]
for column, label in price_series:
    ax_price.plot(dates, enhanced_prices[column], label=label, alpha=0.7)
ax_price.set_title(f'{ticker} Price and Moving Averages')
ax_price.legend()

# Top-right: RSI with dashed reference lines at 70 and 30.
ax_rsi.plot(dates, enhanced_prices[f'{ticker}_rsi_14'], label='RSI', color='orange')
for level, line_color, label in [(70, 'r', 'Overbought'), (30, 'g', 'Oversold')]:
    ax_rsi.axhline(y=level, color=line_color, linestyle='--', alpha=0.7, label=label)
ax_rsi.set_title(f'{ticker} RSI')
ax_rsi.legend()

# Bottom-left: MACD line against its signal line.
for column, label in [(f'{ticker}_macd', 'MACD'), (f'{ticker}_macd_signal', 'Signal')]:
    ax_macd.plot(dates, enhanced_prices[column], label=label, alpha=0.7)
ax_macd.set_title(f'{ticker} MACD')
ax_macd.legend()

# Bottom-right: close price between the upper and lower Bollinger bands.
band_series = [
    (close_col, 'Close'),
    (f'{ticker}_bb_hband', 'Upper BB'),
    (f'{ticker}_bb_lband', 'Lower BB'),
]
for column, label in band_series:
    ax_bands.plot(dates, enhanced_prices[column], label=label, alpha=0.7)
ax_bands.set_title(f'{ticker} Bollinger Bands')
ax_bands.legend()

plt.tight_layout()
plt.show()

## 2. Volatility Targets

In [None]:
# Compute volatility targets for each ticker.
# Result layout: volatility_features[ticker] is a dict with the keys
# 'realized_vol', 'rolling_std' and 'garch_vol'.
volatility_features = {}

for ticker in tickers:
    close_col = f'{ticker}_close'
    # Tickers without a close column in enhanced_prices are skipped.
    if close_col not in enhanced_prices.columns:
        continue

    close_prices = enhanced_prices[close_col].dropna()

    # Realized volatility with window=5.
    # NOTE(review): the original comment called this "5-day forward"; the window
    # direction is decided inside realized_volatility -- confirm there.
    rv = realized_volatility(close_prices, window=5)

    # Rolling standard deviation with window=20
    roll_std = rolling_std(close_prices, window=20)

    # GARCH volatility (may take time). This step is best-effort: on failure an
    # all-NaN series on the same index is stored so later cells still run.
    # Catch Exception rather than using a bare `except:` so that interrupting a
    # slow fit (KeyboardInterrupt) is not swallowed, and report the reason
    # instead of failing silently.
    try:
        garch_vol = garch_volatility(close_prices)
    except Exception as exc:
        print(f"GARCH volatility failed for {ticker}: {type(exc).__name__}: {exc}")
        garch_vol = pd.Series(index=close_prices.index, dtype=float)

    volatility_features[ticker] = {
        'realized_vol': rv,
        'rolling_std': roll_std,
        'garch_vol': garch_vol
    }

print(f"Computed volatility features for {len(volatility_features)} tickers")

In [None]:
# Overlay the volatility measures computed for AAPL on a single axis.
ticker = 'AAPL'
ticker_measures = volatility_features.get(ticker)
if ticker_measures is not None:
    fig, ax = plt.subplots(figsize=(12, 6))

    rv, roll_std, garch_vol = (
        ticker_measures[key] for key in ('realized_vol', 'rolling_std', 'garch_vol')
    )

    ax.plot(rv.index, rv, label='Realized Volatility (5-day)', alpha=0.7)
    ax.plot(roll_std.index, roll_std, label='Rolling Std (20-day)', alpha=0.7)

    # The GARCH series is drawn only when it holds at least one non-NaN value.
    garch_is_blank = garch_vol.empty or not garch_vol.notna().any()
    if not garch_is_blank:
        ax.plot(garch_vol.index, garch_vol, label='GARCH Volatility', alpha=0.7)

    ax.set_title(f'{ticker} Volatility Measures')
    ax.set_ylabel('Volatility')
    ax.legend()
    plt.show()

## 3. Sentiment Analysis

In [None]:
# Optional social media inputs: each frame stays empty when its CSV is absent,
# which is what the sentiment cells check before doing any work.
raw_dir = cfg.raw_dir
tweets_path = os.path.join(raw_dir, 'tweets.csv')
reddit_path = os.path.join(raw_dir, 'reddit_posts.csv')

tweets, reddit_posts = pd.DataFrame(), pd.DataFrame()

# Tweets: the 'date' column is parsed as datetimes.
tweets_found = os.path.exists(tweets_path)
if tweets_found:
    tweets = pd.read_csv(tweets_path, parse_dates=['date'])
    print(f"Loaded {len(tweets)} tweets")

# Reddit posts: the 'created_dt' column is parsed as datetimes.
reddit_found = os.path.exists(reddit_path)
if reddit_found:
    reddit_posts = pd.read_csv(reddit_path, parse_dates=['created_dt'])
    print(f"Loaded {len(reddit_posts)} Reddit posts")

In [None]:
# Score tweet text with VADER, aggregate per day and plot the daily series.
if tweets.empty:
    print("No tweet data available")
else:
    print("Analyzing tweet sentiment...")
    tweet_text = tweets['content'].fillna('')  # missing text is scored as ''
    tweet_sentiment = vader_scores(tweet_text)
    # Column-wise concat: pairs each tweet with its scores by index label.
    tweets_with_sentiment = pd.concat([tweets, tweet_sentiment], axis=1)

    # Daily sentiment aggregation
    daily_tweet_sentiment = aggregate_daily(tweets_with_sentiment, 'content', 'date')
    print(f"Daily tweet sentiment shape: {daily_tweet_sentiment.shape}")

    # Sentiment over time: compound at full opacity, positive/negative faded.
    fig, ax = plt.subplots(figsize=(12, 6))
    days = daily_tweet_sentiment.index
    ax.plot(days, daily_tweet_sentiment['compound'], label='Compound Sentiment')
    for column, label in [('positive', 'Positive'), ('negative', 'Negative')]:
        ax.plot(days, daily_tweet_sentiment[column], label=label, alpha=0.7)
    ax.set_title('Daily Tweet Sentiment')
    ax.set_ylabel('Sentiment Score')
    ax.legend()
    plt.show()

In [None]:
# Score Reddit post titles with VADER, aggregate per day and plot the daily series.
if reddit_posts.empty:
    print("No Reddit data available")
else:
    print("Analyzing Reddit sentiment...")
    post_titles = reddit_posts['title'].fillna('')  # missing titles are scored as ''
    reddit_sentiment = vader_scores(post_titles)
    # Column-wise concat: pairs each post with its scores by index label.
    reddit_with_sentiment = pd.concat([reddit_posts, reddit_sentiment], axis=1)

    # Daily sentiment aggregation
    daily_reddit_sentiment = aggregate_daily(reddit_with_sentiment, 'title', 'created_dt')
    print(f"Daily Reddit sentiment shape: {daily_reddit_sentiment.shape}")

    # Sentiment over time: compound at full opacity, positive/negative faded.
    fig, ax = plt.subplots(figsize=(12, 6))
    days = daily_reddit_sentiment.index
    ax.plot(days, daily_reddit_sentiment['compound'], label='Compound Sentiment')
    for column, label in [('positive', 'Positive'), ('negative', 'Negative')]:
        ax.plot(days, daily_reddit_sentiment[column], label=label, alpha=0.7)
    ax.set_title('Daily Reddit Sentiment')
    ax.set_ylabel('Sentiment Score')
    ax.legend()
    plt.show()

## 4. Feature Summary

In [None]:
# Recap of what the notebook produced, printed one line per item.
print("=== Feature Engineering Summary ===")
overview_lines = [
    f"Original price data: {prices.shape}",
    f"Enhanced with technical indicators: {enhanced_prices.shape}",
    f"Volatility features computed for {len(volatility_features)} tickers",
    f"Tweet sentiment: {len(tweets)} tweets",
    f"Reddit sentiment: {len(reddit_posts)} posts",
]
for line in overview_lines:
    print(line)

# Feature categories: technical indicator columns are recognised by substring.
indicator_tags = ['sma', 'ema', 'rsi', 'macd', 'bb']
technical_cols = [
    c for c in enhanced_prices.columns
    if any(tag in c for tag in indicator_tags)
]
print(f"\nTechnical indicators: {len(technical_cols)} features")
print("Volatility targets: 3 per ticker (realized, rolling, garch)")
print("Sentiment features: VADER scores (compound, pos, neg, neu)")