# 04 - Feature Engineering

**Author:** Lucas Little  

**Course:** CSCA 5522: Data Mining Project  

**University:** University of Colorado - Boulder

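This notebook engineers technical indicators (log returns, rolling volatility, RSI, MACD, and volume ratio) from the price data, resamples them to 15-minute bars, and merges them with the sentiment data.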

## 1. Core Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings

# Suppress all warnings process-wide to keep cell output uncluttered.
# NOTE(review): this also hides pandas deprecation warnings, so deprecated
# API usage in later cells will not be reported while this filter is active.
warnings.filterwarnings('ignore')

# Simple confirmation that the setup cell ran to completion.
print("Environment setup complete!")

## 2. Feature Engineering

In [None]:
# Directory layout, relative to the notebook's working directory:
#   ../data/processed/sampled holds the per-sample CSV files used below.
data_dir = Path('../data')
processed_data_dir = data_dir.joinpath('processed')
sampled_dir = processed_data_dir.joinpath('sampled')

def calculate_technical_indicators(
    df: pd.DataFrame,
    volatility_window: int = 15,
    rsi_window: int = 14,
    macd_fast_span: int = 12,
    macd_slow_span: int = 26,
    volume_window: int = 15,
) -> pd.DataFrame:
    """Add technical-indicator columns to a price DataFrame.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned for convenience.

    Columns added:
        returns       Log return of ``close`` versus the previous row.
        volatility    Rolling standard deviation of ``returns`` over
                      ``volatility_window`` rows, scaled by the square root
                      of the window length.
        rsi           Relative Strength Index from simple rolling means of
                      gains and losses over ``rsi_window`` rows.
        macd          Difference between the fast and slow exponential
                      moving averages of ``close``.
        volume_ratio  ``volume`` divided by its rolling mean over
                      ``volume_window`` rows.

    Args:
        df: Price data with at least ``close`` and ``volume`` columns, ordered
            chronologically (rows are compared with their predecessors).
        volatility_window: Row count for the rolling volatility.
        rsi_window: Row count for the RSI gain/loss averages.
        macd_fast_span: Span of the fast EMA used for MACD.
        macd_slow_span: Span of the slow EMA used for MACD.
        volume_window: Row count for the rolling mean volume.

    Returns:
        The same DataFrame, with the indicator columns added.

    Raises:
        KeyError: If ``close`` or ``volume`` is missing from ``df``.
    """
    # Fail up front, before any column is written, so a bad input frame is
    # never left half-populated.
    missing = [col for col in ('close', 'volume') if col not in df.columns]
    if missing:
        raise KeyError(
            f"calculate_technical_indicators requires columns {missing}"
        )

    close = df['close']
    volume = df['volume']

    df['returns'] = np.log(close / close.shift(1))
    rolling_std = df['returns'].rolling(window=volatility_window).std()
    df['volatility'] = rolling_std * np.sqrt(volatility_window)

    # clip() keeps the NaN of the first (undefined) price change instead of
    # turning it into a fabricated zero move, so the first RSI value is only
    # emitted once a full window of real price changes is available.
    delta = close.diff()
    gain = delta.clip(lower=0).rolling(window=rsi_window).mean()
    loss = (-delta).clip(lower=0).rolling(window=rsi_window).mean()
    # A zero average loss yields rs = inf and therefore rsi = 100; a window
    # with neither gains nor losses yields NaN.
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))

    fast_ema = close.ewm(span=macd_fast_span, adjust=False).mean()
    slow_ema = close.ewm(span=macd_slow_span, adjust=False).mean()
    df['macd'] = fast_ema - slow_ema

    df['volume_ratio'] = volume / volume.rolling(window=volume_window).mean()

    return df

# Build one aligned feature file per sample: engineer price indicators,
# resample to 15-minute bars, left-join the sentiment rows on timestamp and
# write the result next to the inputs.
for i in range(1, 6):
    print(f'\n--- Processing Sample {i} ---')
    price_sample_path = sampled_dir / f'prices_sample_{i}.csv'
    sentiment_sample_path = sampled_dir / f'sentiment_sample_{i}.csv'

    # Both inputs are required; skip the sample if either file is absent.
    if not price_sample_path.exists() or not sentiment_sample_path.exists():
        print(f'⚠️ Sample {i} not found. Skipping.')
        continue

    price_df = pd.read_csv(price_sample_path)
    price_df['timestamp'] = pd.to_datetime(price_df['timestamp'])
    price_df.set_index('timestamp', inplace=True)

    print('Engineering features for price data...')
    # Indicators are computed on the rows as stored in the CSV, before the
    # 15-minute aggregation below.
    # NOTE(review): presumably the raw data is finer than 15 minutes; confirm.
    price_df = calculate_technical_indicators(price_df)

    # '15min' replaces the deprecated '15T' offset alias. Returns are summed
    # within each bar; the other indicators take the bar's last value.
    price_15min = price_df.resample('15min').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'returns': 'sum',
        'volatility': 'last',
        'rsi': 'last',
        'macd': 'last',
        'volume_ratio': 'last',
    })

    sentiment_df = pd.read_csv(sentiment_sample_path)
    sentiment_df['timestamp'] = pd.to_datetime(sentiment_df['timestamp'])
    sentiment_df.set_index('timestamp', inplace=True)

    print('Merging price and sentiment data...')
    # Left join keeps every price bar, including those with no sentiment row.
    merged_df = pd.merge(price_15min, sentiment_df, on='timestamp', how='left')

    # Forward-fill sentiment data to handle missing values.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    sentiment_cols = [
        'sentiment_mean',
        'sentiment_var',
        'sentiment_count',
        'retweet_count_sum',
        'like_count_sum',
        'sentiment_momentum',
    ]
    merged_df[sentiment_cols] = merged_df[sentiment_cols].ffill()

    output_path = sampled_dir / f'aligned_features_sample_{i}.csv'
    merged_df.to_csv(output_path, index=True)
    print(f'✅ Saved aligned features for sample {i} to {output_path}')