# 04 - Anomaly Detection and Feature Engineering

**Author:** Lucas Little  
**Course:** CSCA 5522: Data Mining Project  
**University:** University of Colorado - Boulder

This notebook implements advanced anomaly detection and feature engineering for cryptocurrency volatility prediction.

## Objectives
1. Implement anomaly detection algorithms
2. Create advanced engineered features
3. Detect market regime changes
4. Generate interaction features
5. Prepare final feature set for modeling

In [1]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
from pathlib import Path

# Machine learning imports
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Statistical imports
from scipy import stats

warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

print("Environment setup complete!")

Environment setup complete!


## 1. Load Processed Data

In [2]:
# Load aligned features from previous notebook
data_dir = Path('data')
processed_data_dir = data_dir / 'processed'

try:
    # Try to load aligned features
    df = pd.read_csv(processed_data_dir / 'aligned_features.csv')
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    print(f"Loaded aligned features: {df.shape}")
except FileNotFoundError:
    # Fallback: load and create basic features
    print("Aligned features not found, creating from price data...")
    
    # Load price data
    prices_df = pd.read_csv(processed_data_dir / 'prices_processed.csv')
    prices_df['timestamp'] = pd.to_datetime(prices_df['timestamp'])
    
    # Create basic features
    df = prices_df.copy()
    df['returns'] = df['close'].pct_change()
    df['volatility'] = df['returns'].rolling(20).std()
    df['volume_ratio'] = df['volume'] / df['volume'].rolling(20).mean()
    
    # Add dummy sentiment features
    np.random.seed(42)
    df['primary_sentiment_mean'] = np.random.normal(0, 0.1, len(df))
    df['primary_sentiment_std'] = np.random.uniform(0.05, 0.2, len(df))
    df['tweet_count'] = np.random.poisson(10, len(df))
    
    print(f"Created basic features: {df.shape}")

print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Features available: {len(df.columns)}")

# Display basic info
print(f"Dataset shape: {df.shape}")

Loaded aligned features: (650881, 101)
Date range: 2018-01-01 00:00:00 to 2019-03-29 00:00:00
Features available: 101
Dataset shape: (650881, 101)


## 2. Anomaly Detection

In [3]:
def detect_price_anomalies(df, contamination=0.1):
    """
    Detect price and volume anomalies using Isolation Forest.
    """
    # Select features for anomaly detection
    anomaly_features = ['returns', 'volatility', 'volume_ratio']
    available_features = [f for f in anomaly_features if f in df.columns]
    
    if len(available_features) == 0:
        print("No suitable features for anomaly detection")
        return df
    
    # Prepare data
    X = df[available_features].dropna()
    
    if len(X) == 0:
        print("No valid data for anomaly detection")
        return df
    
    # Scale features
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Isolation Forest
    iso_forest = IsolationForest(
        contamination=contamination,
        random_state=42,
        n_estimators=100
    )
    
    anomaly_labels = iso_forest.fit_predict(X_scaled)
    anomaly_scores = iso_forest.score_samples(X_scaled)
    
    # Add results to dataframe
    df_result = df.copy()
    df_result['anomaly_score'] = np.nan
    df_result['is_anomaly'] = 0
    
    # Map results back to original dataframe
    df_result.loc[X.index, 'anomaly_score'] = anomaly_scores
    df_result.loc[X.index, 'is_anomaly'] = (anomaly_labels == -1).astype(int)
    
    return df_result

def detect_statistical_anomalies(df, z_threshold=3):
    """
    Detect statistical anomalies using Z-score method.
    """
    df_result = df.copy()
    
    # Z-score anomalies for returns
    if 'returns' in df.columns:
        returns_clean = df['returns'].dropna()
        if len(returns_clean) > 0:
            z_scores = np.abs(stats.zscore(returns_clean))
            df_result['returns_z_score'] = np.nan
            df_result.loc[returns_clean.index, 'returns_z_score'] = z_scores
            df_result['returns_anomaly'] = (df_result['returns_z_score'] > z_threshold).astype(int)
    
    # Volume anomalies
    if 'volume' in df.columns:
        volume_clean = df['volume'].dropna()
        if len(volume_clean) > 0:
            # Use log transformation for volume
            log_volume = np.log1p(volume_clean)
            z_scores_vol = np.abs(stats.zscore(log_volume))
            df_result['volume_z_score'] = np.nan
            df_result.loc[volume_clean.index, 'volume_z_score'] = z_scores_vol
            df_result['volume_anomaly'] = (df_result['volume_z_score'] > z_threshold).astype(int)
    
    return df_result

print("Detecting anomalies...")

# Apply anomaly detection
df = detect_price_anomalies(df, contamination=0.05)
df = detect_statistical_anomalies(df, z_threshold=3)

# Summary of anomalies
if 'is_anomaly' in df.columns:
    anomaly_count = df['is_anomaly'].sum()
    anomaly_pct = (anomaly_count / len(df)) * 100
    print(f"Isolation Forest anomalies: {anomaly_count} ({anomaly_pct:.2f}%)")

if 'returns_anomaly' in df.columns:
    returns_anomaly_count = df['returns_anomaly'].sum()
    returns_anomaly_pct = (returns_anomaly_count / len(df)) * 100
    print(f"Returns Z-score anomalies: {returns_anomaly_count} ({returns_anomaly_pct:.2f}%)")

if 'volume_anomaly' in df.columns:
    volume_anomaly_count = df['volume_anomaly'].sum()
    volume_anomaly_pct = (volume_anomaly_count / len(df)) * 100
    print(f"Volume Z-score anomalies: {volume_anomaly_count} ({volume_anomaly_pct:.2f}%)")

print(f"\nDataset shape after anomaly detection: {df.shape}")

Detecting anomalies...
Isolation Forest anomalies: 32543 (5.00%)
Returns Z-score anomalies: 1730 (0.27%)
Volume Z-score anomalies: 7726 (1.19%)

Dataset shape after anomaly detection: (650881, 107)


## 3. Market Regime Detection

In [4]:
def detect_market_regimes(df, window=50):
    """
    Detect market regimes (bull, bear, sideways) based on price trends.
    """
    df_result = df.copy()
    
    if 'close' not in df.columns:
        print("Close price not available for regime detection")
        return df_result
    
    # Calculate rolling statistics
    df_result['price_sma'] = df_result['close'].rolling(window=window).mean()
    df_result['price_trend'] = df_result['close'] / df_result['price_sma'] - 1
    
    # Calculate rolling volatility
    if 'returns' in df.columns:
        df_result['rolling_volatility'] = df_result['returns'].rolling(window=window).std()
    
    # Define regime thresholds
    bull_threshold = 0.05  # 5% above SMA
    bear_threshold = -0.05  # 5% below SMA
    
    # Classify regimes
    conditions = [
        df_result['price_trend'] > bull_threshold,
        df_result['price_trend'] < bear_threshold
    ]
    choices = ['bull', 'bear']
    
    df_result['market_regime'] = np.select(conditions, choices, default='sideways')
    
    # Create regime dummy variables
    df_result['regime_bull'] = (df_result['market_regime'] == 'bull').astype(int)
    df_result['regime_bear'] = (df_result['market_regime'] == 'bear').astype(int)
    df_result['regime_sideways'] = (df_result['market_regime'] == 'sideways').astype(int)
    
    return df_result

print("Detecting market regimes...")

# Apply regime detection
df = detect_market_regimes(df, window=50)

# Summary of regimes
if 'market_regime' in df.columns:
    regime_counts = df['market_regime'].value_counts()
    print("\nMarket Regime Distribution:")
    for regime, count in regime_counts.items():
        pct = (count / len(df)) * 100
        print(f"  {regime}: {count} ({pct:.1f}%)")

print(f"\nDataset shape after regime detection: {df.shape}")

Detecting market regimes...

Market Regime Distribution:
  sideways: 650881 (100.0%)

Dataset shape after regime detection: (650881, 114)


## 4. Advanced Feature Engineering

In [5]:
def create_interaction_features(df):
    """
    Create interaction features between different data types.
    """
    df_result = df.copy()
    
    # Price-Volume interactions
    if all(col in df.columns for col in ['returns', 'volume_ratio']):
        df_result['returns_volume_interaction'] = df_result['returns'] * df_result['volume_ratio']
        df_result['abs_returns_volume'] = abs(df_result['returns']) * df_result['volume_ratio']
    
    # Price-Sentiment interactions
    if all(col in df.columns for col in ['returns', 'primary_sentiment_mean']):
        df_result['returns_sentiment_interaction'] = df_result['returns'] * df_result['primary_sentiment_mean']
        df_result['sentiment_momentum'] = df_result['primary_sentiment_mean'] * df_result['returns'].rolling(5).mean()
    
    # Volume-Sentiment interactions
    if all(col in df.columns for col in ['volume_ratio', 'tweet_count']):
        df_result['volume_attention_interaction'] = df_result['volume_ratio'] * np.log1p(df_result['tweet_count'])
    
    return df_result

def create_time_features(df):
    """
    Create time-based features.
    """
    df_result = df.copy()
    
    if 'timestamp' in df.columns:
        # Extract time components
        df_result['hour'] = df_result['timestamp'].dt.hour
        df_result['day_of_week'] = df_result['timestamp'].dt.dayofweek
        df_result['month'] = df_result['timestamp'].dt.month
        
        # Cyclical encoding
        df_result['hour_sin'] = np.sin(2 * np.pi * df_result['hour'] / 24)
        df_result['hour_cos'] = np.cos(2 * np.pi * df_result['hour'] / 24)
        df_result['day_sin'] = np.sin(2 * np.pi * df_result['day_of_week'] / 7)
        df_result['day_cos'] = np.cos(2 * np.pi * df_result['day_of_week'] / 7)
        
        # Market session indicators
        df_result['is_weekend'] = (df_result['day_of_week'] >= 5).astype(int)
        df_result['is_business_hours'] = ((df_result['hour'] >= 9) & (df_result['hour'] <= 17)).astype(int)
    
    return df_result

def create_momentum_features(df):
    """
    Create momentum and trend features.
    """
    df_result = df.copy()
    
    if 'close' in df.columns:
        # Price momentum
        for period in [5, 10, 20]:
            df_result[f'momentum_{period}'] = df_result['close'] / df_result['close'].shift(period) - 1
    
    if 'returns' in df.columns:
        # Return momentum
        for period in [3, 5, 10]:
            df_result[f'return_momentum_{period}'] = df_result['returns'].rolling(period).mean()
        
        # Consecutive returns
        df_result['positive_return_streak'] = (df_result['returns'] > 0).astype(int)
        df_result['negative_return_streak'] = (df_result['returns'] < 0).astype(int)
    
    return df_result

print("Creating advanced features...")

# Apply feature engineering
df = create_interaction_features(df)
print(f"After interaction features: {df.shape}")

df = create_time_features(df)
print(f"After time features: {df.shape}")

df = create_momentum_features(df)
print(f"After momentum features: {df.shape}")

print(f"\nTotal features created: {len(df.columns)}")

Creating advanced features...
After interaction features: (650881, 119)
After time features: (650881, 128)
After momentum features: (650881, 136)

Total features created: 136


## 5. Feature Selection and Analysis

In [6]:
def analyze_feature_importance(df, target_col='volatility'):
    """
    Analyze feature importance using correlation.
    """
    # Identify feature columns
    exclude_cols = ['timestamp', 'market_regime']
    feature_cols = [col for col in df.columns if col not in exclude_cols and not col.startswith('future_')]
    
    # Prepare data
    X = df[feature_cols].select_dtypes(include=[np.number])
    
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found. Using 'returns' as fallback.")
        target_col = 'returns' if 'returns' in df.columns else X.columns[0]
    
    y = df[target_col]
    
    # Remove rows with missing target
    valid_idx = ~(X.isnull().all(axis=1) | y.isnull())
    X_clean = X.loc[valid_idx]
    y_clean = y.loc[valid_idx]
    
    if len(X_clean) == 0:
        print("No valid data for feature analysis")
        return
    
    # Fill remaining missing values
    X_clean = X_clean.fillna(X_clean.median())
    
    # Calculate correlations
    correlations = X_clean.corrwith(y_clean).abs().sort_values(ascending=False)
    
    # Display top features
    print(f"\nTop 20 Features by Correlation with {target_col}:")
    print("=" * 50)
    for i, (feature, corr) in enumerate(correlations.head(20).items(), 1):
        print(f"{i:2d}. {feature:<30} {corr:.4f}")
    
    return correlations

# Analyze feature importance
if 'volatility' in df.columns:
    correlations = analyze_feature_importance(df, 'volatility')
else:
    correlations = analyze_feature_importance(df, 'returns')

# Data quality summary
print("\n" + "=" * 60)
print("DATA QUALITY SUMMARY")
print("=" * 60)
print(f"Total samples: {len(df):,}")
print(f"Total features: {len(df.columns)}")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

# Missing values summary
missing_pct = (df.isnull().sum() / len(df)) * 100
missing_features = missing_pct[missing_pct > 0].sort_values(ascending=False)

if len(missing_features) > 0:
    print(f"\nFeatures with missing values:")
    for feature, pct in missing_features.head(10).items():
        print(f"  {feature}: {pct:.1f}%")
else:
    print("\nNo missing values detected!")

print("\n✅ Feature engineering complete!")


Top 20 Features by Correlation with returns:
 1. returns                        1.0000
 2. log_returns                    1.0000
 3. negative_return_streak         0.7981
 4. positive_return_streak         0.7981
 5. returns_volume_interaction     0.7251
 6. return_momentum_3              0.5771
 7. bb_position                    0.4494
 8. returns_rolling_mean_5         0.4477
 9. return_momentum_5              0.4477
10. momentum_5                     0.4477
11. williams_r                     0.4413
12. stoch_k                        0.4413
13. cci                            0.4332
14. rsi_14                         0.3591
15. returns_rolling_mean_10        0.3164
16. return_momentum_10             0.3164
17. momentum_10                    0.3164
18. returns_rolling_min_5          0.2994
19. returns_rolling_max_5          0.2983
20. rsi_30                         0.2519

DATA QUALITY SUMMARY
Total samples: 650,881
Total features: 136
Date range: 2018-01-01 00:00:00 to 2019-03-29 00:

## 6. Save Enhanced Dataset

In [7]:
# Save the enhanced dataset
output_path = processed_data_dir / 'enhanced_features.csv'
df.to_csv(output_path, index=False)
print(f"Enhanced dataset saved to: {output_path}")

# Create feature categories for documentation
feature_categories = {
    'price_features': [col for col in df.columns if any(x in col.lower() for x in ['open', 'high', 'low', 'close', 'price'])],
    'volume_features': [col for col in df.columns if 'volume' in col.lower()],
    'returns_features': [col for col in df.columns if 'return' in col.lower()],
    'volatility_features': [col for col in df.columns if 'volatility' in col.lower() or 'vol' in col.lower()],
    'sentiment_features': [col for col in df.columns if 'sentiment' in col.lower() or 'tweet' in col.lower()],
    'anomaly_features': [col for col in df.columns if 'anomaly' in col.lower() or 'z_score' in col.lower()],
    'regime_features': [col for col in df.columns if 'regime' in col.lower()],
    'time_features': [col for col in df.columns if any(x in col.lower() for x in ['hour', 'day', 'month', 'weekend', 'business'])],
    'momentum_features': [col for col in df.columns if 'momentum' in col.lower() or 'streak' in col.lower()],
    'interaction_features': [col for col in df.columns if 'interaction' in col.lower()]
}

# Save feature categories
categories_summary = {}
for category, features in feature_categories.items():
    categories_summary[category] = len(features)
    
categories_df = pd.DataFrame([categories_summary])
categories_path = processed_data_dir / 'feature_categories.csv'
categories_df.to_csv(categories_path, index=False)
print(f"Feature categories saved to: {categories_path}")

print("\n=== FEATURE ENGINEERING SUMMARY ===")
for category, count in categories_summary.items():
    print(f"{category}: {count} features")

print(f"\nTotal engineered features: {sum(categories_summary.values())}")
print("\n🎯 Ready for modeling!")

Enhanced dataset saved to: data/processed/enhanced_features.csv
Feature categories saved to: data/processed/feature_categories.csv

=== FEATURE ENGINEERING SUMMARY ===
price_features: 13 features
volume_features: 21 features
returns_features: 30 features
volatility_features: 40 features
sentiment_features: 5 features
anomaly_features: 6 features
regime_features: 4 features
time_features: 9 features
momentum_features: 9 features
interaction_features: 3 features

Total engineered features: 140

🎯 Ready for modeling!
