# Feature Engineering Analysis

This notebook computes technical indicators from cryptocurrency price data and evaluates them as candidate features for price prediction.

## Contents
1. Setup and Data Loading
2. Technical Indicators Calculation
3. Feature Visualization
4. Feature Importance Analysis
5. Feature Selection
6. Feature Engineering Pipeline Testing


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler

# Import our feature engineering module
from src.preprocessing.feature_engineering import FeatureEngineer


## 1. Setup and Data Loading


In [None]:
# Load data from previous analysis.
# The path is relative, so it is resolved against the notebook's working directory.
data_path = 'data/raw/bitcoin_binance_latest.csv'
# Use the 'timestamp' column as the index; parse_dates=True asks pandas to
# parse that index as dates.
df = pd.read_csv(data_path, index_col='timestamp', parse_dates=True)

# Initialize the feature engineer. This is a notebook-level global:
# calculate_all_indicators() reads `engineer` directly rather than taking it
# as an argument.
engineer = FeatureEngineer()

# Display data info: print the (rows, columns) shape, then show the first
# rows as the cell output.
print("Data Shape:", df.shape)
df.head()


## 2. Technical Indicators Calculation


In [None]:
def calculate_all_indicators(df):
    """Add the technical-indicator columns to ``df`` and return it.

    Every indicator is computed from ``df['close']`` by the notebook-level
    ``engineer`` object. The frame is modified in place (columns are
    assigned onto it) and the same object is returned, so pass a copy when
    the original must stay untouched.

    Columns written, in order: ``rsi``, ``macd``, ``macd_signal``,
    ``macd_hist``, ``bb_upper``, ``bb_middle``, ``bb_lower``, ``sma_20``,
    ``sma_50``, ``roc``, ``momentum``.

    NOTE(review): window lengths and other parameters are whatever the
    ``engineer`` methods default to -- confirm against FeatureEngineer.
    """
    close = df['close']

    # Relative strength index: a single series
    df['rsi'] = engineer.calculate_rsi(close)

    # MACD: three results -> line, signal line, histogram
    df['macd'], df['macd_signal'], df['macd_hist'] = engineer.calculate_macd(close)

    # Bollinger Bands: three results -> upper, middle, lower
    df['bb_upper'], df['bb_middle'], df['bb_lower'] = (
        engineer.calculate_bollinger_bands(close)
    )

    # Moving averages: two results
    df['sma_20'], df['sma_50'] = engineer.calculate_moving_averages(close)

    # Momentum indicators: two results -> rate of change, momentum
    df['roc'], df['momentum'] = engineer.calculate_momentum_indicators(close)

    return df

# Calculate indicators on a copy: calculate_all_indicators() assigns new
# columns onto the frame it receives, so copying keeps the raw `df` unchanged
# for later cells.
df_indicators = calculate_all_indicators(df.copy())
# Show the first rows, now including the indicator columns, as the cell output.
df_indicators.head()


## 3. Feature Visualization


In [None]:
def plot_technical_indicators(df):
    """Build a four-row plotly figure of the price and its indicators.

    Rows, top to bottom: close price with both moving averages, RSI with
    dashed reference lines at 70 and 30, MACD with its signal line and
    histogram bars, and close price between the upper and lower Bollinger
    Bands.

    Args:
        df: DataFrame indexed by the x-axis values and holding the columns
            ``close``, ``sma_20``, ``sma_50``, ``rsi``, ``macd``,
            ``macd_signal``, ``macd_hist``, ``bb_upper`` and ``bb_lower``.

    Returns:
        The assembled figure; the caller decides when to show it.
    """
    panel_titles = ('Price and Moving Averages',
                    'RSI',
                    'MACD',
                    'Bollinger Bands')
    fig = make_subplots(rows=4, cols=1, subplot_titles=panel_titles)
    x_values = df.index

    def add_line(column, label, panel):
        # One line trace for `column`, placed in the given subplot row.
        fig.add_trace(go.Scatter(x=x_values, y=df[column], name=label),
                      row=panel, col=1)

    # Row 1: price with its moving averages
    for column, label in (('close', 'Price'),
                          ('sma_20', 'SMA20'),
                          ('sma_50', 'SMA50')):
        add_line(column, label, 1)

    # Row 2: RSI. The trace goes in before the reference lines on purpose:
    # add_hline skips subplots that hold no data by default.
    add_line('rsi', 'RSI', 2)
    for level, colour in ((70, 'red'), (30, 'green')):
        fig.add_hline(y=level, line_dash='dash', line_color=colour, row=2, col=1)

    # Row 3: MACD line, signal line, histogram as bars
    add_line('macd', 'MACD', 3)
    add_line('macd_signal', 'Signal', 3)
    fig.add_trace(go.Bar(x=x_values, y=df['macd_hist'], name='Histogram'),
                  row=3, col=1)

    # Row 4: price between the outer Bollinger Bands
    for column, label in (('close', 'Price'),
                          ('bb_upper', 'Upper BB'),
                          ('bb_lower', 'Lower BB')):
        add_line(column, label, 4)

    fig.update_layout(height=1200, showlegend=True)
    return fig

# Build the four-row indicator figure from the enriched frame and render it.
fig = plot_technical_indicators(df_indicators)
fig.show()


## 4. Feature Importance Analysis


In [None]:
def analyze_feature_importance(df):
    """Score candidate features against the next row's return.

    Rows containing a NaN in any column are dropped first. The target is the
    percentage change of ``close`` from each row to the following one, and
    every feature is scored independently with a univariate F-test
    (``f_regression``).

    Args:
        df: DataFrame with a ``close`` column plus the feature columns
            ``rsi``, ``macd``, ``macd_signal``, ``roc``, ``momentum`` and
            ``volume``.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` (the F-score),
        sorted from the highest score to the lowest.
    """
    # Drop every row that has a missing value in any column
    clean = df.dropna()

    # Target: the return realised over the following row. Whether that is a
    # "day" depends on the sampling interval of the data, which is not fixed here.
    target = clean['close'].pct_change().shift(-1)

    # Keep only rows whose target is a finite number. This always removes the
    # last row (it has no following row) and also removes infinite returns
    # produced by a zero close price. Applying one mask to both features and
    # target keeps them row-aligned no matter which rows are discarded.
    usable = np.isfinite(target)

    # Select features
    feature_cols = ['rsi', 'macd', 'macd_signal', 'roc', 'momentum', 'volume']
    X = clean[usable][feature_cols]
    y = target[usable]

    # k='all' keeps every feature; the selector is used only for its scores
    selector = SelectKBest(score_func=f_regression, k='all')
    selector.fit(X, y)

    # Create importance DataFrame, best feature first
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': selector.scores_
    })
    importance_df = importance_df.sort_values('importance', ascending=False)

    return importance_df

# Score the candidate features, then draw a horizontal bar chart:
# score on the x-axis, one bar per feature.
importance_df = analyze_feature_importance(df_indicators)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('Feature Importance Analysis')
plt.show()


## 5. Feature Selection


In [None]:
def select_features(df, importance_df, threshold=0.5):
    """Select the features whose relative importance exceeds ``threshold``.

    Each score is divided by the largest score, so the top feature has a
    relative importance of 1.0; features strictly above ``threshold`` are
    kept.

    Args:
        df: Not used by the selection; kept so existing calls keep working.
        importance_df: DataFrame with ``feature`` and ``importance`` columns.
            It is left unmodified.
        threshold: Cut-off on the normalised score (strict ``>`` comparison).

    Returns:
        List of selected feature names, in the row order of
        ``importance_df``. Empty when no normalised score exceeds the
        threshold (including when the largest score is 0 or NaN).
    """
    scores = importance_df['importance']

    # Normalise into a local Series instead of adding a column, so the
    # caller's DataFrame is not changed as a side effect.
    normalized = scores / scores.max()

    # Select features above threshold
    selected_features = importance_df[normalized > threshold]['feature'].tolist()

    return selected_features

# Keep the features that pass the default threshold (0.5) and print their names.
selected_features = select_features(df_indicators, importance_df)
print("Selected features:", selected_features)


## 6. Feature Engineering Pipeline Testing


In [None]:
def test_feature_pipeline(df, selected_features):
    """Run the feature pipeline end to end and return the scaled features.

    Steps: copy ``df``, compute all indicators on the copy, keep only the
    ``selected_features`` columns, then min-max scale them.

    Args:
        df: Raw input frame; it is copied, so the caller's frame is not
            modified.
        selected_features: Column names to keep after the indicators have
            been added.

    Returns:
        DataFrame of scaled values with the same index and column labels as
        the selected subset.

    NOTE(review): the scaler is fitted on the same rows it transforms; if
    this output feeds a train/test split, fit on the training part only.
    """
    # Indicators are computed on a private copy of the input
    enriched = calculate_all_indicators(df.copy())

    # Narrow down to the chosen columns
    subset = enriched[selected_features].copy()

    # fit_transform returns a bare array, so re-attach the labels
    scaled_values = MinMaxScaler().fit_transform(subset)
    return pd.DataFrame(
        scaled_values,
        index=subset.index,
        columns=subset.columns,
    )

# Test pipeline: pass the raw `df`, not `df_indicators`, because
# test_feature_pipeline() recomputes the indicators on its own copy.
final_features = test_feature_pipeline(df, selected_features)
print("Final feature shape:", final_features.shape)
# Show the first scaled rows as the cell output.
final_features.head()
