# Exploratory Data Analysis (EDA)
## Crypto Volatility Detection

This notebook analyzes the features we computed and determines the volatility spike threshold.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Imports successful!")

Imports successful!


## 1. Load Feature Data

In [None]:
# Read the engineered feature table and parse `timestamp` into a datetime column
features_path = '../data/processed/features.parquet'
df = pd.read_parquet(features_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Quick sanity summary: row count, covered time span, available columns
first_ts, last_ts = df['timestamp'].min(), df['timestamp'].max()
print(f"Loaded {len(df)} rows")
print(f"Time range: {first_ts} to {last_ts}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

In [None]:
# Basic statistics: summary table from DataFrame.describe() using its default column selection
df.describe()

In [None]:
# Per-column null counts and their share of all rows, worst columns first
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = (
    pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})
    .sort_values('Missing', ascending=False)
)

# Only report columns that actually contain nulls
print("Missing Values:")
print(missing_df.loc[missing_df['Missing'] > 0])

## 2. Price and Return Analysis

In [None]:
# Two stacked time-series panels: midprice on top, bid-ask spread below
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# (column, title, y-axis label, extra line styling) for each panel, top to bottom
panel_specs = [
    ('price', 'Midprice Over Time', 'Price (USD)', {}),
    ('spread_bps', 'Bid-Ask Spread Over Time (basis points)', 'Spread (bps)', {'color': 'orange'}),
]

for panel_ax, (column, title, y_label, line_style) in zip(axes, panel_specs):
    panel_ax.plot(df['timestamp'], df[column], linewidth=0.5, alpha=0.7, **line_style)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Time')
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Analyze returns: one time-series panel per rolling return feature (mean / std)
return_cols = [col for col in df.columns if 'return_mean' in col or 'return_std' in col]

if not return_cols:
    # plt.subplots() rejects a zero-row grid, so report the absence instead of crashing
    print("No 'return_mean' / 'return_std' columns found - skipping return plots")
else:
    # squeeze=False always yields a 2-D array of Axes, so a single panel
    # needs no special-casing before indexing
    fig, axes = plt.subplots(
        len(return_cols), 1,
        figsize=(14, 4 * len(return_cols)),
        squeeze=False,
    )
    axes = axes.ravel()

    for i, col in enumerate(return_cols):
        axes[i].plot(df['timestamp'], df[col], linewidth=0.5, alpha=0.7)
        axes[i].set_title(f'{col}', fontsize=12, fontweight='bold')
        axes[i].set_xlabel('Time')
        axes[i].set_ylabel('Value')
        axes[i].grid(True, alpha=0.3)
        # Zero reference line to make the sign of the series easy to read
        axes[i].axhline(y=0, color='r', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

## 3. Volatility Analysis & Threshold Selection

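We need to compute **forward-looking volatility** — the standard deviation of tick-to-tick returns over the next 60 seconds, approximated by a fixed number of ticks — to create our labels.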

In [None]:
# Compute forward-looking volatility (our target)
# We'll use 60-second horizon as specified in the assignment

HORIZON_SECONDS = 60

# Sort by timestamp so that "later rows" really are later in time
df = df.sort_values('timestamp').reset_index(drop=True)

# Tick-to-tick simple returns; the first row is NaN because it has no previous price
df['price_pct_change'] = df['price'].pct_change()

# The horizon is approximated by a fixed number of ticks, derived from the
# average tick rate over the whole dataset (the true rate varies with market activity)
time_diff = (df['timestamp'].max() - df['timestamp'].min()).total_seconds()
if not time_diff > 0:
    # Covers an empty frame (NaN span) and a frame whose timestamps are all identical
    raise ValueError(
        "Cannot estimate tick rate: the data must span a positive amount of time"
    )
ticks_per_second = len(df) / time_diff
# A standard deviation needs at least two observations, so never go below 2 ticks
window_size = max(2, int(ticks_per_second * HORIZON_SECONDS))

print(f"Average ticks per second: {ticks_per_second:.2f}")
print(f"Window size for {HORIZON_SECONDS}s: {window_size} ticks")

# Forward-looking volatility: std of the NEXT `window_size` returns.
# The trailing rolling std at row i + window_size covers returns i+1 .. i+window_size,
# so shifting it back by window_size aligns that window with row i.
# Rolling first and shifting second keeps the earliest rows valid; shifting first
# would also blank the first window_size - 1 rows while the window fills up.
df['future_volatility'] = (
    df['price_pct_change']
    .rolling(window=window_size)
    .std()
    .shift(-window_size)
)

# Drop NaN values at the end (the last window_size rows have no complete future window)
df_clean = df.dropna(subset=['future_volatility']).copy()

print(f"\nAfter computing future volatility: {len(df_clean)} valid rows")

In [None]:
# Plot distribution of future volatility: histogram (left) and box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes
future_vol = df_clean['future_volatility']

# Histogram
hist_ax.hist(future_vol, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribution of Future Volatility', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Volatility (std of returns)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True, alpha=0.3)

# Box plot. Vertical is boxplot's default orientation; the `vert` keyword is
# deprecated in recent Matplotlib releases, so it is not passed explicitly.
box_ax.boxplot(future_vol)
box_ax.set_title('Future Volatility Box Plot', fontsize=14, fontweight='bold')
box_ax.set_ylabel('Volatility')
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Candidate thresholds: volatility value at each percentile and how many rows reach it
percentiles = [50, 75, 80, 85, 90, 95, 99]
future_vol = df_clean['future_volatility']
percentile_values = {p: np.percentile(future_vol, p) for p in percentiles}

print("Volatility Percentiles:")
print("-" * 40)
for p, value in percentile_values.items():
    # Rows at or above the candidate threshold would be labelled as spikes
    count = (future_vol >= value).sum()
    pct = (count / len(df_clean)) * 100
    print(f"  {p}th percentile: {value:.6f} ({count} spikes, {pct:.1f}%)")

# Overlay every candidate threshold on the volatility time series
plt.figure(figsize=(12, 6))
plt.plot(df_clean['timestamp'], future_vol,
         linewidth=0.5, alpha=0.5, label='Future Volatility')

# One colour per percentile, sampled along the YlOrRd colormap
colors = plt.cm.YlOrRd(np.linspace(0.3, 1, len(percentiles)))
for line_color, (p, value) in zip(colors, percentile_values.items()):
    plt.axhline(y=value, color=line_color, linestyle='--',
                label=f'{p}th percentile: {value:.6f}', alpha=0.8)

plt.title('Future Volatility with Percentile Thresholds', fontsize=14, fontweight='bold')
plt.xlabel('Time')
plt.ylabel('Volatility')
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. Select Threshold & Create Labels

In [None]:
# Spike threshold = a high percentile of future volatility (90th/95th are the
# usual picks for rare events). Tune THRESHOLD_PERCENTILE to your data.
# NOTE(review): the percentile is taken over every row of df_clean; if a later
# train/test split is time-based, consider deriving it from the training window only.
THRESHOLD_PERCENTILE = 90
THRESHOLD = np.percentile(df_clean['future_volatility'], THRESHOLD_PERCENTILE)

print(f"Selected Threshold: {THRESHOLD:.6f} ({THRESHOLD_PERCENTILE}th percentile)")

# Binary target: 1 when future volatility reaches the threshold, else 0
df_clean['label'] = df_clean['future_volatility'].ge(THRESHOLD).astype(int)

# Class balance of the resulting labels
label_counts = df_clean['label'].value_counts()
n_rows = len(df_clean)
n_no_spike = label_counts.get(0, 0)
n_spike = label_counts.get(1, 0)
print("\nClass Distribution:")
print(f"  No Spike (0): {n_no_spike} ({n_no_spike/n_rows*100:.1f}%)")
print(f"  Spike (1):    {n_spike} ({n_spike/n_rows*100:.1f}%)")

In [None]:
# Scatter of future volatility over time, coloured by label, with the threshold line
fig, ax = plt.subplots(figsize=(14, 6))

# Split rows by class so each gets its own marker style
no_spike = df_clean.loc[df_clean['label'] == 0]
spike = df_clean.loc[df_clean['label'] == 1]

# (rows, legend label, colour, marker size, opacity); spikes are drawn last and larger
scatter_groups = [
    (no_spike, 'No Spike', 'blue', 1, 0.3),
    (spike, 'Spike', 'red', 3, 0.7),
]
for rows, legend_label, point_color, point_size, opacity in scatter_groups:
    ax.scatter(rows['timestamp'], rows['future_volatility'],
               s=point_size, alpha=opacity, label=legend_label, color=point_color)

ax.axhline(y=THRESHOLD, color='red', linestyle='--',
           label=f'Threshold: {THRESHOLD:.6f}', linewidth=2)

ax.set_title('Volatility Spikes Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Time')
ax.set_ylabel('Future Volatility')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Feature Correlation Analysis

In [None]:
# Select numeric features for correlation.
# Columns are matched by name fragment, but only among numeric dtypes:
# DataFrame.corr() raises on non-numeric columns in pandas 2.x.
feature_keywords = ('return', 'spread', 'tick_count', 'price_std')
numeric_cols = df_clean.select_dtypes(include='number').columns
feature_cols = [col for col in numeric_cols
                if any(keyword in col for keyword in feature_keywords)]
# Include the target so its correlation with every feature shows up in the matrix
feature_cols.append('future_volatility')

# Correlation matrix
corr_matrix = df_clean[feature_cols].corr()

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Save Labeled Dataset

In [None]:
# Persist the labelled frame (features plus `future_volatility` and `label`) as parquet.
# NOTE(review): `future_volatility` is derived from future prices - confirm it is
# excluded from model inputs downstream.
output_path = '../data/processed/features_labeled.parquet'
df_clean.to_parquet(output_path, index=False)
print(
    f"✓ Saved labeled dataset to {output_path}",
    f"  Shape: {df_clean.shape}",
    sep="\n",
)

## Summary

**Key Findings:**
- Dataset size: [fill in]
- Time range: [fill in]
- Selected threshold (τ): [fill in]
- Spike rate: [fill in]%
- Most correlated features with future volatility: [fill in]

**Next Steps:**
1. Document these findings in `docs/feature_spec.md`
2. Generate Evidently report
3. Move to Milestone 3: Model training