# 📊 Exploratory Data Analysis

**Project**: Cryptocurrency Market Intelligence System  
**Author**: [Your Name]  
**Course**: INSY 8413 | Introduction to Big Data Analytics  
**Date**: July 26, 2025

## 🎯 Objectives
1. Analyze price trends and patterns across cryptocurrencies
2. Examine trading volume and market behavior
3. Study correlations between different cryptocurrencies
4. Investigate volatility patterns and risk characteristics
5. Identify key insights for machine learning model development

## 📈 Analysis Areas
- **Price Analysis**: Trends, distributions, and movements
- **Volume Analysis**: Trading activity and liquidity
- **Volatility Analysis**: Risk assessment and market stress
- **Correlation Analysis**: Inter-cryptocurrency relationships
- **Technical Indicators**: RSI, MACD, Bollinger Bands analysis

In [None]:
# Import required libraries
import sys
import os

# Add src directory to path
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
from utils import CRYPTO_SYMBOLS, load_data, calculate_correlation_matrix

# Set plotting style.
# 'seaborn-v0_8' is the name Matplotlib >= 3.6 gives its bundled seaborn style
# sheet; older releases only know it as 'seaborn'. Pick whichever this install
# provides so the first cell does not fail with OSError on an older Matplotlib.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")
# Default size for figures that do not pass their own figsize. Set after
# style.use() so the style sheet cannot override it.
plt.rcParams['figure.figsize'] = (15, 8)

print("📚 Libraries imported successfully!")
print(f"🕐 EDA started at: {datetime.now()}")

## 📊 Load Processed Data

In [None]:
# Load all processed data
SYMBOLS = ['BTC', 'ETH', 'BNB', 'ADA', 'SOL']
INTERVALS = ['5m', '1h']

# Nested mapping: crypto_data[symbol][interval] -> feature DataFrame.
# Every symbol gets an entry; an interval key is present only when its file
# loaded with at least one row, so later cells must check for the key.
crypto_data = {}

print("📊 Loading processed cryptocurrency data...")
print("=" * 50)

for symbol in SYMBOLS:
    crypto_data[symbol] = {}

    for interval in INTERVALS:
        filename = f"{symbol}_{interval}_features.csv"
        # NOTE(review): load_data is treated here as returning either a
        # DataFrame or None - confirm against utils.load_data.
        df = load_data(filename, '../data/processed')

        if df is None or df.empty:
            print(f"❌ No data found for {symbol} {interval}")
            continue

        # Make sure the frame is indexed by timestamps: later cells group by
        # df.index.hour, which only works on a DatetimeIndex.
        if 'datetime' in df.columns:
            df['datetime'] = pd.to_datetime(df['datetime'])
            df = df.set_index('datetime')
        elif not isinstance(df.index, pd.DatetimeIndex):
            # Check the index type rather than its name, so an index that is
            # called 'datetime' but still holds strings is converted too.
            try:
                df.index = pd.to_datetime(df.index)
            except (ValueError, TypeError, OverflowError):
                # Best effort: keep the frame with its original index, but do
                # not swallow unrelated errors such as KeyboardInterrupt.
                print(f"⚠️ Could not convert index to datetime for {symbol} {interval}")

        crypto_data[symbol][interval] = df
        print(f"✅ {symbol} {interval}: {len(df):,} records, {len(df.columns)} features")

print("\n📈 Data loading completed!")

## 💰 Price Analysis

Let's start with a comprehensive price analysis across all cryptocurrencies.

In [None]:
# Create comprehensive price analysis: a 2x3 grid of panels, all built from
# the 1h frames in crypto_data.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Cryptocurrency Price Analysis (6 Months)', fontsize=16, fontweight='bold')

# Symbols that actually have a 1h frame loaded. Panels that draw one item per
# symbol iterate over this list so x-values and heights always line up.
symbols_1h = [s for s in SYMBOLS if '1h' in crypto_data.get(s, {})]
# One colour per entry of SYMBOLS, so a symbol keeps the same bar colour even
# when another symbol's data is missing.
symbol_colors = dict(zip(SYMBOLS, sns.color_palette("husl", len(SYMBOLS))))

# Plot 1: Price trends (normalized to 100 for comparison)
ax1 = axes[0, 0]
for symbol in symbols_1h:
    df = crypto_data[symbol]['1h']
    # Rebase each series on its first close so differently priced assets share one axis
    normalized_price = (df['close'] / df['close'].iloc[0]) * 100
    ax1.plot(df.index, normalized_price, label=symbol, linewidth=2)

ax1.set_title('Normalized Price Trends (Base = 100)', fontweight='bold')
ax1.set_ylabel('Normalized Price')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.axhline(y=100, color='black', linestyle='--', alpha=0.5)

# Plot 2: Price distribution (log scale)
ax2 = axes[0, 1]
labels = list(symbols_1h)
price_data = [crypto_data[symbol]['1h']['close'] for symbol in labels]

if price_data:
    # boxplot(labels=...) is deprecated since Matplotlib 3.9 (renamed to
    # tick_labels); labelling the ticks afterwards works on every version.
    ax2.boxplot(price_data)
    ax2.set_xticklabels(labels)
ax2.set_title('Price Distribution', fontweight='bold')
ax2.set_ylabel('Price (USD)')
ax2.set_yscale('log')
ax2.grid(True, alpha=0.3)

# Plot 3: Returns distribution
# NOTE(review): the data comes from the 'returns_1d' column of the 1h frame
# while the panel title says "Hourly" - confirm which horizon the column
# holds and align the title with it.
ax3 = axes[0, 2]
for symbol in symbols_1h:
    returns = crypto_data[symbol]['1h']['returns_1d'].dropna() * 100
    ax3.hist(returns, bins=50, alpha=0.6, label=symbol, density=True)

ax3.set_title('Hourly Returns Distribution', fontweight='bold')
ax3.set_xlabel('Returns (%)')
ax3.set_ylabel('Density')
ax3.legend()
ax3.grid(True, alpha=0.3)
ax3.axvline(x=0, color='black', linestyle='--', alpha=0.5)

# Plot 4: Volatility comparison
ax4 = axes[1, 0]
volatility_data = [
    crypto_data[symbol]['1h']['volatility_30d'].mean() * 100
    for symbol in symbols_1h
]

# Use symbols_1h (not SYMBOLS) as x: heights exist only for loaded symbols,
# and a length mismatch would make bar() raise.
bars = ax4.bar(symbols_1h, volatility_data,
               color=[symbol_colors[symbol] for symbol in symbols_1h])
ax4.set_title('Average 30-Day Volatility', fontweight='bold')
ax4.set_ylabel('Volatility (%)')

# Add value labels on bars
for bar, value in zip(bars, volatility_data):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
             f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

# Plot 5: Price vs Volume relationship (BTC example)
ax5 = axes[1, 1]
if 'BTC' in crypto_data and '1h' in crypto_data['BTC']:
    btc_data = crypto_data['BTC']['1h']
    # Each point is one 1h row, coloured by that row's price_change_pct
    scatter = ax5.scatter(btc_data['volume'], btc_data['close'],
                         c=btc_data['price_change_pct'], cmap='RdYlGn', alpha=0.6)
    ax5.set_title('BTC: Price vs Volume', fontweight='bold')
    ax5.set_xlabel('Volume')
    ax5.set_ylabel('Price (USD)')
    plt.colorbar(scatter, ax=ax5, label='Price Change %')

# Plot 6: Market activity comparison (mean price x mean volume).
# This is a rough activity proxy, not a market capitalisation.
ax6 = axes[1, 2]
market_caps = []
for symbol in SYMBOLS:
    if symbol in crypto_data and '1h' in crypto_data[symbol]:
        avg_price = crypto_data[symbol]['1h']['close'].mean()
        avg_volume = crypto_data[symbol]['1h']['volume'].mean()
        approx_market_activity = avg_price * avg_volume
        market_caps.append(approx_market_activity)
    else:
        # Keep one entry per symbol so the bars stay aligned with SYMBOLS
        market_caps.append(0)

bars = ax6.bar(SYMBOLS, market_caps, color=sns.color_palette("husl", len(SYMBOLS)))
ax6.set_title('Market Activity (Price × Volume)', fontweight='bold')
ax6.set_ylabel('Market Activity (USD)')
ax6.set_yscale('log')

plt.tight_layout()
plt.show()

## 📊 Volume and Trading Analysis

In [None]:
# Volume and trading analysis: a 2x2 grid of panels built from the 1h frames.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Trading Volume and Market Activity Analysis', fontsize=16, fontweight='bold')

# Plot 1: Volume trends over time
ax1 = axes[0, 0]
for symbol in SYMBOLS:
    if symbol in crypto_data and '1h' in crypto_data[symbol]:
        df = crypto_data[symbol]['1h']
        # Smooth with a 168-row rolling mean (7 days * 24 hours). The window
        # counts rows, so it spans exactly 7 days only if the series has no gaps.
        volume_ma = df['volume'].rolling(window=7*24).mean()
        ax1.plot(df.index, volume_ma, label=symbol, linewidth=2)

ax1.set_title('Trading Volume Trends (7-Day Moving Average)', fontweight='bold')
ax1.set_ylabel('Volume')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_yscale('log')

# Plot 2: Volume distribution
ax2 = axes[0, 1]
volume_data = []
labels = []
for symbol in SYMBOLS:
    if symbol in crypto_data and '1h' in crypto_data[symbol]:
        volume_data.append(crypto_data[symbol]['1h']['volume'])
        labels.append(symbol)

if volume_data:
    # boxplot(labels=...) is deprecated since Matplotlib 3.9 (renamed to
    # tick_labels); labelling the ticks afterwards works on every version.
    ax2.boxplot(volume_data)
    ax2.set_xticklabels(labels)
ax2.set_title('Volume Distribution by Cryptocurrency', fontweight='bold')
ax2.set_ylabel('Volume')
ax2.set_yscale('log')
ax2.grid(True, alpha=0.3)

# Plot 3: Volume vs Price Change correlation
ax3 = axes[1, 0]
if 'BTC' in crypto_data and '1h' in crypto_data['BTC']:
    btc_data = crypto_data['BTC']['1h']
    # Trim the extremes for readability: drop the top 1% of volume and both
    # 1% tails of price change. Rows with NaN in either column fail the
    # comparisons and are dropped as well.
    volume_q99 = btc_data['volume'].quantile(0.99)
    price_change_q99 = btc_data['price_change_pct'].quantile(0.99)
    price_change_q01 = btc_data['price_change_pct'].quantile(0.01)

    filtered_data = btc_data[
        (btc_data['volume'] <= volume_q99) &
        (btc_data['price_change_pct'] <= price_change_q99) &
        (btc_data['price_change_pct'] >= price_change_q01)
    ]

    ax3.scatter(filtered_data['volume'], filtered_data['price_change_pct'],
               alpha=0.5, color='orange')
    ax3.set_title('BTC: Volume vs Price Change', fontweight='bold')
    ax3.set_xlabel('Volume')
    ax3.set_ylabel('Price Change (%)')
    ax3.grid(True, alpha=0.3)
    ax3.axhline(y=0, color='black', linestyle='--', alpha=0.5)

# Plot 4: Average daily trading patterns (hour of day)
ax4 = axes[1, 1]
if 'BTC' in crypto_data and '1h' in crypto_data['BTC']:
    btc_data = crypto_data['BTC']['1h']
    # Average each metric over all rows that share the same hour of day.
    # Requires a DatetimeIndex; the axis label below assumes it is in UTC.
    hourly_volume = btc_data.groupby(btc_data.index.hour)['volume'].mean()
    hourly_volatility = btc_data.groupby(btc_data.index.hour)['volatility_7d'].mean() * 100

    # Second y-axis: volume and volatility live on very different scales
    ax4_twin = ax4.twinx()

    line1 = ax4.plot(hourly_volume.index, hourly_volume.values,
                     color='blue', marker='o', label='Volume')
    line2 = ax4_twin.plot(hourly_volatility.index, hourly_volatility.values,
                          color='red', marker='s', label='Volatility')

    ax4.set_title('BTC: Hourly Trading Patterns', fontweight='bold')
    ax4.set_xlabel('Hour of Day (UTC)')
    ax4.set_ylabel('Average Volume', color='blue')
    ax4_twin.set_ylabel('Average Volatility (%)', color='red')

    # Combine legends: lines on twin axes do not share a legend automatically.
    # Note this rebinds `labels` from the box-plot labels to the legend labels.
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax4.legend(lines, labels, loc='upper right')

    ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 🔗 Correlation Analysis

Understanding how different cryptocurrencies move together is crucial for portfolio management and risk assessment.

In [None]:
# Correlation analysis
print("🔗 CRYPTOCURRENCY CORRELATION ANALYSIS")
print("=" * 50)


def _print_pairwise(corr):
    """Print every unordered pair of columns in *corr* once, with its coefficient.

    Pairs are listed in column order, each formatted as ``A-B: 0.123``.
    Nothing is printed when *corr* has fewer than two columns.
    """
    names = list(corr.columns)
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            print(f"{first}-{second}: {corr.loc[first, second]:.3f}")


# Collect the 1h close and returns series of every loaded symbol
price_data = {}
returns_data = {}

for symbol in SYMBOLS:
    if symbol in crypto_data and '1h' in crypto_data[symbol]:
        df = crypto_data[symbol]['1h']
        price_data[symbol] = df['close']
        returns_data[symbol] = df['returns_1d']

# Combine into DataFrames (one column per symbol, rows aligned on the index)
price_df = pd.DataFrame(price_data)
returns_df = pd.DataFrame(returns_data)

# Calculate correlation matrices (DataFrame.corr defaults to Pearson)
price_corr = price_df.corr()
returns_corr = returns_df.corr()

if price_corr.empty:
    # No symbol had 1h data: there is nothing to draw, and the heatmap call
    # cannot handle an empty matrix.
    print("⚠️ No 1h data available - skipping correlation plots")
else:
    # Create correlation visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    fig.suptitle('Cryptocurrency Correlation Analysis', fontsize=16, fontweight='bold')

    # Price correlation heatmap
    sns.heatmap(price_corr, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f', cbar_kws={'label': 'Correlation'}, ax=axes[0])
    axes[0].set_title('Price Correlation Matrix', fontweight='bold')

    # Returns correlation heatmap
    sns.heatmap(returns_corr, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f', cbar_kws={'label': 'Correlation'}, ax=axes[1])
    axes[1].set_title('Returns Correlation Matrix', fontweight='bold')

    plt.tight_layout()
    plt.show()

# Print correlation insights
print("\n📊 CORRELATION INSIGHTS")
print("-" * 30)
print("Price Correlations:")
_print_pairwise(price_corr)

print("\nReturns Correlations:")
_print_pairwise(returns_corr)

## 🎯 Key Insights and Summary

In [None]:
# Build and print a per-symbol performance summary from the 1h data
print("🎯 EXPLORATORY DATA ANALYSIS SUMMARY")
print("=" * 60)

# One record per symbol that has a 1h frame loaded
insights = []

for symbol in SYMBOLS:
    if symbol not in crypto_data or '1h' not in crypto_data[symbol]:
        continue

    df = crypto_data[symbol]['1h']
    close = df['close']
    first_close = close.iloc[0]
    last_close = close.iloc[-1]

    # Drawdown at each row: relative distance of the close below its running peak
    running_peak = close.cummax()
    drawdown = (running_peak - close) / running_peak

    insights.append({
        'symbol': symbol,
        'total_return': ((last_close / first_close) - 1) * 100,
        'volatility': df['volatility_30d'].mean() * 100,
        'max_drawdown': drawdown.max() * 100,
        'current_price': last_close,
        'avg_volume': df['volume'].mean()
    })

insights_df = pd.DataFrame(insights)

print("\n📈 PERFORMANCE SUMMARY (6 Months)")
print("-" * 40)
for record in insights_df.itertuples(index=False):
    symbol = record.symbol
    # CRYPTO_SYMBOLS is indexed by ticker for the heading text
    print(f"{CRYPTO_SYMBOLS[symbol]} ({symbol}):")
    print(f"  Total Return: {record.total_return:+.1f}%")
    print(f"  Volatility: {record.volatility:.1f}%")
    print(f"  Max Drawdown: {record.max_drawdown:.1f}%")
    print(f"  Current Price: ${record.current_price:,.2f}")
    print()

print("\n✅ Exploratory Data Analysis completed successfully!")
print("➡️ Next step: Machine Learning Model Development (04_machine_learning.ipynb)")