# 📊 Cryptocurrency Data Collection

**Project**: Cryptocurrency Market Intelligence System  
**Author**: [Your Name]  
**Course**: INSY 8413 | Introduction to Big Data Analytics  
**Date**: July 26, 2025

## 🎯 Objectives
1. Collect 6 months of historical data for 5 major cryptocurrencies
2. Gather data at multiple timeframes (5-minute and hourly intervals)
3. Include price, volume, and trading metrics
4. Validate data quality and completeness

## 📈 Cryptocurrencies
- **Bitcoin (BTC)** - Market leader
- **Ethereum (ETH)** - Smart contracts platform
- **Binance Coin (BNB)** - Exchange token
- **Cardano (ADA)** - Proof-of-stake blockchain
- **Solana (SOL)** - High-performance blockchain

In [None]:
# Import required libraries
import sys
import os

# Add src directory to path
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
from data_collector import BinanceDataCollector
from utils import CRYPTO_SYMBOLS, print_data_summary

# Chart appearance for the whole notebook: matplotlib's seaborn-v0_8 style
# sheet first, then seaborn's "husl" palette on top of it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the environment is ready and record when this run began.
started_at = datetime.now()
print("📚 Libraries imported successfully!")
print(f"🕐 Analysis started at: {started_at}")

## 🔧 Initialize Data Collector

In [None]:
# Create the collector object that every later cell uses to fetch data.
collector = BinanceDataCollector()
print("🚀 Binance Data Collector initialized!")

# Echo the connection settings the collector exposes.
api_root = collector.base_url
print(f"📡 Base URL: {api_root}")
request_gap = collector.min_request_interval
print(f"⏱️ Rate limit: {request_gap}s between requests")

## 💰 Current Market Prices

In [None]:
# Fetch the latest price for every tracked cryptocurrency, print them as a
# table, and draw a bar chart comparing them.
print("📊 Fetching current market prices...")
# Used below as a mapping of symbol -> numeric price (iterated with .items()
# and formatted with a float format spec).
current_prices = collector.get_current_prices()

# Display current prices
print("\n💰 CURRENT CRYPTOCURRENCY PRICES")
print("=" * 40)
for symbol, price in current_prices.items():
    crypto_name = CRYPTO_SYMBOLS[symbol]
    print(f"{crypto_name:15} ({symbol}): ${price:>10,.2f}")

# Create a simple price comparison chart
plt.figure(figsize=(12, 6))
symbols = list(current_prices.keys())
prices = list(current_prices.values())
colors = sns.color_palette("husl", len(symbols))

bars = plt.bar(symbols, prices, color=colors)
plt.title('Current Cryptocurrency Prices (USD)', fontsize=16, fontweight='bold')
plt.xlabel('Cryptocurrency', fontsize=12)
plt.ylabel('Price (USD)', fontsize=12)
plt.yscale('log')  # Log scale due to large price differences

# Add value labels on bars. Two decimals (same precision as the table above)
# so that sub-dollar prices are not rounded to "$0" or "$1".
for bar, price in zip(bars, prices):
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, bar.get_height(),
             f'${price:,.2f}', ha='center', va='bottom', fontweight='bold')

plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 📈 Historical Data Collection

Next, we collect 6 months of historical data for each cryptocurrency at two time intervals: 5-minute and hourly.

In [None]:
# Collection settings, reused by later cells:
#   INTERVALS   - the 5-minute and hourly intervals to request
#   MONTHS_BACK - how many months of history to request
INTERVALS = ['5m', '1h']
MONTHS_BACK = 6

# Announce what is about to be downloaded.
interval_text = ', '.join(INTERVALS)
symbol_text = ', '.join(CRYPTO_SYMBOLS.keys())
print(f"📅 Collecting {MONTHS_BACK} months of historical data")
print(f"⏰ Time intervals: {interval_text}")
print(f"🪙 Cryptocurrencies: {symbol_text}")
print("\n🔄 Starting data collection...")

# Hand the settings to the collector; later cells index the result as
# all_historical_data[symbol][interval].
collection_kwargs = {'intervals': INTERVALS, 'months_back': MONTHS_BACK}
all_historical_data = collector.collect_all_cryptocurrencies(**collection_kwargs)

print("\n✅ Data collection completed!")

## 📊 Data Quality Assessment

In [None]:
# Assess data quality for each cryptocurrency and interval.
#
# Builds `data_summary[symbol][interval]` (a dict of summary statistics) and
# prints a short report for every collected dataset.


def _summarize_frame(df):
    """Return summary statistics for one price DataFrame as built-in values.

    The frame is expected to have 'close' and 'volume' columns and an index
    whose min/max expose ``.date()``.  Statistics are converted to plain
    ``int``/``float`` so they serialize to JSON as numbers instead of being
    stringified as NumPy scalars.  A missing or empty frame yields a record
    with zero rows and ``None`` for every statistic.
    """
    if df is None or df.empty:
        return {
            'records': 0,
            'date_range': None,
            'missing_values': 0,
            'avg_price': None,
            'price_volatility': None,
            'avg_volume': None,
        }

    closes = df['close']
    return {
        'records': len(df),
        'date_range': f"{df.index.min().date()} to {df.index.max().date()}",
        # Total NaN cells across all columns; cast from a NumPy integer.
        'missing_values': int(df.isnull().sum().sum()),
        'avg_price': float(closes.mean()),
        # Standard deviation of the close price, in price units.
        'price_volatility': float(closes.std()),
        'avg_volume': float(df['volume'].mean()),
    }


print("🔍 DATA QUALITY ASSESSMENT")
print("=" * 50)

data_summary = {}

for symbol in all_historical_data:
    data_summary[symbol] = {}
    print(f"\n📊 {CRYPTO_SYMBOLS[symbol]} ({symbol})")
    print("-" * 30)

    for interval in all_historical_data[symbol]:
        stats = _summarize_frame(all_historical_data[symbol][interval])
        data_summary[symbol][interval] = stats

        # Nothing meaningful to report for an empty dataset.
        if stats['records'] == 0:
            print(f"  {interval:3} interval: no data collected")
            continue

        print(f"  {interval:3} interval: {stats['records']:>6,} records | {stats['date_range']} | Missing: {stats['missing_values']}")
        print(f"      Avg Price: ${stats['avg_price']:>8,.2f} | Volatility: ${stats['price_volatility']:>8,.2f}")
        print(f"      Avg Volume: {stats['avg_volume']:>12,.0f}")

print("\n📁 Raw data files saved to: ../data/raw/")

## 📈 Quick Data Visualization

In [None]:
# Plot the hourly closing price of every collected cryptocurrency, one
# subplot per symbol.
symbols = list(all_historical_data.keys())
colors = sns.color_palette("husl", len(symbols))

# Size the grid from the number of symbols rather than assuming exactly five:
# three columns, as many rows as needed (at least one).
N_COLS = 3
n_rows = max(1, -(-len(symbols) // N_COLS))  # ceiling division
fig, axes = plt.subplots(n_rows, N_COLS, figsize=(18, 6 * n_rows), squeeze=False)
fig.suptitle('Cryptocurrency Price Trends (Hourly Data - Last 6 Months)', fontsize=16, fontweight='bold')

# squeeze=False always gives a 2-D array; flatten it so axes pair up with
# symbols in row-major order.
flat_axes = axes.ravel()

for ax, symbol, color in zip(flat_axes, symbols, colors):
    if '1h' not in all_historical_data[symbol]:
        # No hourly data for this symbol: drop the panel instead of leaving
        # an empty frame in the figure.
        fig.delaxes(ax)
        continue

    df = all_historical_data[symbol]['1h']

    ax.plot(df.index, df['close'], color=color, linewidth=1.5)
    ax.set_title(f'{CRYPTO_SYMBOLS[symbol]} ({symbol})', fontweight='bold')
    ax.set_ylabel('Price (USD)')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

# Remove every grid cell that has no symbol assigned to it.
for unused_ax in flat_axes[len(symbols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 📋 Collection Summary

In [None]:
# Create and save a summary of this collection run, then print a recap.
import json


def _json_default(value):
    """Fallback serializer for values the json module cannot encode.

    NumPy scalars are unwrapped to the equivalent built-in value so they are
    written as JSON numbers; anything else falls back to its string form.
    """
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


# Take one timestamp so the saved file and the printed recap agree.
collected_at = datetime.now()

collection_summary = {
    'collection_date': collected_at.isoformat(),
    'project_title': 'Cryptocurrency Market Intelligence System',
    'cryptocurrencies': list(CRYPTO_SYMBOLS.keys()),
    'intervals': INTERVALS,
    'months_collected': MONTHS_BACK,
    'current_prices': current_prices,
    'data_summary': data_summary
}

# Save summary to file
os.makedirs('../data/raw', exist_ok=True)
with open('../data/raw/collection_summary.json', 'w', encoding='utf-8') as f:
    json.dump(collection_summary, f, indent=2, default=_json_default)

# Row count of every dataset that was actually collected; both the dataset
# total and the record total are derived from this list.
dataset_sizes = [
    len(all_historical_data[symbol][interval])
    for symbol in all_historical_data
    for interval in all_historical_data[symbol]
]
total_records = sum(dataset_sizes)

print("📊 COLLECTION SUMMARY")
print("=" * 30)
print(f"📅 Collection Date: {collected_at.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"🪙 Cryptocurrencies: {len(CRYPTO_SYMBOLS)}")
print(f"⏰ Time Intervals: {len(INTERVALS)}")
# Count what was collected, not what was configured, so a failed download
# shows up as a shortfall here.
print(f"📈 Total Datasets: {len(dataset_sizes)}")
print(f"📊 Total Records: {total_records:,}")
print("💾 Summary saved to: ../data/raw/collection_summary.json")

print("\n✅ Data collection notebook completed successfully!")
print("➡️ Next step: Data Cleaning and Preprocessing (02_data_cleaning.ipynb)")
print("➡️ Next step: Data Cleaning and Preprocessing (02_data_cleaning.ipynb)")