# Bitcoin Trading Agent - Data Collection

This notebook handles all data collection from multiple sources:
- Investing.com Bitcoin historical data (via Crawl4AI)
- CoinMarketCap API for current prices
- Yahoo Finance as backup
- Binance API for additional market data

## Observations Log
We will document key findings and observations after each step.

In [1]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import requests
import yfinance as yf
from datetime import datetime, timedelta
import asyncio
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
load_dotenv()

# Fix Python path for imports - use absolute path
current_dir = os.path.dirname(os.path.abspath(''))
project_root = os.path.dirname(current_dir) if os.path.basename(current_dir) == 'notebooks' else current_dir
src_path = os.path.join(project_root, 'src')

# Add both possible paths to ensure imports work
if os.path.exists(src_path):
    sys.path.insert(0, src_path)
    sys.path.insert(0, project_root)
else:
    # Fallback paths 
    sys.path.insert(0, '../src')
    sys.path.insert(0, '..')

print("✅ Libraries loaded successfully")
print(f"Environment variables loaded: {os.path.exists('../.env') or os.path.exists('.env')}")
print(f"Python paths added: {[p for p in sys.path[:3]]}")

✅ Libraries loaded successfully
Environment variables loaded: True
Python paths added: ['..', '../src', 'd:\\Apziva\\Project 5\\Bitcoin-Trading-Agent\\env\\Scripts\\python313.zip']


## 1. Crawl4AI Setup for Investing.com

Using Crawl4AI to scrape Bitcoin historical data from Investing.com with advanced features like JavaScript rendering and anti-detection.

In [2]:
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy, LLMConfig
import json

async def scrape_investing_btc_data():
    """
    Scrape Bitcoin historical data from Investing.com using Crawl4AI
    """
    url = "https://www.investing.com/crypto/bitcoin/historical-data"

    # Define extraction strategy using Groq with Llama 3.3 70B
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq",
            api_token=os.getenv('GROQ_API_KEY'),
            model="llama-3.3-70b-versatile"
        ),
        instruction="""
        Extract the historical Bitcoin price data from the table. 
        Return a JSON array with objects containing:
        - date: The date in YYYY-MM-DD format
        - price: The closing price as a number
        - open: The opening price as a number  
        - high: The highest price as a number
        - low: The lowest price as a number
        - volume: The volume as a number
        - change_pct: The percentage change as a number
        """
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            extraction_strategy=extraction_strategy,
            css_selector=".historical-data-table, table[data-test='historical-data-table']",
            wait_for="css:.historical-data-table",
            timeout=30000
        )

        return result
# Create data directory if it doesn't exist
os.makedirs('../data', exist_ok=True)

# NEW: Try enhanced DataCollector with multiple sources first
print("🔄 Using Enhanced DataCollector with Multiple Sources...")

# Multiple import strategies to handle different path configurations
collector = None
enhanced_available = False

# Strategy 1: Try direct import from data_collector
try:
    from data_collector import DataCollector
    print("✅ Imported DataCollector directly")
    enhanced_available = True
except ImportError as e1:
    print(f"⚠️ Direct import failed: {e1}")
    
    # Strategy 2: Try import from src.data_collector
    try:
        from src.data_collector import DataCollector
        print("✅ Imported DataCollector from src module")
        enhanced_available = True
    except ImportError as e2:
        print(f"⚠️ src module import failed: {e2}")
        
        # Strategy 3: Try with sys.path manipulation
        try:
            import sys
            import os
            project_root = os.path.dirname(os.path.abspath('.'))
            src_path = os.path.join(project_root, 'src')
            if src_path not in sys.path:
                sys.path.insert(0, src_path)
            
            from data_collector import DataCollector
            print("✅ Imported DataCollector with path manipulation")
            enhanced_available = True
        except ImportError as e3:
            print(f"⚠️ All import strategies failed:")
            print(f"  - Direct: {e1}")
            print(f"  - Module: {e2}")  
            print(f"  - Path: {e3}")
            enhanced_available = False

# If enhanced collector is available, try to use it
if enhanced_available:
    try:
        # Initialize enhanced data collector
        collector = DataCollector()
        
        print(f"Available sources: {list(collector.sources.keys())}")
        
        # Try collecting data from all sources
        all_data = collector.collect_all_data(period='6mo')
        
        print(f"\n📊 Enhanced Data Collection Results:")
        successful_sources = []
        for source, data in all_data.items():
            if data is not None and not data.empty:
                print(f"✅ {source}: {len(data)} records")
                # Save individual source data
                data.to_csv(f'../data/btc_{source}_enhanced.csv', index=False)
                successful_sources.append(source)
            else:
                print(f"❌ {source}: No data")
        
        # Get current price from best source
        current_price = collector.get_current_price()
        if current_price:
            print(f"\n💰 Current BTC Price from enhanced collector: ${current_price:,.2f}")
        else:
            print("\n⚠️ Could not get current price from enhanced collector")
            
        # Combine successful sources
        if successful_sources:
            try:
                # Try to import standardize_dataframes function
                if enhanced_available:
                    try:
                        from data_collector import standardize_dataframes
                    except ImportError:
                        try:
                            from src.data_collector import standardize_dataframes
                        except ImportError:
                            print("⚠️ Could not import standardize_dataframes, using local function")
                            # Define local version if import fails
                            def standardize_dataframes(*dataframes, source_names=None):
                                if source_names is None:
                                    source_names = [f'source_{i}' for i in range(len(dataframes))]
                                
                                combined_data = []
                                for df, source in zip(dataframes, source_names):
                                    if df is not None and not df.empty:
                                        df_copy = df.copy()
                                        df_copy['source'] = source
                                        combined_data.append(df_copy)
                                
                                if combined_data:
                                    return pd.concat(combined_data, ignore_index=True)
                                return None
                
                successful_dfs = [all_data[source] for source in successful_sources]
                enhanced_combined = standardize_dataframes(*successful_dfs, source_names=successful_sources)
                
                if enhanced_combined is not None:
                    print(f"\n✅ Enhanced Combined Dataset: {len(enhanced_combined)} records")
                    print(f"Sources: {enhanced_combined['source'].unique()}")
                    enhanced_combined.to_csv('../data/btc_enhanced_combined.csv', index=False)
                    
                    # Set this as our primary combined data
                    combined_btc_data_enhanced = enhanced_combined
            except Exception as combine_error:
                print(f"⚠️ Error combining enhanced data: {combine_error}")
                
    except Exception as collector_error:
        print(f"❌ Enhanced DataCollector runtime error: {collector_error}")
        print("Falling back to original scraping methods...")
        enhanced_available = False
else:
    print("❌ Enhanced DataCollector not available")
    print("Falling back to original scraping methods...")

# FALLBACK: Original scraping methods if enhanced collector fails
if not enhanced_available or 'successful_sources' not in locals() or not successful_sources:
    def fallback_investing_scraper():
        """
        Fallback method using requests + BeautifulSoup if all else fails
        """
        try:
            from bs4 import BeautifulSoup
            
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            
            url = "https://www.investing.com/crypto/bitcoin/historical-data"
            response = requests.get(url, headers=headers)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find the historical data table
                table = soup.find('table', {'data-test': 'historical-data-table'})
                if not table:
                    table = soup.find('table', class_='historical-data-table')
                
                if table:
                    rows = table.find_all('tr')[1:]  # Skip header
                    
                    data = []
                    for row in rows:
                        cols = row.find_all('td')
                        if len(cols) >= 6:
                            date_str = cols[0].text.strip()
                            price = cols[1].text.replace(',', '').replace('$', '').strip()
                            open_price = cols[2].text.replace(',', '').replace('$', '').strip()
                            high = cols[3].text.replace(',', '').replace('$', '').strip()
                            low = cols[4].text.replace(',', '').replace('$', '').strip()
                            volume = cols[5].text.replace(',', '').strip()
                            
                            data.append({
                                'date': date_str,
                                'price': float(price) if price else None,
                                'open': float(open_price) if open_price else None,
                                'high': float(high) if high else None,
                                'low': float(low) if low else None,
                                'volume': volume
                            })
                    
                    return pd.DataFrame(data)
                
        except Exception as e:
            print(f"Fallback scraper error: {str(e)}")
            return None
        
        return None

    print("\n🔄 Trying fallback scraping method...")
    investing_df = fallback_investing_scraper()
    
    if investing_df is not None:
        print(f"✅ Fallback method successful: {len(investing_df)} records")
        investing_df.to_csv('../data/btc_investing_raw.csv', index=False)
    else:
        print("❌ Fallback scraping method also failed")
else:
    if 'successful_sources' in locals() and successful_sources:
        print(f"\n✅ Enhanced data collection successful from {len(successful_sources)} sources")
        print("Skipping fallback scraper")
    else:
        print("\n⚠️ Enhanced collector loaded but no successful sources found")

🔄 Using Enhanced DataCollector with Multiple Sources...
⚠️ Direct import failed: No module named 'data_collector'
⚠️ src module import failed: No module named 'src'
⚠️ All import strategies failed:
  - Direct: No module named 'data_collector'
  - Module: No module named 'src'
  - Path: No module named 'data_collector'
❌ Enhanced DataCollector not available
Falling back to original scraping methods...

🔄 Trying fallback scraping method...
❌ Fallback scraping method also failed


# GPT 5 Scraper 

In [3]:
# import requests
# import pandas as pd

# url = "https://www.investing.com/crypto/bitcoin/historical-data"
# headers = {
#     "User-Agent": "Mozilla/5.0",
# }

# # Fetch the page
# resp = requests.get(url, headers=headers)

# # Extract first table (raw form)
# raw_df = pd.read_html(resp.text)[0]

# print(raw_df)


### Observation 1: Investing.com Scraping Results

**Key Findings:**
- [ ] Document success rate of Crawl4AI scraping
- [ ] Note data quality and completeness
- [ ] Record any anti-bot detection issues
- [ ] Validate date ranges and data format

## 2. CoinMarketCap API Integration

Getting current Bitcoin price and market data from CoinMarketCap API for real-time updates.

In [4]:
def get_coinmarketcap_data():
    """
    Fetch current Bitcoin data from CoinMarketCap API
    """
    api_key = os.getenv('COINMARKETCAP_API_KEY')
    
    if not api_key or api_key == 'your_coinmarketcap_api_key_here':
        print("⚠️ CoinMarketCap API key not configured")
        return None
    
    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/quotes/latest'
    parameters = {
        'symbol': 'BTC',
        'convert': 'USD'
    }
    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': api_key,
    }
    
    try:
        response = requests.get(url, headers=headers, params=parameters)
        data = response.json()
        
        if response.status_code == 200 and 'data' in data:
            btc_data = data['data']['BTC']
            quote = btc_data['quote']['USD']
            
            current_data = {
                'timestamp': datetime.now().isoformat(),
                'price': quote['price'],
                'volume_24h': quote['volume_24h'],
                'percent_change_1h': quote['percent_change_1h'],
                'percent_change_24h': quote['percent_change_24h'],
                'percent_change_7d': quote['percent_change_7d'],
                'market_cap': quote['market_cap'],
                'last_updated': quote['last_updated']
            }
            
            return current_data
        else:
            print(f"❌ CoinMarketCap API error: {data}")
            return None
            
    except Exception as e:
        print(f"❌ Error fetching CoinMarketCap data: {str(e)}")
        return None

# Fetch current data
cmc_data = get_coinmarketcap_data()

if cmc_data:
    print("✅ Current Bitcoin data from CoinMarketCap:")
    print(f"Price: ${cmc_data['price']:,.2f}")
    print(f"24h Change: {cmc_data['percent_change_24h']:.2f}%")
    print(f"Volume 24h: ${cmc_data['volume_24h']:,.0f}")
    print(f"Market Cap: ${cmc_data['market_cap']:,.0f}")
else:
    print("⚠️ CoinMarketCap data not available - using backup sources")

✅ Current Bitcoin data from CoinMarketCap:
Price: $117,494.00
24h Change: -1.35%
Volume 24h: $63,639,141,349
Market Cap: $2,338,964,050,347


### Observation 2: CoinMarketCap API Performance

**Key Findings:**
- [ ] Note API response time and reliability
- [ ] Document rate limits encountered
- [ ] Compare price accuracy with other sources
- [ ] Record data freshness (last_updated)

## 3. Yahoo Finance Integration

Using yfinance as a reliable backup source for Bitcoin historical and current data.

In [5]:
def get_yahoo_btc_data(period='1y'):
    """
    Fetch Bitcoin data from Yahoo Finance
    period options: 1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max
    """
    try:
        btc_ticker = yf.Ticker("BTC-USD")
        hist_data = btc_ticker.history(period=period)
        
        if not hist_data.empty:
            # Reset index to get date as column
            hist_data = hist_data.reset_index()
            
            # Rename columns to match our standard format
            hist_data.columns = [col.lower() for col in hist_data.columns]
            hist_data['date'] = hist_data['date'].dt.strftime('%Y-%m-%d')
            hist_data['price'] = hist_data['close']
            
            # Get current info
            info = btc_ticker.info
            current_price = info.get('regularMarketPrice', hist_data['close'].iloc[-1])
            
            print(f"✅ Yahoo Finance data: {len(hist_data)} records")
            print(f"Date range: {hist_data['date'].iloc[0]} to {hist_data['date'].iloc[-1]}")
            print(f"Current price: ${current_price:,.2f}")
            print(f"Latest close: ${hist_data['close'].iloc[-1]:,.2f}")
            
            return hist_data, current_price
        else:
            print("❌ No data returned from Yahoo Finance")
            return None, None
            
    except Exception as e:
        print(f"❌ Error fetching Yahoo Finance data: {str(e)}")
        return None, None

# Fetch Yahoo Finance data
yahoo_df, yahoo_current = get_yahoo_btc_data(period='6mo')

if yahoo_df is not None:
    # Save Yahoo data
    yahoo_df.to_csv('../data/btc_yahoo_raw.csv', index=False)
    
    # Show recent data
    print("\nRecent data from Yahoo Finance:")
    print(yahoo_df[['date', 'open', 'high', 'low', 'close', 'volume']].tail())
else:
    print("⚠️ Yahoo Finance data not available")

✅ Yahoo Finance data: 182 records
Date range: 2025-02-16 to 2025-08-16
Current price: $117,494.00
Latest close: $117,494.00

Recent data from Yahoo Finance:
           date           open           high            low          close  \
177  2025-08-12  118717.664062  120302.468750  118228.718750  120172.906250   
178  2025-08-13  120168.976562  123682.453125  118939.632812  123344.062500   
179  2025-08-14  123339.398438  124457.117188  117254.882812  118359.578125   
180  2025-08-15  118365.781250  119332.312500  116864.570312  117398.351562   
181  2025-08-16  117411.218750  117948.859375  117355.820312  117494.000000   

           volume  
177   72803657984  
178   90904808795  
179  104055627395  
180   68665353159  
181   63639142400  


### Observation 3: Yahoo Finance Data Quality

**Key Findings:**
- [ ] Compare data consistency with other sources
- [ ] Note any gaps or anomalies in historical data
- [ ] Document volume data accuracy
- [ ] Record API response reliability

## 4. Data Integration and Standardization

Combining data from all sources into a standardized format for analysis.

In [6]:
def standardize_btc_data(*dataframes, source_names=None):
    """
    Standardize and combine Bitcoin data from multiple sources
    """
    if source_names is None:
        source_names = [f'source_{i}' for i in range(len(dataframes))]
    
    combined_data = []
    
    for df, source in zip(dataframes, source_names):
        if df is not None and not df.empty:
            # Create a copy to avoid modifying original
            df_copy = df.copy()
            
            # Ensure we have required columns
            required_cols = ['date', 'open', 'high', 'low', 'close', 'volume']
            
            # Map common column variations
            if 'price' in df_copy.columns and 'close' not in df_copy.columns:
                df_copy['close'] = df_copy['price']
            
            # Convert date to datetime if it's not already
            if 'date' in df_copy.columns:
                df_copy['date'] = pd.to_datetime(df_copy['date'])
            
            # Add source column
            df_copy['source'] = source
            
            # Select and reorder columns
            available_cols = [col for col in required_cols if col in df_copy.columns]
            df_final = df_copy[available_cols + ['source']].copy()
            
            combined_data.append(df_final)
            print(f"✅ Processed {len(df_final)} records from {source}")
        else:
            print(f"⚠️ No data available from {source}")
    
    if combined_data:
        # Combine all dataframes
        final_df = pd.concat(combined_data, ignore_index=True)
        
        # Remove duplicates based on date and source
        final_df = final_df.drop_duplicates(subset=['date', 'source'])
        
        # Sort by date
        final_df = final_df.sort_values('date').reset_index(drop=True)
        
        return final_df
    else:
        return None

# Combine available data sources
available_dfs = []
source_names = []

if 'investing_df' in locals() and investing_df is not None:
    available_dfs.append(investing_df)
    source_names.append('investing')

if 'yahoo_df' in locals() and yahoo_df is not None:
    available_dfs.append(yahoo_df)
    source_names.append('yahoo')

# Standardize and combine data
if available_dfs:
    combined_btc_data = standardize_btc_data(*available_dfs, source_names=source_names)
    
    if combined_btc_data is not None:
        print(f"\n✅ Combined dataset: {len(combined_btc_data)} total records")
        print(f"Date range: {combined_btc_data['date'].min()} to {combined_btc_data['date'].max()}")
        print(f"Sources: {combined_btc_data['source'].unique()}")
        
        # Save combined data
        combined_btc_data.to_csv('../data/btc_combined_raw.csv', index=False)
        
        # Show data summary
        print("\nData summary by source:")
        print(combined_btc_data.groupby('source').agg({
            'date': ['count', 'min', 'max'],
            'close': ['mean', 'min', 'max']
        }).round(2))
    else:
        print("❌ Failed to combine data sources")
else:
    print("❌ No data sources available")

✅ Processed 182 records from yahoo

✅ Combined dataset: 182 total records
Date range: 2025-02-16 00:00:00 to 2025-08-16 00:00:00
Sources: ['yahoo']

Data summary by source:
        date                           close                     
       count        min        max      mean       min        max
source                                                           
yahoo    182 2025-02-16 2025-08-16  100382.4  76271.95  123344.06


### Observation 4: Data Integration Results

**Key Findings:**
- [ ] Document data overlap and gaps between sources
- [ ] Note price discrepancies between sources
- [ ] Record data quality metrics
- [ ] Identify preferred data source for different time periods

## 5. Data Quality Checks

Performing comprehensive data validation and quality assessment.

In [7]:
def perform_data_quality_checks(df):
    """
    Comprehensive data quality assessment
    """
    if df is None or df.empty:
        print("❌ No data to check")
        return None
    
    print("🔍 Data Quality Assessment")
    print("="*50)
    
    # Basic info
    print(f"📊 Dataset Shape: {df.shape}")
    print(f"📅 Date Range: {df['date'].min()} to {df['date'].max()}")
    print(f"🔢 Total Days: {(df['date'].max() - df['date'].min()).days} days")
    
    # Missing values
    print("\n🔍 Missing Values:")
    missing_summary = df.isnull().sum()
    for col, missing in missing_summary.items():
        if missing > 0:
            pct = (missing / len(df)) * 100
            print(f"  {col}: {missing} ({pct:.1f}%)")
        else:
            print(f"  {col}: ✅ No missing values")
    
    # Data types
    print("\n📋 Data Types:")
    for col, dtype in df.dtypes.items():
        print(f"  {col}: {dtype}")
    
    # Price statistics
    if 'close' in df.columns:
        print("\n💰 Price Statistics:")
        price_stats = df['close'].describe()
        for stat, value in price_stats.items():
            print(f"  {stat}: ${value:,.2f}")
        
        # Price anomalies
        print("\n⚠️ Price Anomaly Checks:")
        
        # Check for negative prices
        negative_prices = (df['close'] <= 0).sum()
        print(f"  Negative/Zero prices: {negative_prices}")
        
        # Check for extreme price changes (>50% in one day)
        df_sorted = df.sort_values('date')
        price_changes = df_sorted['close'].pct_change().abs()
        extreme_changes = (price_changes > 0.5).sum()
        print(f"  Extreme daily changes (>50%): {extreme_changes}")
        
        if extreme_changes > 0:
            extreme_dates = df_sorted[price_changes > 0.5]['date'].tolist()
            print(f"    Dates with extreme changes: {extreme_dates[:5]}")
    
    # Duplicate checks
    print("\n🔄 Duplicate Checks:")
    date_duplicates = df['date'].duplicated().sum()
    print(f"  Duplicate dates: {date_duplicates}")
    
    # Source distribution
    if 'source' in df.columns:
        print("\n📈 Data Source Distribution:")
        source_counts = df['source'].value_counts()
        for source, count in source_counts.items():
            pct = (count / len(df)) * 100
            print(f"  {source}: {count} records ({pct:.1f}%)")
    
    return {
        'shape': df.shape,
        'missing_values': missing_summary.to_dict(),
        'date_range': (df['date'].min(), df['date'].max()),
        'price_stats': price_stats.to_dict() if 'close' in df.columns else None,
        'anomalies': {
            'negative_prices': negative_prices if 'close' in df.columns else 0,
            'extreme_changes': extreme_changes if 'close' in df.columns else 0
        }
    }

# Run quality checks on combined data
if 'combined_btc_data' in locals() and combined_btc_data is not None:
    quality_report = perform_data_quality_checks(combined_btc_data)
else:
    print("⚠️ No combined data available for quality checks")
    quality_report = None

🔍 Data Quality Assessment
📊 Dataset Shape: (182, 7)
📅 Date Range: 2025-02-16 00:00:00 to 2025-08-16 00:00:00
🔢 Total Days: 181 days

🔍 Missing Values:
  date: ✅ No missing values
  open: ✅ No missing values
  high: ✅ No missing values
  low: ✅ No missing values
  close: ✅ No missing values
  volume: ✅ No missing values
  source: ✅ No missing values

📋 Data Types:
  date: datetime64[ns]
  open: float64
  high: float64
  low: float64
  close: float64
  volume: int64
  source: object

💰 Price Statistics:
  count: $182.00
  mean: $100,382.40
  std: $12,759.50
  min: $76,271.95
  25%: $86,865.89
  50%: $103,642.03
  75%: $109,182.90
  max: $123,344.06

⚠️ Price Anomaly Checks:
  Negative/Zero prices: 0
  Extreme daily changes (>50%): 0

🔄 Duplicate Checks:
  Duplicate dates: 0

📈 Data Source Distribution:
  yahoo: 182 records (100.0%)


### Observation 5: Data Quality Assessment

**Critical Findings:**
- [ ] Document completeness percentage by source
- [ ] Note any data quality issues requiring cleaning
- [ ] Record price anomalies and potential causes
- [ ] Assess suitability for trading strategy development

**Next Steps:**
- [ ] Address missing data through interpolation or source switching
- [ ] Implement data cleaning pipeline for anomalies
- [ ] Set up automated data quality monitoring
- [ ] Prepare data for EDA in next notebook

## 6. Export Summary

Final data export and summary for use in subsequent notebooks.

In [8]:
# Create summary of all data collection efforts
collection_summary = {
    'timestamp': datetime.now().isoformat(),
    'sources_attempted': ['investing.com', 'coinmarketcap', 'yahoo_finance'],
    'sources_successful': [],
    'total_records': 0,
    'date_range': None,
    'files_created': [],
    'quality_report': quality_report
}

# Check which sources were successful
if 'investing_df' in locals() and investing_df is not None:
    collection_summary['sources_successful'].append('investing.com')
    collection_summary['files_created'].append('btc_investing_raw.csv')

if 'yahoo_df' in locals() and yahoo_df is not None:
    collection_summary['sources_successful'].append('yahoo_finance')
    collection_summary['files_created'].append('btc_yahoo_raw.csv')

if 'cmc_data' in locals() and cmc_data is not None:
    collection_summary['sources_successful'].append('coinmarketcap')

if 'combined_btc_data' in locals() and combined_btc_data is not None:
    collection_summary['total_records'] = len(combined_btc_data)
    collection_summary['date_range'] = [
        combined_btc_data['date'].min().isoformat(),
        combined_btc_data['date'].max().isoformat()
    ]
    collection_summary['files_created'].append('btc_combined_raw.csv')

# Save collection summary
import json
with open('../data/collection_summary.json', 'w') as f:
    json.dump(collection_summary, f, indent=2, default=str)

print("📋 Data Collection Summary")
print("="*40)
print(f"✅ Successful sources: {len(collection_summary['sources_successful'])}/{len(collection_summary['sources_attempted'])}")
print(f"📊 Total records collected: {collection_summary['total_records']}")
print(f"📁 Files created: {len(collection_summary['files_created'])}")
print(f"🏁 Ready for EDA: {'✅' if collection_summary['total_records'] > 0 else '❌'}")

if collection_summary['total_records'] > 0:
    print(f"\n📅 Date range: {collection_summary['date_range'][0]} to {collection_summary['date_range'][1]}")
    print("\n🎯 Next step: Run notebook 02_eda_analysis.ipynb for exploratory data analysis")
else:
    print("\n❌ No data collected - check API keys and network connectivity")

📋 Data Collection Summary
✅ Successful sources: 2/3
📊 Total records collected: 182
📁 Files created: 2
🏁 Ready for EDA: ✅

📅 Date range: 2025-02-16T00:00:00 to 2025-08-16T00:00:00

🎯 Next step: Run notebook 02_eda_analysis.ipynb for exploratory data analysis


### Final Observation: Data Collection Complete

**Summary of Results:**
- [ ] Record final success rate of all data sources
- [ ] Document total data points collected
- [ ] Note any persistent issues with specific sources
- [ ] Confirm readiness for EDA phase

**Key Takeaways for Trading Strategy:**
1. Data reliability ranking by source
2. Recommended fallback strategy for data outages
3. Data freshness considerations for live trading
4. Quality thresholds for strategy execution