# 01 - Data Collection

This notebook fetches DEX protocol data from the CoinGecko and DefiLlama APIs
and builds a daily panel dataset with market cap, TVL, volume, fees, and revenue metrics.

In [None]:
import os
import sys
import datetime as dt

# Put the parent of the current working directory at the front of the import
# path so that the `src` package can be imported from this notebook.
repo_root = os.path.dirname(os.getcwd())
sys.path.insert(0, repo_root)

import pandas as pd
from dotenv import load_dotenv

from src import data_collection as dc

# Load environment variables from a .env file, if one is found.
# override=False: variables already set in the process environment are kept
# rather than being replaced by values from the .env file.
# NOTE(review): presumably this supplies API credentials read by
# src.data_collection — confirm against that module.
load_dotenv(override=False)

In [None]:
# Collection window.
# The end date is pinned to a fixed calendar day (not "today"), and the start
# date is 150 days before it, which leaves headroom over the 90 days of
# coverage that are required.
lookback = dt.timedelta(days=150)
end_date = dt.date(2025, 9, 10)
start_date = end_date - lookback

print(f"Collecting data from {start_date} to {end_date}")

In [None]:
# Read the protocol configuration from the YAML file one directory above the
# notebook, then list each protocol with the number of chains it declares.
# A protocol without a 'chains' key is reported as having 0 chains.
protocols = dc.load_protocols('../protocols.yml')
print(f"Loaded {len(protocols)} protocols:")
for entry in protocols:
    chain_count = len(entry.get('chains', []))
    print(f"  - {entry['name']} ({chain_count} chains)")

In [None]:
# Assemble the panel for the chosen date window, then report its dimensions,
# the number of distinct protocols, and the first/last dates it contains.
print("Building panel from APIs...")
panel = dc.build_panel(start_date, end_date)

print(f"\nPanel shape: {panel.shape}")
protocol_count = panel['protocol'].nunique()
print(f"Protocols: {protocol_count}")
panel_dates = panel['date']
print(f"Date range: {panel_dates.min()} to {panel_dates.max()}")

In [None]:
# Display sample data
# Print a heading, then leave the first 10 rows of `panel` as the final
# expression of the cell so the notebook renders them as the cell output.
print("\nSample data:")
panel.head(10)

In [None]:
# Per-protocol summary table: first date, last date and row count, plus the
# mean and standard deviation of market cap, TVL and 24h volume, rounded to
# two decimal places.
print("\nData summary by protocol:")
summary_spec = {
    'date': ['min', 'max', 'count'],
    'market_cap_circulating': ['mean', 'std'],
    'tvl': ['mean', 'std'],
    'volume_24h': ['mean', 'std'],
}
by_protocol = panel.groupby('protocol')
summary = by_protocol.agg(summary_spec).round(2)
summary

In [None]:
# Write the unprocessed panel to CSV (without the index column), creating the
# output directory first if it does not exist yet.
csv_path = '../data/processed/panel_raw.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
panel.to_csv(csv_path, index=False)
print(f"Raw panel saved to {csv_path}")