# 📊 Day 1: Data Collection

**Objective:** Collect YouTube creator data using the free YouTube Data API v3.

**Dataset:** 1,031 channels across 11 niches (from seed CSVs)

---

## 1. Setup

In [None]:
# Standard-library and third-party imports used by the cells below.
import os
import sys
import pandas as pd
import numpy as np

# Add src to path so the project's `data_collection` package can be imported.
# The path is relative, so it resolves against the notebook's working directory.
sys.path.insert(0, '../src')

# Load environment variables from the .env file one directory up; the
# "Configure API Key" section reads YOUTUBE_API_KEY from the environment.
from dotenv import load_dotenv
load_dotenv('../.env')

print("Setup complete!")

## 2. Load Seed Channels

In [None]:
# Seed-channel helpers from the project package (importable because '../src'
# was inserted into sys.path in the setup cell).
# NOTE(review): get_micro_creators is imported but not used in any cell visible
# in this notebook.
from data_collection.free_channel_sources import (
    load_all_seed_channels,
    get_channels_summary,
    get_micro_creators
)

# Load all seed channels
# NOTE(review): later cells index the result by 'channel_id' and 'niche' and
# call .head() / len() on it, so it is presumably a pandas DataFrame — confirm.
seed_channels = load_all_seed_channels('../data/seed_channels')
print(f"\nLoaded {len(seed_channels)} channels")

In [None]:
# View summary: per-niche channel counts plus (optionally) subscriber statistics.
summary = get_channels_summary('../data/seed_channels')

print("\n📊 Channels by Niche:")
for niche, count in summary['by_niche'].items():
    print(f"  • {niche}: {count}")

# The summary only carries subscriber statistics some of the time, so report
# them only when the key is present.
if 'subscriber_stats' in summary:
    stats = summary['subscriber_stats']
    print("\n📈 Subscriber Stats:")
    print(f"  • Min: {stats['min']:,}")
    print(f"  • Max: {stats['max']:,}")
    print(f"  • Median: {stats['median']:,}")

In [None]:
# Preview data: first 10 seed channels. As the last expression in the cell,
# the result is rendered by the notebook instead of being printed.
seed_channels.head(10)

## 3. Configure API Key

In [None]:
def mask_api_key(key: str, visible: int = 4) -> str:
    """Return a display-safe version of an API key.

    Every character except the last ``visible`` ones is replaced by ``*`` so the
    key can be recognised in notebook output without being disclosed. Keys that
    are no longer than ``visible`` (or a non-positive ``visible``) are masked
    completely.

    Args:
        key: The secret to mask.
        visible: How many trailing characters to leave readable.

    Returns:
        A string of the same length as ``key`` with the leading part masked.
    """
    if visible <= 0 or len(key) <= visible:
        return '*' * len(key)
    return '*' * (len(key) - visible) + key[-visible:]


# Read the key from the environment (populated from ../.env in the setup cell).
API_KEY = os.getenv('YOUTUBE_API_KEY')

# A missing/empty value and the literal 'your_api_key_here' are both treated as
# "not configured".
if not API_KEY or API_KEY == 'your_api_key_here':
    print("⚠️ Please set your API key!")
    print("1. Go to https://console.cloud.google.com/")
    print("2. Create project → Enable YouTube Data API v3")
    print("3. Create API Key → Copy to ../.env file")
else:
    # Never echo the key prefix: notebook outputs are saved with the file and
    # are easily committed or shared.
    print(f"✅ API Key loaded: {mask_api_key(API_KEY)}")

## 4. Initialize Collector

In [None]:
from data_collection.youtube_api import YouTubeDataCollector

# Build the collector with the key read in the previous section.
collector = YouTubeDataCollector(api_key=API_KEY)

# Report the quota budget: total allowance, amount consumed, and what is left.
print(
    f"Daily quota: {collector.daily_quota:,} units",
    f"Quota used: {collector.quota_used}",
    f"Remaining: {collector.get_remaining_quota():,} units",
    sep="\n",
)

## 5. Test with Single Channel

In [None]:
# Smoke-test the collector on the first seed channel before spending quota on
# the full batch.
test_id = seed_channels['channel_id'].iloc[0]
print(f"Testing with: {test_id}")

result = collector.get_channel_stats(test_id)

# A falsy result is reported as a failed lookup.
if result:
    print("\n✅ Success!")
    print(f"  Channel: {result['title']}")
    print(f"  Subscribers: {result['subscriber_count']:,}")
    print(f"  Videos: {result['video_count']}")
else:
    print("❌ Failed - check API key")

## 6. Collect Data (Full Dataset)

In [None]:
# Prepare channel list with niche mapping.
# Seed data may list the same channel more than once; drop repeats (keeping
# first-seen order) so no quota is spent collecting a channel twice. The niche
# map is unchanged: for a repeated ID the last listed niche wins.
channel_ids = seed_channels['channel_id'].drop_duplicates().tolist()
niche_map = dict(zip(seed_channels['channel_id'], seed_channels['niche']))

# Rough cost of collecting one channel, in API quota units.
QUOTA_UNITS_PER_CHANNEL = 4

estimated_quota = len(channel_ids) * QUOTA_UNITS_PER_CHANNEL
available_quota = collector.get_remaining_quota()

print(f"Channels to collect: {len(channel_ids)}")
print(f"Estimated quota: ~{estimated_quota:,} units")
print(f"Available quota: {available_quota:,} units")

# NOTE: 1,031 channels × 4 units = ~4,124 units, which fits within the 10,000
# daily limit — but check against what is actually left rather than assuming.
if estimated_quota > available_quota:
    print("⚠️ Estimated quota exceeds the available quota - "
          "the full collection may not complete today.")

In [None]:
# Collect all data (this will take ~30-60 minutes)
# The call returns two objects, unpacked here as a per-channel table and a
# per-video table; both are written out with .to_csv() in the next section.
# NOTE(review): save_path presumably lets the collector persist results under
# ../data/raw while it runs — confirm in YouTubeDataCollector.
# NOTE(review): re-running this cell repeats the whole collection and
# presumably spends quota again; consider guarding it behind a cache check.
channels_df, videos_df = collector.collect_channels_batch(
    channel_ids=channel_ids,
    niche_map=niche_map,
    save_path='../data/raw',
    videos_per_channel=30
)

## 7. Save Results

In [None]:
# Save to CSV. DataFrame.to_csv does not create missing parent directories,
# so make sure the output folder exists before writing.
os.makedirs('../data/raw', exist_ok=True)
channels_df.to_csv('../data/raw/channels.csv', index=False)
videos_df.to_csv('../data/raw/videos.csv', index=False)

print("\n✅ Data saved!")
print(f"  • channels.csv: {len(channels_df)} rows")
print(f"  • videos.csv: {len(videos_df)} rows")

## 8. Quick Analysis

In [None]:
# Headline numbers for the collection run.
n_channels = len(channels_df)
n_videos = len(videos_df)

print("\n📊 Collection Summary")
print("="*50)
print(f"Channels: {n_channels}")
print(f"Videos: {n_videos}")

if n_channels == 0:
    # Without this guard the average below raises ZeroDivisionError, and the
    # per-column breakdowns have nothing to describe.
    print("No channels were collected - nothing to summarise.")
else:
    print(f"Avg videos/channel: {n_videos/n_channels:.1f}")

    print("\nBy Niche:")
    print(channels_df['niche'].value_counts())

    print("\nSubscriber Distribution:")
    print(channels_df['subscriber_count'].describe())

In [None]:
# Visualize: channels per niche (left) and log-scaled subscriber counts (right).
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_niche, ax_subs = axes

# Left panel: bar chart of how many channels fall in each niche.
niche_counts = channels_df['niche'].value_counts()
niche_counts.plot.bar(ax=ax_niche, color='steelblue')
ax_niche.set(title='Channels by Niche', xlabel='Niche')
ax_niche.tick_params(axis='x', rotation=45)

# Right panel: histogram of log10(subscribers + 1); the +1 keeps channels with
# zero subscribers finite under the logarithm.
log_subscribers = np.log10(channels_df['subscriber_count'] + 1)
ax_subs.hist(log_subscribers, bins=30, color='steelblue', edgecolor='black')
ax_subs.set(title='Subscriber Distribution (log10)', xlabel='Log10(Subscribers)')

# Persist the figure next to the raw data, then render it inline.
plt.tight_layout()
plt.savefig('../data/raw/distribution_plots.png', dpi=150)
plt.show()

## ✅ Day 1 Complete!

**Next:** Day 2 - Feature Engineering

Run: `notebooks/02_feature_engineering.ipynb`