# Notebook 01: YouTube Data Collection

## Objective
Collect YouTube video data from two independent sources:
1. Web Scraping using Selenium
2. YouTube Data API v3

Target: 3000+ videos from each source for model training and comparison.

In [None]:
import sys
import os

# Append the parent of the current working directory to the import path;
# the `src` import below is expected to be resolved from there.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from src import scraping, youtube_api
from dotenv import load_dotenv
import pandas as pd

# Title banner: an 80-character rule above and below the notebook title.
print("=" * 80, "YOUTUBE DATA COLLECTION", "=" * 80, sep="\n")

## Part 1: Web Scraping

Collect video metadata using Selenium WebDriver.
This method extracts publicly visible data from YouTube search results.

In [None]:
# Section header plus a heads-up about the expected runtime.
print(
    "\nPART 1: WEB SCRAPING",
    "=" * 80,
    "Note: This may take 30-60 minutes for 3000+ videos.",
    "Set headless=True to run without browser window.\n",
    sep="\n",
)

# Run the scraping helper; it is handed the target count, the CSV path to use
# as its output file, and the headless flag.
scraped_data = scraping.collect_diverse_dataset(
    target_count=3000,
    output_file="../data/raw/scraped_videos.csv",
    headless=True,
)

print("\nScraped Data Collection Complete: {} videos".format(len(scraped_data)))
print("Saved to: data/raw/scraped_videos.csv")

# Preview: only when something was collected, reload the CSV named above and
# show its first three rows and its column names.
if scraped_data:
    df_scraped = pd.read_csv("../data/raw/scraped_videos.csv")
    print("\nSample of scraped data:")
    print(df_scraped.head(3))
    print("\nColumns: {}".format(list(df_scraped.columns)))

## Part 2: YouTube Data API

Collect structured video metadata using the official YouTube Data API v3.
Requires an API key from the Google Cloud Console.

In [None]:
# Section header plus a reminder of where the credential comes from.
print(
    "\nPART 2: YOUTUBE DATA API",
    "=" * 80,
    "Note: Requires YouTube API key in .env file",
    "Get API key from: https://console.cloud.google.com/\n",
    sep="\n",
)

# Load .env settings, then look the key up in the process environment.
load_dotenv()
api_key = os.getenv("YOUTUBE_API_KEY")

if api_key:
    # Key present: run the API collection helper.
    # NOTE(review): api_key is only checked here and is not passed to the
    # helper; presumably the helper reads the environment itself - confirm.
    api_data = youtube_api.collect_diverse_dataset(
        target_count=3000,
        output_file="../data/raw/api_videos.csv",
    )

    print("\nAPI Data Collection Complete: {} videos".format(len(api_data)))
    print("Saved to: data/raw/api_videos.csv")

    # Preview: only when something was collected, reload the CSV named above
    # and show its first three rows and its column names.
    if api_data:
        df_api = pd.read_csv("../data/raw/api_videos.csv")
        print("\nSample of API data:")
        print(df_api.head(3))
        print("\nColumns: {}".format(list(df_api.columns)))
else:
    # No key configured: explain how to provide one and fall back to an empty
    # result so later cells can still call len() on api_data.
    print(
        "WARNING: YOUTUBE_API_KEY not found!",
        "Please create a .env file with: YOUTUBE_API_KEY=your_api_key_here",
        "Skipping API collection...",
        sep="\n",
    )
    api_data = []

## Summary

In [None]:
# Header banner for the summary.
print("\n" + "=" * 80, "DATA COLLECTION SUMMARY", "=" * 80, sep="\n")

# Per-source and combined counts, formatted with thousands separators.
# Relies on scraped_data and api_data having been set by the earlier cells.
print("\nScraped Videos: {:,}".format(len(scraped_data)))
print("API Videos: {:,}".format(len(api_data)))
print("Total Videos: {:,}".format(len(scraped_data) + len(api_data)))

# Pointers to the follow-up notebooks, framed by rules.
print(
    "\n" + "=" * 80,
    "Next Steps:",
    "  1. Run notebook 02_preprocessing.ipynb",
    "  2. Run notebooks 03 & 04 to train models",
    "  3. Run notebook 05 for evaluation",
    "=" * 80,
    sep="\n",
)