In [1]:
import os
import time
import pandas as pd
from datetime import datetime, UTC
from googleapiclient.discovery import build
from dotenv import load_dotenv

# ====== Basic Configuration ======

# Pull settings from a local .env file into the process environment
load_dotenv()

# The API key is taken from the environment; stop right away if it is absent or empty
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("YOUTUBE_API_KEY not found in environment variables. Please create a .env file with your API key.")
youtube = build("youtube", "v3", developerKey=API_KEY)

# DATA_DIR holds the previously collected us_*.csv files; FOLLOWUP_DIR receives
# the follow-up snapshots and is created on first use.
DATA_DIR, FOLLOWUP_DIR = (
    os.path.expanduser(folder)
    for folder in ("~/Desktop/597HUD/datas/", "~/Desktop/597HUD/followups/")
)
os.makedirs(FOLLOWUP_DIR, exist_ok=True)

# ====== Read all video_ids ======
def load_all_video_ids(data_dir):
    """Collect the unique video IDs from every ``us_*.csv`` file in *data_dir*.

    Only files whose name starts with ``us_`` and ends with ``.csv`` are read,
    and only their ``video_id`` column is loaded. A file that cannot be read
    or parsed (for example because it has no ``video_id`` column) is reported
    and skipped so the remaining files are still processed.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        A sorted list of unique video-ID strings. Sorting makes the result,
        and therefore the batches built from it, reproducible between runs.

    Raises:
        OSError: If *data_dir* itself cannot be listed.
    """
    video_ids = set()
    # Sorted listing keeps the progress log in a stable order.
    for fname in sorted(os.listdir(data_dir)):
        if not (fname.startswith("us_") and fname.endswith(".csv")):
            continue
        fpath = os.path.join(data_dir, fname)
        try:
            # Read IDs as text: type inference would turn an all-digit ID into
            # a number and silently drop its leading zeros.
            df = pd.read_csv(fpath, usecols=["video_id"], dtype={"video_id": str})
        except (OSError, ValueError) as e:
            # ValueError covers pandas' parser, empty-file, missing-column and
            # text-decoding errors; OSError covers file-system failures.
            print(f"⚠️ Unable to read {fname}: {e}")
            continue
        video_ids.update(df["video_id"].dropna().tolist())
        # The count reported is the number of rows in the file, not unique IDs.
        print(f"✅ Read {fname}  {len(df)}  video IDs")
    return sorted(video_ids)

# ====== Batch crawl latest statistics ======
def fetch_video_stats(video_ids):
    all_data = []
    for i in range(0, len(video_ids), 50):
        batch_ids = video_ids[i:i+50]
        try:
            req = youtube.videos().list(
                part="statistics",
                id=",".join(batch_ids)
            )
            resp = req.execute()

            for item in resp.get("items", []):
                stats = item.get("statistics", {}) or {}
                video = {
                    "video_id": item["id"],
                    "views": int(stats.get("viewCount", 0) or 0),
                    "likes": int(stats.get("likeCount", 0) or 0),
                    "comments": int(stats.get("commentCount", 0) or 0),
                    "crawl_date": datetime.now(UTC).isoformat()
                }
                all_data.append(video)

            # Pause slightly every 50 videos to prevent rate limiting
            time.sleep(0.5)

        except Exception as e:
            print(f"‚ùå Batch {i//50+1} error: {e}")
            time.sleep(2)
    return pd.DataFrame(all_data)

# ====== Main Process ======
if __name__ == "__main__":
    # Daily follow-up run: gather every known video ID, fetch its current
    # statistics and store the snapshot as followups/followup_<date>.csv.
    print("🚀 Start tracking YouTube video latest data...\n")

    video_ids = load_all_video_ids(DATA_DIR)
    print(f"\nCollected {len(video_ids)}  video IDs, ready to update statistics.\n")

    df_update = fetch_video_stats(video_ids)
    print(f"✅ Successfully crawled {len(df_update)}  updated records.")

    # Save file
    # NOTE(review): the filename uses the local calendar date, while the
    # crawl_date column written by fetch_video_stats is in UTC, so the two can
    # differ around midnight. A second run on the same local day overwrites
    # the earlier file.
    date_str = datetime.now().strftime("%Y-%m-%d")
    out_path = os.path.join(FOLLOWUP_DIR, f"followup_{date_str}.csv")
    # utf-8-sig writes a BOM at the start of the file.
    df_update.to_csv(out_path, index=False, encoding="utf-8-sig")

    print(f"📦 Saved to: {out_path}")
    print("🎯 Today's tracking task completed!")

🚀 Start tracking YouTube video latest data...

✅ Read us_science_technology.csv  480  video IDs
✅ Read us_howto_style.csv  288  video IDs
✅ Read us_education.csv  480  video IDs
✅ Read us_news_politics.csv  480  video IDs
✅ Read us_entertainment.csv  480  video IDs
✅ Read us_gaming.csv  480  video IDs
✅ Read us_music.csv  480  video IDs
✅ Read us_comedy.csv  480  video IDs
✅ Read us_travel_vlog.csv  480  video IDs
✅ Read us_sports.csv  480  video IDs

Collected 4316  video IDs, ready to update statistics.

✅ Successfully crawled 4297  updated records.
📦 Saved to: /Users/xuchen/Desktop/597HUD/followups/followup_2025-10-11.csv
🎯 Today's tracking task completed！


📦 Follow-up files contain 4297  video_ids
✅ Read us_science_technology.csv: 480  records
✅ Read us_howto_style.csv: 288  records
✅ Read us_education.csv: 480  records
✅ Read us_news_politics.csv: 480  records
✅ Read us_entertainment.csv: 480  records
✅ Read us_gaming.csv: 480  records
✅ Read us_music.csv: 480  records
✅ Read us_comedy.csv: 480  records
✅ Read us_travel_vlog.csv: 480  records
✅ Read us_sports.csv: 480  records

📊 Original files contain 4316  unique video_ids

🆕 Number of IDs in follow-up but not in original: 0
Examples: []

🕳️ Number of IDs in original but not in follow-up: 19
Examples: ['zOeCNHAYoHE', 'JQsN_mr-6k0', 'N00kuq9wkDs', 'u8bT6Y_BwLY', 'EpNEv9NPvWk', 'JXGIEc9k_jI', 'Ej46GY1LzhE', 'NnKrpgFV3ok', 'JCtYRw_W8Ns', '9R3NZfGAPOg']

📄 Saved separately:
   missing_in_original.csv  (in follow-up but not in original)
   missing_in_followup.csv  (in original but not in follow-up)


set()
