In [None]:
import os
import re
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build

# Load variables defined in a local .env file into the process environment.
load_dotenv()

# The YouTube Data API key is read from the environment, never hard-coded.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise ValueError(
        "YOUTUBE_API_KEY not found in environment variables. "
        "Please create a .env file with your API key."
    )

# Module-level YouTube Data API v3 client, authenticated with the key above.
youtube = build("youtube", "v3", developerKey=API_KEY)

# The 10 simplified search keywords; each one is crawled separately.
KEYWORDS = [
    "music", "gaming", "sports", "education", "entertainment",
    "news politics", "howto style", "science technology",
    "comedy", "travel vlog",
]

def get_top_videos_by_keyword(keyword, region="US", max_results=480):
    """Return the most-viewed videos published in the last 24 hours for a keyword.

    The module-level ``youtube`` client is used to page through
    ``search().list`` (ordered by view count) and, for each page of video
    IDs, to fetch details with ``videos().list``.

    Args:
        keyword: Free-text search query.
        region: Region code passed to the search as ``regionCode``.
        max_results: Maximum number of records to return. Search pages hold
            up to 50 results, so the last page may overshoot; the surplus is
            dropped after sorting.

    Returns:
        A list of dicts sorted by ``views`` in descending order. Each dict
        has the keys ``video_id``, ``title``, ``description``, ``views``,
        ``likes``, ``comments``, ``hashtags``, ``channel``, ``published_at``,
        ``category_id``, ``duration``, ``definition``, ``crawl_date`` and
        ``id`` (the 1-based rank within the returned list). Missing counters
        are reported as 0 and missing text fields as ``""``.

    Errors raised by the API client's ``execute()`` calls are not caught here.
    """
    # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=1)

    # Search window bounds as UTC timestamps with a literal "Z" suffix.
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    published_after = start_time.strftime(timestamp_format)
    published_before = end_time.strftime(timestamp_format)

    videos = []
    next_page_token = None

    while len(videos) < max_results:
        search_resp = youtube.search().list(
            part="id",
            q=keyword,
            type="video",
            regionCode=region,
            publishedAfter=published_after,
            publishedBefore=published_before,
            order="viewCount",
            maxResults=50,
            pageToken=next_page_token,
        ).execute()

        # Skip any search hit that does not carry a video ID.
        video_ids = [
            item["id"]["videoId"]
            for item in search_resp.get("items", [])
            if (item.get("id") or {}).get("videoId")
        ]
        if not video_ids:
            break

        stats_resp = youtube.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(video_ids),
        ).execute()

        for item in stats_resp.get("items", []):
            # Any of these parts may be absent from an item; fall back to
            # empty dicts so one incomplete video cannot abort the crawl.
            snippet = item.get("snippet") or {}
            stats = item.get("statistics") or {}
            details = item.get("contentDetails") or {}

            videos.append({
                "video_id": item["id"],
                "title": snippet.get("title", ""),
                "description": snippet.get("description", ""),
                # "or 0" also covers counters present but empty/None.
                "views": int(stats.get("viewCount", 0) or 0),
                "likes": int(stats.get("likeCount", 0) or 0),
                "comments": int(stats.get("commentCount", 0) or 0),
                "hashtags": snippet.get("tags", []),
                "channel": snippet.get("channelTitle", ""),
                "published_at": snippet.get("publishedAt", ""),
                "category_id": snippet.get("categoryId", ""),
                "duration": details.get("duration", ""),
                "definition": details.get("definition", ""),
                # Crawl timestamp in UTC; tzinfo is stripped so the string
                # keeps the offset-free ISO format of earlier crawl files.
                "crawl_date": datetime.now(timezone.utc)
                .replace(tzinfo=None)
                .isoformat(),
            })

        next_page_token = search_resp.get("nextPageToken")
        if not next_page_token:
            break

        # Pause between pages to avoid hammering the API.
        time.sleep(0.5)

    # Rank by views, keep at most max_results, then number the survivors.
    top_videos = sorted(videos, key=lambda v: v["views"], reverse=True)[:max_results]
    for rank, video in enumerate(top_videos, start=1):
        video["id"] = rank

    return top_videos


if __name__ == "__main__":
    # Crawl every keyword in KEYWORDS and write one CSV per keyword.
    save_dir = os.path.expanduser("~/Desktop/597HUD/datas/")
    os.makedirs(save_dir, exist_ok=True)

    for kw in KEYWORDS:
        # Top-level boundary: a failure for one keyword is reported and the
        # loop moves on to the next keyword instead of aborting the run.
        try:
            print(f"🔍 Fetching videos for: {kw}")
            top_videos = get_top_videos_by_keyword(kw, region="US")
            if not top_videos:
                print(f"⚠️ No results for {kw}, skipping.")
                continue

            df = pd.DataFrame(top_videos)
            # Move the rank column "id" to the front; keep the rest in order.
            cols = ["id"] + [c for c in df.columns if c != "id"]
            df = df[cols]

            # Turn the keyword into a filename-safe token (e.g. spaces -> "_").
            fname = re.sub(r"[^A-Za-z0-9_]+", "_", kw)
            outname = os.path.join(save_dir, f"us_{fname}.csv")
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(outname, index=False, encoding="utf-8-sig")

            print(f"✅ Saved: {outname}, {len(df)} records\n")

        except Exception as e:
            print(f"❌ Error for {kw}: {e}\n")

🔍 Fetching videos for: gaming


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_gaming.csv, 480 records

🔍 Fetching videos for: sports


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_sports.csv, 480 records

🔍 Fetching videos for: education


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_education.csv, 480 records

🔍 Fetching videos for: entertainment


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_entertainment.csv, 480 records

🔍 Fetching videos for: news politics


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_news_politics.csv, 480 records

🔍 Fetching videos for: howto style


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_howto_style.csv, 114 records

🔍 Fetching videos for: science technology


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_science_technology.csv, 278 records

🔍 Fetching videos for: comedy


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_comedy.csv, 480 records

🔍 Fetching videos for: travel vlog


  end_time = datetime.utcnow()
  "crawl_date": datetime.utcnow().isoformat()


✅ Saved: /Users/xuchen/Desktop/597HUD/datas/us_travel_vlog.csv, 480 records

