In [None]:
import json
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import re
import isodate
from tqdm import tqdm


# Load environment variables from the local .env file
load_dotenv(dotenv_path=".env")  # You can omit dotenv_path if it's in the same folder

# YouTube Data API key read from the environment; None when the variable is not set
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
if not YOUTUBE_API_KEY:
    # Warn instead of raising: a run served entirely from the metadata cache needs no key.
    print("Warning: YOUTUBE_API_KEY is not set; videos missing from the cache cannot be fetched.")

# JSON file used to persist fetched video metadata between runs
METADATA_FILE = 'youtube_metadata_cache.json'

In [None]:
def load_metadata_cache():
    """Return the metadata cache stored in METADATA_FILE.

    Returns:
        The parsed JSON content of METADATA_FILE, or an empty dict when the
        file does not exist.
    """
    # Guard clause: nothing has been cached yet.
    if not os.path.exists(METADATA_FILE):
        return {}

    with open(METADATA_FILE, 'r', encoding='utf-8') as cache_file:
        stored = json.load(cache_file)
    return stored

def save_metadata_cache(cache):
    """Persist ``cache`` to METADATA_FILE as indented UTF-8 JSON.

    The JSON is first written to a sibling ``.tmp`` file which then replaces
    METADATA_FILE, so a failed or interrupted write leaves the previously
    saved cache file untouched instead of truncated.

    Args:
        cache: JSON-serialisable mapping to store.

    Raises:
        TypeError: if ``cache`` contains values ``json.dump`` cannot serialise.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = f"{METADATA_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(cache, f, indent=2, ensure_ascii=False)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, METADATA_FILE)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file, then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def fetch_video_metadata(video_id, api_key, cache, timeout=30):
    """Return the YouTube Data API video resource for ``video_id``, using ``cache``.

    A cached entry (including a cached ``None``) is returned without any
    network access. Otherwise the ``videos`` endpoint is queried for the
    ``snippet``, ``contentDetails`` and ``statistics`` parts.

    Only definitive answers are written to ``cache``: the first returned item,
    or ``None`` when a successful response contains no items. Request failures
    and API error responses are reported and return ``None`` WITHOUT being
    cached, so a later run can retry them.

    Args:
        video_id: ID of the video to look up.
        api_key: YouTube Data API key.
        cache: dict mapping video IDs to metadata (or None); mutated in place.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The metadata dict for the video, or None if it is unavailable.
    """
    if video_id in cache:
        return cache[video_id]

    try:
        # Let requests build the query string so the ID and key are URL-encoded.
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos",
            params={
                "part": "snippet,contentDetails,statistics",
                "id": video_id,
                "key": api_key,
            },
            timeout=timeout,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Print only the exception type: its message can embed the request
        # URL, which contains the API key.
        print(f"Warning: Request for video ID {video_id} failed ({type(exc).__name__}); result not cached.")
        return None

    # An error response has no 'items'; caching it would permanently mark a
    # valid video as missing.
    if not response.ok or not isinstance(data, dict) or 'error' in data:
        print(f"Warning: API error (HTTP {response.status_code}) for video ID {video_id}; result not cached.")
        return None

    if data.get('items'):
        metadata = data['items'][0]
        cache[video_id] = metadata
        return metadata

    print(f"Warning: Video ID {video_id} not found or inaccessible.")
    cache[video_id] = None
    return None

def extract_video_id(video_link):
    """Extract the video ID from a YouTube link.

    Recognises the ``v`` query parameter (``...watch?v=ID``) and, failing
    that, ID-in-path links (``youtu.be/ID``, ``/shorts/ID``, ``/embed/ID``).

    Args:
        video_link: the link text. Non-string values, such as the NaN pandas
            uses for a missing cell, are treated as having no ID.

    Returns:
        The video ID string, or None if no ID can be found.
    """
    if not isinstance(video_link, str):
        return None

    # Require `v` to be a whole parameter name (so `rev=` does not match) and
    # stop at `&` or `#` so a URL fragment is not captured into the ID.
    match = re.search(r'(?:^|[?&])v=([^&#]+)', video_link)
    if match is None:
        match = re.search(r'(?:youtu\.be/|/shorts/|/embed/)([^?&#/]+)', video_link)
    return match.group(1) if match else None

def get_duration_seconds_from_metadata(metadata):
    """Return the video length in whole seconds from a metadata dict.

    Reads ``metadata['contentDetails']['duration']`` and parses it with
    ``isodate.parse_duration``.

    Returns:
        The duration as an int number of seconds, or None when ``metadata`` is
        empty, the duration field is absent, or parsing fails (the error is
        printed).
    """
    if not metadata:
        return None
    if 'contentDetails' not in metadata:
        return None

    details = metadata['contentDetails']
    if 'duration' not in details:
        return None

    raw_duration = details['duration']
    try:
        parsed = isodate.parse_duration(raw_duration)
        seconds = int(parsed.total_seconds())
    except Exception as e:
        print(f"Error parsing duration: {e}")
        seconds = None
    return seconds

In [None]:
# Read the existing watch history CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='watch-history.csv')

In [None]:
# Read the persisted metadata cache and report how many entries it holds
cache = load_metadata_cache()
print(len(cache))

In [None]:
# Tunables for this run
MAX_VIDEOS = 5000  # number of leading watch-history rows to enrich
SAVE_EVERY = 50    # write the cache to disk after this many newly cached videos

durations = []   # (row label, duration in seconds or None)
categories = []  # (row label, categoryId or None)
links_to_process = df.head(MAX_VIDEOS)
previous_cache_len = len(cache)

try:
    for idx, link in tqdm(zip(links_to_process.index, links_to_process['video_link']), total=len(links_to_process), desc="Fetching metadata"):
        video_id = extract_video_id(link)
        # Rows without a recognisable video ID get no metadata.
        metadata = fetch_video_metadata(video_id, YOUTUBE_API_KEY, cache) if video_id else None

        # Extract duration (None when metadata is missing)
        duration_seconds = get_duration_seconds_from_metadata(metadata)

        # Extract categoryId
        category_id = None
        if metadata and 'snippet' in metadata and 'categoryId' in metadata['snippet']:
            category_id = metadata['snippet']['categoryId']

        durations.append((idx, duration_seconds))
        categories.append((idx, category_id))

        # Save in batches: rewriting the whole cache file after every single
        # new video makes total disk I/O grow quadratically with the run.
        if len(cache) - previous_cache_len >= SAVE_EVERY:
            save_metadata_cache(cache)
            previous_cache_len = len(cache)
finally:
    # Flush whatever is still unsaved, including when the loop is interrupted
    # or raises part-way through.
    if len(cache) > previous_cache_len:
        save_metadata_cache(cache)
        previous_cache_len = len(cache)

In [None]:
# Create or reset the columns from the collected (row label, value) pairs.
# Assignment aligns on the row labels, so rows that were not processed are
# left missing; re-running this cell rebuilds both columns from scratch.
df['duration_seconds'] = pd.Series(
    [seconds for _, seconds in durations],
    index=[row for row, _ in durations],
    dtype='Int64',  # nullable integer: whole seconds stay integers despite gaps
)
df['category_id'] = pd.Series(
    [cat_id for _, cat_id in categories],
    index=[row for row, _ in categories],
    dtype=object,  # values are stored exactly as collected
)

# Save the enriched file
df.to_csv('watch-history-enriched.csv', index=False)

print("✅ Done! Enriched file saved as watch-history-enriched.csv")