In [None]:

import sys
import json
import os
import warnings

import pandas as pd

import helper
from youtube.config import youtube_watch_history_file, video_details_download_folder

# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# API key used by the download step further down
key = helper.get_google_api_key()

df = pd.read_json(youtube_watch_history_file)

# extract video_id from titleUrl; the capture stops at '&' or '#' so that
# trailing query parameters or fragments (e.g. '&t=42s') are not glued to the id
df['video_id'] = df['titleUrl'].str.extract(r'v=([^&#]+)', expand=False)

# keep rows where 'details' column is NaN which means not Ads
# (the column may be missing entirely when no entry carries 'details';
# in that case there is nothing to filter out)
if 'details' in df.columns:
    df = df[df['details'].isna()]
df = df.reset_index()

# create a list of video_ids from the existing 'video_id' column:
# - rows without a usable id (no titleUrl / no 'v=' parameter) are skipped,
#   otherwise a request for the literal id 'nan' would be made
# - each video is requested only once even if it was watched several times,
#   so no API quota is spent on duplicates (first-seen order is preserved)
video_ids = df['video_id'].dropna().drop_duplicates().tolist()
print(f"total videos: {len(video_ids)}")

# warn and stop if there are more than 10000 videos
# remove this if you know what you are doing and have an API key with higher quota
if len(video_ids) > 10000:
    print("WARNING: "
          "You have more than 10,000 videos in your history. Please use the splitter.py script to split the watch history file into smaller files before proceeding. The Google API has a limit of 10,000 requests per day. After splitting the file, you can run this script for each file separately.")
    sys.exit(1)

# download all video details in parallel
import time
from concurrent.futures import ThreadPoolExecutor
import requests

# Tell the user the download phase is beginning, then take a high-resolution
# timestamp; it is subtracted from a later perf_counter() reading to report
# the total elapsed time.
print("start downloading ... please wait ...")
start = time.perf_counter()


def download_image(vid, timeout=30):
    """Fetch the YouTube Data API details of one video and save them as JSON.

    Despite its name, this function does not download an image: it requests
    the ``contentDetails`` and ``snippet`` parts of the video resource and
    writes the pretty-printed response to ``<vid>.json`` inside
    ``video_details_download_folder``. The API key is read from the
    module-level ``key``.

    Args:
        vid: YouTube video id to look up; also used as the output file name.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread forever.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (e.g. quota exceeded). No file is written then.
        ValueError: (or a subclass) if the response body is not valid JSON.
    """
    # Let requests build and URL-encode the query string instead of
    # formatting raw values into the URL.
    response = requests.get(
        "https://content.googleapis.com/youtube/v3/videos",
        params={"id": vid, "part": "contentDetails,snippet", "key": key},
        timeout=timeout,
    )
    # Do not store an API error payload as if it were video details.
    response.raise_for_status()

    details = json.dumps(response.json(), indent=4)
    target = os.path.join(video_details_download_folder, f'{vid}.json')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(details)


# todo check cpu vs. cores etc.
max_workers = os.cpu_count()
print(f"max_workers: {max_workers}")

# Use submit() and keep every Future: executor.map() only re-raises a worker's
# exception when its result iterator is consumed, so an unconsumed map() call
# hides every failed download.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    submitted = [(vid, executor.submit(download_image, vid)) for vid in video_ids]

# Leaving the 'with' block waits for all workers, so every future is finished
# here and exception() returns immediately.
failures = []
for vid, future in submitted:
    error = future.exception()
    if error is not None:
        failures.append((vid, error))

# Only the exception type is printed: the messages of HTTP/connection errors
# can contain the full request URL, which includes the API key.
for vid, error in failures:
    print(f"failed to download {vid}: {type(error).__name__}")
print(f"downloaded: {len(submitted) - len(failures)}, failed: {len(failures)}")

print(f"total time: {time.perf_counter() - start}")
print("done downloading ... please wait ...")
