# YouTube API scraping module on NUTRITION

### Step 1. Import libraries and define the API keys

In [None]:
# Standard library imports
import os
import csv
import time
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime
from calendar import monthrange

# Third-party library imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import whisper
#from langdetect import detect
from pytube import YouTube

import datetime
# `datetime` is the module here (bound by the `import datetime` just above).
# Take a single timestamp so the year, the month and the filename date stamp
# cannot disagree if the clock rolls over between separate now() calls.
_now = datetime.datetime.now()
current_year, current_month = _now.year, _now.month
today = _now.strftime("%Y%m%d")  # e.g. "20240131"; used in the output filename

# YouTube Data API keys.
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from the environment or an untracked config file instead.
API_KEY_1 = 'AIzaSXXXX'
API_KEY_2 = 'AIzaXXX'
API_KEY_3 = 'AIzaXXX'
API_KEY_4 = 'AIzaXXX'

# Keys in the order they are tried.
API_KEYS = [API_KEY_1, API_KEY_2, API_KEY_3, API_KEY_4]

### Step 2. Define YouTube scraping functions

In [None]:
import os
import csv
import time
from datetime import datetime
from calendar import monthrange
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

def get_youtube_service(api_key):
    """Return a YouTube Data API v3 client built with ``api_key``.

    Args:
        api_key: Developer key passed to ``build`` as ``developerKey``.

    Returns:
        The service object returned by ``googleapiclient.discovery.build``.
    """
    service = build('youtube', 'v3', developerKey=api_key)
    return service

def search_videos(youtube, keyword, start_date, end_date, max_results=50,
                  throttle_seconds=1):
    """Collect the IDs of every video returned by a keyword search.

    Follows the result pages via ``search().list_next`` until it returns
    ``None`` and gathers the ``videoId`` of each item.

    Args:
        youtube: API client exposing ``search().list`` and
            ``search().list_next``.
        keyword: Query string sent as ``q``.
        start_date: Value sent as ``publishedAfter``.
        end_date: Value sent as ``publishedBefore``.
        max_results: Page size sent as ``maxResults``.
        throttle_seconds: Pause between consecutive page requests. ``0``
            disables the pause.

    Returns:
        A list of video ID strings in the order the API returned them.
    """
    all_video_ids = []
    request = youtube.search().list(
        q=keyword,
        part='snippet',
        type='video',
        publishedAfter=start_date,
        publishedBefore=end_date,
        maxResults=max_results,
    )

    while request is not None:
        response = request.execute()
        # Tolerate a page without 'items' and items without a 'videoId'
        # rather than aborting the whole search with a KeyError.
        for item in response.get('items', []):
            video_id = item.get('id', {}).get('videoId')
            if video_id:
                all_video_ids.append(video_id)

        request = youtube.search().list_next(request, response)
        # Throttle only when another page will actually be fetched, so the
        # last page does not cost a pointless sleep.
        if request is not None and throttle_seconds > 0:
            time.sleep(throttle_seconds)

    return all_video_ids

def get_video_details(youtube, video_ids):
    """Fetch snippet and statistics for a batch of videos.

    Args:
        youtube: API client exposing ``videos().list``.
        video_ids: Iterable of video ID strings, sent comma-joined as ``id``.

    Returns:
        The ``items`` list from the response, or an empty list when no IDs
        were supplied or the response has no ``items``.
    """
    video_ids = list(video_ids)
    # Nothing to look up: skip the request instead of sending an empty
    # ``id`` filter.
    if not video_ids:
        return []

    details = youtube.videos().list(
        part='snippet,statistics',
        id=','.join(video_ids),
    ).execute()

    return details.get('items', [])

def append_to_csv(data, filename, header=None):
    """Append rows to a CSV file, writing the header first when needed.

    Args:
        data: Iterable of rows (each an iterable of cell values).
        filename: Path of the CSV file; created if it does not exist.
        header: Optional header row. It is written only when the file is
            missing or empty, so repeated calls do not duplicate it.
    """
    # An existing zero-byte file still needs its header, so check the size
    # as well as existence.
    write_header = (
        not os.path.exists(filename) or os.path.getsize(filename) == 0
    )
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if write_header and header:
            writer.writerow(header)
        writer.writerows(data)

def process_month(youtube, year, month):
    """Search one calendar month for SEARCH_TERM and append results to FILENAME.

    Args:
        youtube: API client passed through to ``search_videos`` and
            ``get_video_details``.
        year: Four-digit year of the month to process.
        month: Month number (1-12).

    Reads the module-level ``SEARCH_TERM`` and ``FILENAME``. All rows for the
    month are collected first and written in a single ``append_to_csv`` call.
    """
    last_day = monthrange(year, month)[1]
    window_start = f"{year}-{month:02d}-01T00:00:00Z"
    window_end = f"{year}-{month:02d}-{last_day}T23:59:59Z"

    found_ids = search_videos(youtube, SEARCH_TERM, window_start, window_end)

    rows = []
    # Look the videos up in batches of 50 IDs per details request.
    for offset in range(0, len(found_ids), 50):
        batch = found_ids[offset:offset + 50]
        for item in get_video_details(youtube, batch):
            video_id = item['id']
            snippet = item['snippet']
            title = snippet['title']
            channel = snippet['channelTitle']
            published = snippet['publishedAt']
            stats = item['statistics']
            # Counts are read with .get(), so a missing count becomes None.
            rows.append([
                video_id,
                title,
                channel,
                published,
                stats.get('viewCount'),
                stats.get('likeCount'),
            ])

    columns = ['Video ID', 'Title', 'Channel', 'Upload Date', 'View Count', 'Like Count']
    append_to_csv(rows, FILENAME, header=columns)


### Step 3. Define nutrition-focused search terms

In [None]:
# List of search terms - NUTRITION and METABOLISM

# Output CSV, date-stamped with `today`.
# NOTE: another cell also assigns FILENAME and TERMS; whichever cell runs
# last determines the values used by the scraping step.
FILENAME = f'../youtube_nutrition_{today}.csv'  # Continuation file

# THURSDAYS
TERMS = [
    'lion diet',
    'carnivore diet',
    'BBBE',
    'meat only diet',
    'zero carb diet',

    'ozempic',
    'semaglutide',
    'SGLT',
    'GLP1',
    'wegovy',
    'mounjaro',
]

print(FILENAME)

In [None]:
# FRIDAYS

# Output CSV, date-stamped with `today`.
# NOTE: another cell also assigns FILENAME and TERMS; whichever cell runs
# last determines the values used by the scraping step.
FILENAME = f'../youtube_nutrition_{today}.csv'  # Continuation file

TERMS = [
    'atkins diet',
    'paleo diet',
    'mediterranean diet',
    'LOWCARB',

    'plant based',
    'vegan',

    'fasting',
    'intermittent fasting',
    'OMAD one meal a day',
    'autophagy',

    'keto diet',
    'ketogenic',
    'ketosis',
    'LCHF low carb high fat',
]

### Step 4. Set the start date and start the scraping

In [None]:
# UPDATE THE Hardcoded values BELOW:
# Search term read by process_month(); main() overwrites it for each run.
SEARCH_TERM = ''


def main(search_term=None, start_year=2023, start_month=1):
    """Scrape every month from the start date up to the current month.

    Calls ``process_month`` once per calendar month. When a request fails
    with HTTP 403 or 429, the next key in ``API_KEYS`` is tried for the same
    month; any other ``HttpError`` stops the run.

    Args:
        search_term: Term to search for. When not ``None`` it replaces the
            module-level ``SEARCH_TERM``; otherwise the current value is used.
        start_year: First year to scrape (UPDATE THE STARTING YEAR).
        start_month: First month (1-12) to scrape (UPDATE THE STARTING MONTH).
    """
    # Imported locally because the notebook binds the global name `datetime`
    # to the module in one cell and to the class in another, so its meaning
    # depends on which cell ran last.
    from datetime import datetime as _datetime

    global SEARCH_TERM
    if search_term is not None:
        SEARCH_TERM = search_term

    # One timestamp, so year and month cannot straddle a rollover.
    now = _datetime.now()
    current_year, current_month = now.year, now.month
    year, month = start_year, start_month

    api_key_index = 0
    youtube = None  # built lazily, and rebuilt only after switching keys

    while (year, month) <= (current_year, current_month):
        if api_key_index >= len(API_KEYS):
            print("All API keys have exceeded their quota. Exiting.")
            break

        try:
            if youtube is None:
                youtube = get_youtube_service(API_KEYS[api_key_index])
            process_month(youtube, year, month)
        except HttpError as e:
            if e.resp.status in [403, 429]:
                # Report the key's position, not its value, so the secret
                # does not end up in notebook output or logs.
                print(f"Quota exceeded with API key #{api_key_index + 1}. Switching to next API key.")
                api_key_index += 1
                youtube = None
                # Retry the same month with the next key. process_month
                # writes its rows only at the end, so the failed attempt
                # left nothing behind in the CSV.
                continue
            print(f"Error encountered: {e}")
            break

        print(f"Processed {year}-{month:02d}")

        month += 1
        if month > 12:
            month = 1
            year += 1

    print(f"Data saved to {FILENAME}")

if __name__ == "__main__":
    # Run the scraper once per configured search term.
    for search_term in TERMS:
        print(f"Processing term: {search_term}")
        main(search_term)