# Pulling data from YouTube using Pagination

### Works Cited
This pagination logic was adapted and inspired by the `get_video_ids` function from the INFO 492 Intensive Capstone course, Weeks 1 and 2 lab (`youtube-p1.ipynb`). The function demonstrates efficient retrieval of video IDs using YouTube API's pagination feature, which was instrumental in shaping the approach taken here for fetching and processing fashion-related video data.

In [1]:
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime
import re
import os
import time
import config

In [2]:
# set up YouTube Data API 
api_key = config.API_KEY
youtube = build('youtube', 'v3', developerKey=api_key)

### Making my YouTube Data Pull Code Reusable

In [3]:
def get_youtube_data(brand_name):
    published_after = datetime(2023, 9, 1).isoformat() + 'Z'
    published_before = datetime(2024, 4, 24).isoformat() + 'Z'
    keywords = ['haul', 'clothing', 'clothes', 'shop', 'shopping', 'try on', 'try-on', 'review', 'styling']
    social_media_links = ['pinterest', 'youtube', 'twitter', 'instagram', 'tiktok',
                          'reddit', 'twitch', 'facebook', 'thmatc', 'spotify']

    # fetch initial search results
    search_results = []
    request = youtube.search().list(
        q=brand_name,
        part='snippet',
        type='video',
        publishedAfter=published_after,
        publishedBefore=published_before,
        maxResults=50
    )
    response = request.execute()
    next_page_token = response.get('nextPageToken')
    
    while next_page_token is not None:
        # send request to YouTube API
        request = youtube.search().list(
            q=brand_name,
            part='snippet',
            type='video',
            publishedAfter=published_after,
            publishedBefore=published_before,
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        # add items from response 
        search_results.extend(response.get('items',[]))
        # get next page token for pagination
        next_page_token = response.get('nextPageToken')

    # process search results to extract relevant vid data
    brand_videos = []
    item_link_pattern = re.compile(r'(.+?) - (?:\$(\d+)\n)?(https?://\S+)')

    for search_result in search_results:
        # gets and stores video id
        video_id = search_result['id']['videoId']
        video_response = youtube.videos().list(
            # receive snippet part of data - title, description, tags, etc.
            part="snippet",
            id=video_id
        ).execute()

        # access description field of snipper
        description = video_response['items'][0]['snippet']['description']
        # extract links from description
        links = re.findall(r'(https?://\S+)', description)
        # makes all titles lowercase so code can match on any version of title:
        title = search_result['snippet']['title'].lower()

        # filters based on brand name and fashion related keywords
        if brand_name.lower() in title and any(keyword in title for keyword in keywords):
            # filters out social media links
            # filtered_links = [link for link in links if not any(keyword in link for keyword in social_media_links)]
            # match on regex pattern instead of filtering out other stuff...
            matches = item_link_pattern.findall(description)
            item_details = [{'name': match[0], 'link': match[1]} for match in matches]
            # get link to video
            video_link = f"https://www.youtube.com/watch?v={video_id}"
            brand_videos.append({
                'title': search_result['snippet']['title'],
                'links': item_details, 
                'videoLink': video_link
            })

    # process data and format for csv
    brand_youtube_data = []
    for video in brand_videos:
        if video['links']:
            brand_youtube_data.append({
                'Title': video['title'],
                'Links': '\n'.join(video['links']),
                'VideoLink': video['videoLink']
            })

    return brand_youtube_data

In [4]:
brands = ['Princess Polly', 'Mango']
all_brand_youtube_data = {}

for brand in brands:
    # start timing
    start_time = time.time()
    youtube_data = get_youtube_data(brand)
    all_brand_youtube_data[f'{brand.lower().replace(" ", "_")}_youtube_data'] = youtube_data
    # save data to csv files
    file_name = f'{brand.lower().replace(" ", "_")}_youtube_data'
    filename = f"../data/youtube_data/{file_name}.csv"
    pd.DataFrame(youtube_data).to_csv(filename, index=False)
    # calculate time
    elapsed_time = time.time() - start_time  
    print(f"CSV file saved: {filename} (runtime: {elapsed_time:.2f} seconds)")

TypeError: sequence item 0: expected str instance, dict found

In [4]:
# list of brands
brands = ['Adidas', 'Nike', 'Sandy Liang']

# done
# 'Aritzia', 'Skims', 'Shein', 'Princess Polly'
# 'Abercrombie & Fitch', 'Abercrombie and Fitch', 'Abercrombie', 'Amazon'
# 'Brandy', 'Brandy Melville', 'Uniqlo'
# 'Alo Yoga', 'Alo', 'Reformation'
# 'Boohoo', 'Nasty Gal', 'Patagonia', 'Hollister'
# 'VS', 'Victoria\'s Secret', 'PINK', 'Victorias Secret'
# 'Alice + Olivia', 'Alice & Olivia', 'Alica and Olivia', 'Billabong'
# 'Primark', 'Yes Friends', 'ASOS', 'Forever 21'
# 'Adidas', 'Nike', 'Sandy Liang'

In [5]:
# dict to store YouTube data for all brands
# keys: brand names, values: data from YouTube
all_brand_youtube_data = {}

# iterate over each brand and fetch and process data
for brand in brands:
    youtube_data = get_youtube_data(brand)
    # creates key for brand's data and assigns YouTube data to it to add to dict
    file_name = f'{brand.lower().replace(" ", "_")}_youtube_data'
    all_brand_youtube_data[file_name] = youtube_data

In [6]:
# save data to csv files (in youtube_data folder in the data folder)
for name, data in all_brand_youtube_data.items():
    filename = f"../data/youtube_data/{name}.csv"
    pd.DataFrame(data).to_csv(filename, index=False)
    print(f"CSV file saved: {filename}")

CSV file saved: ../data/youtube_data/adidas_youtube_data.csv
CSV file saved: ../data/youtube_data/nike_youtube_data.csv
CSV file saved: ../data/youtube_data/sandy_liang_youtube_data.csv
