#### Youtube API Scraping

In [1]:
#import needed packages 
from googleapiclient.discovery import build
import pandas as pd

# load keys from  environmental var
from dotenv import load_dotenv
import os

In [9]:
# Load .env file
load_dotenv()

# Retrieve the API key
#api_key = os.getenv("API_KEY")
#second API key 
api_key = os.getenv("API_KEY2")
#third API key 
#api_key = os.getenv("API_KEY3")
#print(api_key)  

In [10]:
#instantiate a client 
youtube = build('youtube', 'v3', developerKey=api_key)

In [11]:
#create crime keywords in lower case
crime_keywords = [
    "homicide", "murder", "killing", "manslaughter", "shooting",
    "sex abuse", "rape", "assault", "domestic violence", "gender violence",
    "assault with dangerous weapon", "aggravated assault", "attempted manslaughter", "battery",
    "robbery", "theft", "mugging", "stealing", "robbing",
    "burglary", "break in", "forced entry", "housebreaking", "tresspassing",
    "theft auto", "motor vehicle theft", "carjacking", "vehicle larceny",
    "theft other",
    "motor vehicle theft",
    "arson", "destruction of property", "set on fire"
]

#dc names 
dc_names = [
    "Washington DC", "Washington D.C.", "DC", "D.C.", "District of Columbia",
    "The District", "District", "Capital City", "The Capital",
]



In [12]:
#import time

# Function to search for videos (with pagination support and delay)
def search_youtube(query):
    all_videos = []
    next_page_token = None

    while len(all_videos) < 500:  # Adjust the limit to 5,000 videos
        request = youtube.search().list(
            part="snippet",
            q=query,
            type="video",
            maxResults=50,  # Max allowed per request
            pageToken=next_page_token  # Token for the next page
        )
        response = request.execute()

        # Add videos from the current page to the list
        all_videos.extend(response['items'])

        # Check if there's another page
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break  # Exit if there are no more pages

       # time.sleep(1)  # Add a 1-second delay between requests to avoid rate limits

    return all_videos[:500]  # Limit to 5,000 videos

# Function to get video details (like count, view count)
#def get_video_details(video_id):
   # request = youtube.videos().list(
   #     part="statistics",
   #     id=video_id
   # )
   # response = request.execute()
    #return response['items'][0]['statistics']

# Loop over each combination of dc_names and crime_keywords
def scrape_videos():
    all_videos = []
    
    for dc_name in dc_names:
        for crime_keyword in crime_keywords:
            query = f"{dc_name} {crime_keyword}"
            print(f"Searching for: {query}")
            videos = search_youtube(query)
            
            for video in videos:
                video_id = video['id']['videoId']
                stats = get_video_details(video_id)
                
                # Only keep videos with more than 100 views
                view_count = int(stats.get('viewCount', 0))
                if view_count > 100:
                    video['views'] = view_count  # Add view count to the video data
                    all_videos.append(video)
            
            # Stop if we've reached 500 videos
            if len(all_videos) >= 500:
                break
        if len(all_videos) >= 500:
            break
    
    return all_videos[:500]  # Limit to 500 videos


#### Function to get the video details 

In [16]:
# Function to get video details (like count, view count, description, comments, and publication date, plus channel info)
def get_video_details(video_id):
    request = youtube.videos().list(
        part="snippet,statistics",  
        id=video_id
    )
    response = request.execute()
    video_data = response['items'][0]
    
    # Extract relevant details
    stats = video_data['statistics']
    snippet = video_data['snippet']
    
    # Include channel details
    video_details = {
        'viewCount': int(stats.get('viewCount', 0)),
        'commentCount': int(stats.get('commentCount', 0)),
        'description': snippet.get('description', ''),
        'publishedAt': snippet.get('publishedAt', ''),
        'likeCount': int(stats.get('likeCount', 0)),
        'channelId': snippet.get('channelId', ''),  # Channel ID
        'channelTitle': snippet.get('channelTitle', '')  # Channel Title
    }
    
    return video_details


In [19]:
# Scrape and store results in DataFrame
#create a function that scrape and stores the vidoes into a dataframe 
def scrape_videos_to_df():
    all_videos_data = []
      # Call the scrape_videos function
    videos = scrape_videos()
    
    for video in videos:
        title = video['snippet']['title']
        video_id = video['id']['videoId']
        
        # Get additional video details 
        video_details = get_video_details(video_id)
        
        # Collecting video data into a dictionary
        video_data = {
            'Title': title,
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],  
            'Channel Title': video_details['channelTitle'] 
        }
        
        # Add the video data to the list
        all_videos_data.append(video_data)
    
    # Convert the list of dictionaries to a DataFrame
    yt_crime_data = pd.DataFrame(all_videos_data)
    
    return yt_crime_data
# Get the data and display it
yt_crime_data = scrape_videos_to_df()

# Print the DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    D.C.&#39;s First Female Homicide Detective Unc...   
4    32-year-old mother, special police officer ide...   
..                                                 ...   
495  32-year-old mother, special police officer ide...   
496       DC Mansion Murder Suspect Taken Into Custody   
497        Suspect on the run after Foot Locker murder   
498  Video captures moments before mom allegedly sh...   
499     Why violent crime is rising in Washington D.C.   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552574   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      278   
2    https://www.youtube.

In [21]:
# Save the DataFrame to a CSV file
yt_crime_data.to_csv('yt_crime_data_2.csv', index=False)

#### Running the function multiple times to scrape 500 videos per scrape because Youtube has a token limit

In [11]:
# scraping the next 500 videos and appending to the csv file 
def scrape_videos_to_df():
    # Load the existing data from the CSV file
    try:
        yt_crime_data = pd.read_csv('yt-crime_data_2.csv')
    except FileNotFoundError:
        # If the file doesn't exist, start with an empty DataFrame
        yt_crime_data = pd.DataFrame()  
    
    # Extract existing video IDs from the CSV
    existing_video_ids = yt_crime_data['URL'].apply(lambda x: x.split('v=')[-1]).tolist() if not yt_crime_data.empty else []
    
    all_videos_data = []
    
    # Call the scrape_videos function to get new videos
    videos = scrape_videos()
    
    for video in videos:
        video_id = video['id']['videoId']
        
        # Skip videos that are already in the CSV file (by checking the video ID)
        if video_id in existing_video_ids:
            continue
        
        # Get additional video details
        video_details = get_video_details(video_id)
        
        # Collecting video data into a dictionary
        video_data = {
            'Title': video['snippet']['title'],
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],
            'Channel Title': video_details['channelTitle']  
        }
        
        # Add the video data to the list
        all_videos_data.append(video_data)
    
    # Convert the list of dictionaries to a DataFrame
    new_videos_data = pd.DataFrame(all_videos_data)
    
    # If there are new videos, append them to the existing CSV file
    if not new_videos_data.empty:
        yt_crime_data = pd.concat([yt_crime_data, new_videos_data], ignore_index=True)
        yt_crime_data.to_csv('yt-crime_data_2.csv', index=False)  # Save the updated DataFrame to the CSV file
    
    return yt_crime_data

# Get the updated data and display it
yt_crime_data = scrape_videos_to_df()

# Print the updated DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    32-year-old mother, special police officer ide...   
4    D.C.&#39;s First Female Homicide Detective Unc...   
..                                                 ...   
495                    3 dead in multiple DC shootings   
496  Still questions in unsolved murder in young wo...   
497  Two Men Arrested in DC Murder of Memphis Rappe...   
498  Suspect in DC Hotel Homicide Had Lengthy Crimi...   
499  DC Police identify suspect in murder case wher...   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552637   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      280   
2    https://www.youtube.

#### Repeat the same procedure as above to get the next 500 videos 

In [7]:
# scrape and add to existing csv file 
def scrape_videos_to_df(filename='yt-crime_data_2.csv'):
    #Load the existing data from the CSV file
    try:
        yt_crime_data = pd.read_csv(filename)
    except FileNotFoundError:
        yt_crime_data = pd.DataFrame()  # If the file doesn't exist, start with an empty DataFrame
    
    #Extract existing video IDs from the 'URL' column 
    existing_video_ids = yt_crime_data['URL'].apply(lambda x: x.split('v=')[-1]).tolist() if not yt_crime_data.empty else []

    # Initialize a list to store new video data
    all_videos_data = []

    # Call the scrape_videos function to get new videos
    videos = scrape_videos()  

    # Loop through the scraped videos and check if they already exist in the DataFrame
    for video in videos:
        video_id = video['id']['videoId']

        # Skip videos that are already in the CSV file (by checking the video ID)
        if video_id in existing_video_ids:
            continue

        # Get additional video details (this assumes the get_video_details function is defined)
        video_details = get_video_details(video_id)

        # Collect video data into a dictionary
        video_data = {
            'Title': video['snippet']['title'],
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'], 
            'Channel Title': video_details['channelTitle'] 
        }

        # Add the video data to the list
        all_videos_data.append(video_data)

    # Convert the list of dictionaries to a DataFrame
    new_videos_data = pd.DataFrame(all_videos_data)

    # If there are new videos, append them to the existing DataFrame and save to CSV
    if not new_videos_data.empty:
        yt_crime_data = pd.concat([yt_crime_data, new_videos_data], ignore_index=True)
        # Save the updated DataFrame to the CSV file
        yt_crime_data.to_csv(filename, index=False)  

    return yt_crime_data

# Get the updated data and display it
yt_crime_data = scrape_videos_to_df('yt-crime_data_2.csv')

# Print the updated DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    32-year-old mother, special police officer ide...   
4    D.C.&#39;s First Female Homicide Detective Unc...   
..                                                 ...   
617  Man Charged In Murder Of D.C. Jogger | NBC Nig...   
618                 D.C. Mansion Murder Suspect Caught   
619  DC man charged with murder in shooting death o...   
620  DC Police Sergeant charged with second-degree ...   
621  Murder victim&#39;s grandmother angry at DC po...   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552637   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      280   
2    https://www.youtube.

#### Scraping while handling pagination

In [15]:

#get 500 more views with pagination
# Function to handle YouTube video scraping with pagination
def scrape_videos_with_pagination(max_results=600):
    # Initialize an empty list to store the video results
    all_videos = []
    
    # Define the initial API request parameters
    next_page_token = None  # Start with no page token
    results_fetched = 0  # Keep track of how many results we've fetched
    
    while results_fetched < max_results:
        # Make the API request with the nextPageToken for pagination
        search_response = youtube.search().list(
            part="snippet",
            maxResults=50,  # Maximum results per page
            pageToken=next_page_token  # Token to fetch the next page
        ).execute()
        
        # Append the fetched videos to the list
        all_videos.extend(search_response['items'])
        results_fetched += len(search_response['items'])  # Count how many we've fetched
        
        # Check if there's a next page
        next_page_token = search_response.get('nextPageToken')
        
        # If no nextPageToken, we've reached the end of the results
        if not next_page_token:
            break

    return all_videos

def scrape_videos_to_df(filename='yt-crime_data_2.csv'):
    # Load the existing data from the CSV file
    try:
        yt_crime_data = pd.read_csv(filename)
    except FileNotFoundError:
          # If the file doesn't exist, start with an empty DataFrame
        yt_crime_data = pd.DataFrame()
    
    # Extract existing video IDs from the 'URL' column 
    existing_video_ids = yt_crime_data['URL'].apply(lambda x: x.split('v=')[-1]).tolist() if not yt_crime_data.empty else []

    # Initialize a list to store new video data
    all_videos_data = []

    # Call the scrape_videos_with_pagination function to get new videos
    videos = scrape_videos_with_pagination(max_results=600) 

    # Loop through the scraped videos and check if they already exist in the DataFrame
    for video in videos:
        # Check if 'id' and 'videoId' are present in the video object
        if 'id' in video and 'videoId' in video['id']:
            video_id = video['id']['videoId']
        else:
            # Skip videos that don't have a valid 'videoId'
            continue

        # Skip videos that are already in the CSV file (by checking the video ID)
        if video_id in existing_video_ids:
            continue

        # Get additional video details 
        video_details = get_video_details(video_id)

        # Collecting video data into a dictionary
        video_data = {
            'Title': video['snippet']['title'],
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],  # Channel ID
            'Channel Title': video_details['channelTitle']  # Channel Title
        }

        # Add the video data to the list
        all_videos_data.append(video_data)

    # Convert the list of dictionaries to a DataFrame
    new_videos_data = pd.DataFrame(all_videos_data)

    # append new vidoes to the existing DataFrame and save to CSV
    if not new_videos_data.empty:
        yt_crime_data = pd.concat([yt_crime_data, new_videos_data], ignore_index=True)
        yt_crime_data.to_csv(filename, index=False)  # Save the updated DataFrame to the CSV file

    return yt_crime_data

# Get the updated data and display it
yt_crime_data = scrape_videos_to_df('yt-crime_data_2.csv')

# view the updated DataFrame
print(yt_crime_data)

                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    32-year-old mother, special police officer ide...   
4    D.C.&#39;s First Female Homicide Detective Unc...   
..                                                 ...   
717                           😢【2ch感動スレ】感動の迷言集～チャッカマン～   
718  Pudo ser el mejor delantero del mundo pero su ...   
719              兄の爪切りを見ただけで拒絶反応を起こした猫がとんでもないことになりました…   
720  UNKNOWN SISTERS OF MUKESH AMBANI!!! #mukeshamb...   
721                               😢【2ch感動スレ】感動の迷言集～真相～   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552637   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      280   
2    https://www.youtube.com/watch?v=H3Au26Fws64      8     1709   
3    https://www.youtube.com/wa

#### Get all videos including those with less than 100 views  


In [18]:
# Function to handle YouTube video scraping with pagination
def scrape_videos_with_pagination(max_results=500):
    # Initialize an empty list to store the video results
    all_videos = []
    
    # Define the initial API request parameters
    # Start with no page token
    next_page_token = None  
    # Keep track of how many results we've fetched
    results_fetched = 0  
    
    while results_fetched < max_results:
        # Make the API request with the nextPageToken for pagination
        search_response = youtube.search().list(
            part="snippet",
            maxResults=50,  # Maximum results per page
            pageToken=next_page_token  # Token to fetch the next page
        ).execute()
        
        # Append the fetched videos to the list
        all_videos.extend(search_response['items'])
        results_fetched += len(search_response['items'])  # Count how many we've fetched
        
        # Check if there's a next page
        next_page_token = search_response.get('nextPageToken')
        
        # If no nextPageToken, we've reached the end of the results
        if not next_page_token:
            break

    return all_videos

def scrape_videos_to_df(filename='yt-crime_data_2.csv'):
    # Load the existing data from the CSV file
    try:
        yt_crime_data = pd.read_csv(filename)
    except FileNotFoundError:
        # If the file doesn't exist, start with an empty DataFrame
        yt_crime_data = pd.DataFrame()  
    
    # Extract existing video IDs from the 'URL' column 
    existing_video_ids = yt_crime_data['URL'].apply(lambda x: x.split('v=')[-1]).tolist() if not yt_crime_data.empty else []

    # Initialize a list to store new video data
    all_videos_data = []

    #Call the scrape_videos_with_pagination function to get new videos
    videos = scrape_videos_with_pagination(max_results=600)  
    #Loop through the scraped videos and check if they already exist in the DataFrame
    for video in videos:
        # Check if 'id' and 'videoId' are present in the video object
        if 'id' in video and 'videoId' in video['id']:
            video_id = video['id']['videoId']
        else:
            # Skip videos that don't have a valid 'videoId'
            continue

        # Skip videos that are already in the CSV file
        if video_id in existing_video_ids:
            continue

        # Get additional video details
        video_details = get_video_details(video_id)

        # Collect video data into a dictionary
        video_data = {
            'Title': video['snippet']['title'],
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],  # Channel ID
            'Channel Title': video_details['channelTitle']  # Channel Title
        }

        # Add the video data to the list
        all_videos_data.append(video_data)

    # convert the list of dictionaries to a DataFrame
    new_videos_data = pd.DataFrame(all_videos_data)

    # append new videos to the existing DataFrame and save to CSV
    if not new_videos_data.empty:
        yt_crime_data = pd.concat([yt_crime_data, new_videos_data], ignore_index=True)
        yt_crime_data.to_csv(filename, index=False)  # Save the updated DataFrame to the CSV file

    return yt_crime_data

# Get the updated data and display it
yt_crime_data = scrape_videos_to_df('yt-crime_data_2.csv')

# Print the updated DataFrame (optional)
print(yt_crime_data)

                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    32-year-old mother, special police officer ide...   
4    D.C.&#39;s First Female Homicide Detective Unc...   
..                                                 ...   
784  Live🔴Akhilesh Yadav सदन में Sambhal पर बोले हि...   
785  Comprando a decoração do meu quarto novo 🌸🩷 #v...   
786  Me quede asi 👁️👄👁️ #minivlog #bakabakamx #lipg...   
787          TÍPICAS TRAVESURAS DE HERMANOS: (PARTE 3)   
788                CUANDO TE TOCA REPARTIDORA PREMIUM:   

                                             URL   Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE   13918  1552637   
1    https://www.youtube.com/watch?v=SDoOr39lAV0       1      280   
2    https://www.youtube.com/watch?v=H3Au26Fws64       8     1709   
3    https://www.youtube.co