#### Youtube API Scraping

In [1]:
#import needed packages 
from googleapiclient.discovery import build
import pandas as pd

# load keys from  environmental var
from dotenv import load_dotenv
import os

In [5]:
# Load .env file
load_dotenv()

# Retrieve the API key
#api_key = os.getenv("API_KEY")
#second API key 
#api_key = os.getenv("API_KEY2")
#third API key 
api_key = os.getenv("API_KEY3")
#print(api_key)  

In [6]:
#instantiate a client 
youtube = build('youtube', 'v3', developerKey=api_key)

In [7]:
#create crime keywords in lower case
crime_keywords = [
    "homicide", "murder", "killing", "manslaughter", "shooting",
    "sex abuse", "rape", "assault", "domestic violence", "gender violence",
    "assault with dangerous weapon", "aggravated assault", "attempted manslaughter", "battery",
    "robbery", "theft", "mugging", "stealing", "robbing",
    "burglary", "break in", "forced entry", "housebreaking", "tresspassing",
    "theft auto", "motor vehicle theft", "carjacking", "vehicle larceny",
    "theft other",
    "motor vehicle theft",
    "arson", "destruction of property", "set on fire"
]

#dc names 
dc_names = [
    "Washington DC", "Washington D.C.", "DC", "D.C.", "District of Columbia",
    "The District", "District", "Capital City", "The Capital",
]



In [8]:
#import time

# Function to search for videos (with pagination support and delay)
def search_youtube(query):
    all_videos = []
    next_page_token = None

    while len(all_videos) < 500:  # Adjust the limit to 5,000 videos
        request = youtube.search().list(
            part="snippet",
            q=query,
            type="video",
            maxResults=50,  # Max allowed per request
            pageToken=next_page_token  # Token for the next page
        )
        response = request.execute()

        # Add videos from the current page to the list
        all_videos.extend(response['items'])

        # Check if there's another page
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break  # Exit if there are no more pages

       # time.sleep(1)  # Add a 1-second delay between requests to avoid rate limits

    return all_videos[:500]  # Limit to 5,000 videos

# Function to get video details (like count, view count)
#def get_video_details(video_id):
   # request = youtube.videos().list(
   #     part="statistics",
   #     id=video_id
   # )
   # response = request.execute()
    #return response['items'][0]['statistics']

# Loop over each combination of dc_names and crime_keywords
def scrape_videos():
    all_videos = []
    
    for dc_name in dc_names:
        for crime_keyword in crime_keywords:
            query = f"{dc_name} {crime_keyword}"
            print(f"Searching for: {query}")
            videos = search_youtube(query)
            
            for video in videos:
                video_id = video['id']['videoId']
                stats = get_video_details(video_id)
                
                # Only keep videos with more than 100 views
                view_count = int(stats.get('viewCount', 0))
                if view_count > 100:
                    video['views'] = view_count  # Add view count to the video data
                    all_videos.append(video)
            
            # Stop if we've reached 500 videos
            if len(all_videos) >= 500:
                break
        if len(all_videos) >= 500:
            break
    
    return all_videos[:500]  # Limit to 5,000 videos


In [17]:
#skip the next chunk 

In [7]:
# Function to get video details (like count, view count, description, comments, and publication date)
def get_video_details(video_id):
    request = youtube.videos().list(
        part="snippet,statistics",  # 'snippet' for description, 'statistics' for views, comments, etc.
        id=video_id
    )
    response = request.execute()
    video_data = response['items'][0]
    
    # Extract relevant details
    stats = video_data['statistics']
    snippet = video_data['snippet']
    
    video_details = {
        'viewCount': int(stats.get('viewCount', 0)),
        'commentCount': int(stats.get('commentCount', 0)),
        'description': snippet.get('description', ''),
        'publishedAt': snippet.get('publishedAt', ''),
        'likeCount': int(stats.get('likeCount', 0)),
    }
    
    return video_details

# Scrape and store results in DataFrame
def scrape_videos_to_df():
    all_videos_data = []
    
    videos = scrape_videos()  # Call the scrape_videos function
    
    for video in videos:
        title = video['snippet']['title']
        video_id = video['id']['videoId']
        
        # Get additional video details
        video_details = get_video_details(video_id)
        
        # Collecting video data into a dictionary
        video_data = {
            'Title': title,
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt']
        }
        
        # Add the video data to the list
        all_videos_data.append(video_data)
    
    # Convert the list of dictionaries to a DataFrame
    yt_crime_data = pd.DataFrame(all_videos_data)
    
    return yt_crime_data

# Get the data and display it
yt_crime_data = scrape_videos_to_df()

# Print the DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    D.C.&#39;s First Female Homicide Detective Unc...   
4    32-year-old mother, special police officer ide...   
..                                                 ...   
495  Deadly 24 hours across DC with 3 homicides rep...   
496       DC Mansion Murder Suspect Taken Into Custody   
497  32-year-old mother, special police officer ide...   
498  Video captures moments before mom allegedly sh...   
499     Why violent crime is rising in Washington D.C.   

                                             URL  Likes    Views  Comments  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552574      1510   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      278         

#### This is the right code to use

In [9]:
# Function to get video details (like count, view count, description, comments, and publication date, plus channel info)
def get_video_details(video_id):
    request = youtube.videos().list(
        part="snippet,statistics",  # 'snippet' for description, 'statistics' for views, comments, etc.
        id=video_id
    )
    response = request.execute()
    video_data = response['items'][0]
    
    # Extract relevant details
    stats = video_data['statistics']
    snippet = video_data['snippet']
    
    # Include channel details
    video_details = {
        'viewCount': int(stats.get('viewCount', 0)),
        'commentCount': int(stats.get('commentCount', 0)),
        'description': snippet.get('description', ''),
        'publishedAt': snippet.get('publishedAt', ''),
        'likeCount': int(stats.get('likeCount', 0)),
        'channelId': snippet.get('channelId', ''),  # Channel ID
        'channelTitle': snippet.get('channelTitle', '')  # Channel Title
    }
    
    return video_details


In [19]:
# Scrape and store results in DataFrame
def scrape_videos_to_df():
    all_videos_data = []
    
    videos = scrape_videos()  # Call the scrape_videos function
    
    for video in videos:
        title = video['snippet']['title']
        video_id = video['id']['videoId']
        
        # Get additional video details (including channel info)
        video_details = get_video_details(video_id)
        
        # Collecting video data into a dictionary
        video_data = {
            'Title': title,
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],  # Channel ID
            'Channel Title': video_details['channelTitle']  # Channel Title
        }
        
        # Add the video data to the list
        all_videos_data.append(video_data)
    
    # Convert the list of dictionaries to a DataFrame
    yt_crime_data = pd.DataFrame(all_videos_data)
    
    return yt_crime_data
# Get the data and display it
yt_crime_data = scrape_videos_to_df()

# Print the DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    D.C.&#39;s First Female Homicide Detective Unc...   
4    32-year-old mother, special police officer ide...   
..                                                 ...   
495  32-year-old mother, special police officer ide...   
496       DC Mansion Murder Suspect Taken Into Custody   
497        Suspect on the run after Foot Locker murder   
498  Video captures moments before mom allegedly sh...   
499     Why violent crime is rising in Washington D.C.   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552574   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      278   
2    https://www.youtube.

In [21]:
# Save the DataFrame to a CSV file
yt_crime_data.to_csv('yt_crime_data_2.csv', index=False)

In [None]:
#scraping the next 500 videos 

In [11]:
# Assuming the CSV file is 'yt-crime_data_2.csv'
def scrape_videos_to_df():
    # Load the existing data from the CSV file
    try:
        yt_crime_data = pd.read_csv('yt-crime_data_2.csv')
    except FileNotFoundError:
        yt_crime_data = pd.DataFrame()  # If the file doesn't exist, start with an empty DataFrame
    
    # Extract existing video IDs from the CSV
    existing_video_ids = yt_crime_data['URL'].apply(lambda x: x.split('v=')[-1]).tolist() if not yt_crime_data.empty else []
    
    all_videos_data = []
    
    # Call the scrape_videos function to get new videos
    videos = scrape_videos()
    
    for video in videos:
        video_id = video['id']['videoId']
        
        # Skip videos that are already in the CSV file (by checking the video ID)
        if video_id in existing_video_ids:
            continue
        
        # Get additional video details (including channel info)
        video_details = get_video_details(video_id)
        
        # Collecting video data into a dictionary
        video_data = {
            'Title': video['snippet']['title'],
            'URL': f"https://www.youtube.com/watch?v={video_id}",
            'Likes': video_details['likeCount'],
            'Views': video_details['viewCount'],
            'Comments Count': video_details['commentCount'],
            'Description': video_details['description'],
            'Published At': video_details['publishedAt'],
            'Channel ID': video_details['channelId'],  # Channel ID
            'Channel Title': video_details['channelTitle']  # Channel Title
        }
        
        # Add the video data to the list
        all_videos_data.append(video_data)
    
    # Convert the list of dictionaries to a DataFrame
    new_videos_data = pd.DataFrame(all_videos_data)
    
    # If there are new videos, append them to the existing CSV file
    if not new_videos_data.empty:
        yt_crime_data = pd.concat([yt_crime_data, new_videos_data], ignore_index=True)
        yt_crime_data.to_csv('yt-crime_data_2.csv', index=False)  # Save the updated DataFrame to the CSV file
    
    return yt_crime_data

# Get the updated data and display it
yt_crime_data = scrape_videos_to_df()

# Print the updated DataFrame (optional)
print(yt_crime_data)

Searching for: Washington DC homicide
Searching for: Washington DC murder
                                                 Title  \
0                 Inside the DC Mansion Murders (2015)   
1                       Suspects wanted in DC homicide   
2        DC Police search for suspect in homicide case   
3    32-year-old mother, special police officer ide...   
4    D.C.&#39;s First Female Homicide Detective Unc...   
..                                                 ...   
495                    3 dead in multiple DC shootings   
496  Still questions in unsolved murder in young wo...   
497  Two Men Arrested in DC Murder of Memphis Rappe...   
498  Suspect in DC Hotel Homicide Had Lengthy Crimi...   
499  DC Police identify suspect in murder case wher...   

                                             URL  Likes    Views  \
0    https://www.youtube.com/watch?v=wfcN2iaE2vE  13918  1552637   
1    https://www.youtube.com/watch?v=SDoOr39lAV0      1      280   
2    https://www.youtube.