In [None]:
##### Retrieve YouTube channel and video details using Python and the YouTube Data API
##### The approach for parsing the channel data follows the ideas from this video: https://www.youtube.com/watch?v=SwSbnmqk3zY

In [None]:
from googleapiclient.discovery import build
import pandas as pd
import seaborn as sb
import os
import csv
import nltk ##natural language toolkit to get the library of commonly used words
from nltk.corpus import stopwords
import string

In [None]:
# The API key is read from the environment so it is never stored in the notebook.
# Set the YOUTUBE_API_KEY environment variable to the key that was created before
# running this cell; a missing variable raises KeyError here.
api_key = os.environ["YOUTUBE_API_KEY"]

# Channel handles to analyse. This list is passed to get_handle_stats further down,
# and the later reference/word-cloud cells filter on the 'SQLBI' channel.
HandleList = ["@SQLBI", "@Tableau", "@manutd", "@mancity", "@realmadrid", "@Databricks"]

api_service_name = 'youtube'
api_version = 'v3'

# Client object used by every request function in this notebook.
ytdata = build(api_service_name, api_version, developerKey=api_key)

In [None]:
## Processing starts here. get_handle_stats retrieves channel data through the API's channels().list method.
## It takes a list of channel handles and extracts the needed fields from each JSON response.

def get_handle_stats(ytdata, HandleList):
    """Fetch summary details for each YouTube channel handle.

    One ``channels().list`` request is issued per handle, asking for the
    ``snippet``, ``contentDetails`` and ``statistics`` parts. The first
    channel in each response is flattened into a dict.

    Parameters
    ----------
    ytdata : object
        API client exposing ``channels().list(...).execute()``.
    HandleList : iterable of str
        Channel handles, e.g. ``"@SQLBI"``.

    Returns
    -------
    list of dict
        One dict per resolved handle with the keys ``Channel``, ``ChannelID``,
        ``ChannelImg``, ``Subscribers``, ``Views``, ``Videos`` and
        ``PlaylistID`` (the uploads playlist, usable to list the channel's
        videos). Handles that resolve to no channel are skipped with a
        warning. A statistics value missing from the response is ``None``.
    """
    import warnings

    all_ytdata = []

    for handle in HandleList:
        response = ytdata.channels().list(
            part="snippet,contentDetails,statistics",
            forHandle=handle,
        ).execute()

        # A handle that matches no channel yields a response without any items;
        # indexing [0] blindly would abort the whole run for one bad handle.
        items = response.get('items')
        if not items:
            warnings.warn("No channel found for handle {!r}; skipping it.".format(handle))
            continue

        channel_item = items[0]
        snippet = channel_item['snippet']
        statistics = channel_item.get('statistics', {})

        ytinfo = dict(
            Channel=snippet['title'],
            ChannelID=channel_item['id'],
            ChannelImg=snippet['thumbnails']['high']['url'],
            # .get(): a count that is absent (e.g. a hidden subscriber count)
            # becomes None instead of raising KeyError.
            Subscribers=statistics.get('subscriberCount'),
            Views=statistics.get('viewCount'),
            Videos=statistics.get('videoCount'),
            PlaylistID=channel_item['contentDetails']['relatedPlaylists']['uploads'],
        )
        all_ytdata.append(ytinfo)

    return all_ytdata

In [None]:
# get_handle_stats returns a list with one dict per channel;
# loading it into a DataFrame gives one row per channel for further processing.
channel_stats = get_handle_stats(ytdata=ytdata, HandleList=HandleList)
channel_data = pd.DataFrame(data=channel_stats)

In [None]:
# The API returns the counts as strings; convert each count column to a numeric dtype.
for count_column in ('Subscribers', 'Views', 'Videos'):
    channel_data[count_column] = pd.to_numeric(channel_data[count_column])
#channel_data.dtypes
channel_data
#ChannelID and PlaylistID seems to give the same details

In [None]:
## Verifying the values 

# Pull the name and uploads playlist of the first channel as a spot check,
# then display how many channels were loaded.
first_channel_row = channel_data.iloc[0]
channel = first_channel_row["Channel"]
playlistid1 = first_channel_row["PlaylistID"]
len(channel_data)

In [None]:
## function get_video_ids - given the playlist id of a channel, retrieves the ids of all videos uploaded to that channel

def get_video_ids(ytdata, playlistid):
    """Return the video id of every item in a playlist.

    The playlist is read through ``playlistItems().list`` in pages of up to
    50 items; each response's ``nextPageToken`` is passed to the next request
    until a response no longer carries one.

    Parameters
    ----------
    ytdata : object
        API client exposing ``playlistItems().list(...).execute()``.
    playlistid : str
        Id of the playlist to read.

    Returns
    -------
    list of str
        Video ids in the order the API returned them.
    """
    video_ids = []
    page_token = None

    while True:
        query = dict(part="contentDetails", playlistId=playlistid, maxResults=50)
        if page_token is not None:
            # Only follow-up requests carry a pageToken; the first request has none.
            query["pageToken"] = page_token

        page = ytdata.playlistItems().list(**query).execute()
        video_ids.extend(item['contentDetails']['videoId'] for item in page['items'])

        page_token = page.get('nextPageToken')
        if page_token is None:
            return video_ids

In [None]:
def get_video_details(ytdata, video_ids, channel, subscribers, ChannelImg):
    """Fetch title, publish time, duration and statistics for the given videos.

    ``videos().list`` is called with batches of at most 50 ids, so the id
    list is processed in slices of 50.

    Parameters
    ----------
    ytdata : object
        API client exposing ``videos().list(...).execute()``.
    video_ids : list of str
        Ids of the videos to look up.
    channel, subscribers, ChannelImg
        Channel-level values copied unchanged into every returned record.

    Returns
    -------
    list of dict
        One dict per video returned by the API, with the keys ``Channel``,
        ``Subscribers``, ``ChannelImg``, ``Title``, ``Video_id``,
        ``Published_time_UTC``, ``VideoImg``, ``Duration``, ``Views``,
        ``Likes``, ``Favorites`` and ``Comments``. A statistic that is
        missing from the response is stored as ``None``.
    """
    all_video_stats = []

    for start in range(0, len(video_ids), 50):
        response = ytdata.videos().list(
            part="snippet, statistics, contentDetails",
            id=','.join(video_ids[start:start + 50]),
        ).execute()

        for video in response['items']:
            snippet = video['snippet']
            # Individual counters can be absent from the statistics part (for
            # example likes that are hidden or comments that are disabled);
            # .get() records None instead of failing the whole batch.
            statistics = video.get('statistics', {})

            video_stats = dict(
                Channel=channel,
                Subscribers=subscribers,
                ChannelImg=ChannelImg,
                Title=snippet['title'],
                Video_id=video['id'],
                Published_time_UTC=snippet['publishedAt'],
                VideoImg=snippet['thumbnails']['default']['url'],
                Duration=video['contentDetails']['duration'],
                Views=statistics.get('viewCount'),
                Likes=statistics.get('likeCount'),
                Favorites=statistics.get('favoriteCount'),
                Comments=statistics.get('commentCount'),
            )
            all_video_stats.append(video_stats)

    return all_video_stats
      

In [None]:
## Note on get_video_details:
## the publish time returned by the YouTube API is in the UTC timezone (the timestamp ends with a "Z").

In [None]:
## Loop through all the PlaylistIDs of the Channels to get the entire list of Video Ids. 

def get_video_data(channel_data, ytdata):
    """Collect the details of every uploaded video for every channel.

    For each row of ``channel_data`` the uploads playlist is expanded into
    video ids with ``get_video_ids`` and those ids are looked up with
    ``get_video_details``.

    Parameters
    ----------
    channel_data : pandas.DataFrame
        One row per channel; the columns ``PlaylistID``, ``Channel``,
        ``Subscribers`` and ``ChannelImg`` are read.
    ytdata : object
        API client passed through to the two helper functions.

    Returns
    -------
    pandas.DataFrame
        One row per video across all channels, with a fresh 0..n-1 index.
        Empty when ``channel_data`` has no rows.
    """
    # Collect plain records and build the frame once. Concatenating onto a
    # growing DataFrame inside the loop re-copies all earlier rows on every
    # iteration and starts from an empty frame, which newer pandas warns about.
    video_records = []

    for _, row in channel_data.iterrows():
        # Get video IDs for the current channel's uploads playlist
        video_ids = get_video_ids(ytdata, row["PlaylistID"])

        # Get details for each video, tagged with the channel-level values
        video_records.extend(
            get_video_details(
                ytdata,
                video_ids,
                row["Channel"],
                row['Subscribers'],
                row['ChannelImg'],
            )
        )

    return pd.DataFrame(video_records)

# for i in range(len(channel_data)):
#     playlistid1 = channel_data.loc[i,"PlaylistID"]
#     channel = channel_data.loc[i,"Channel"]
#     video_ids = get_video_ids(ytdata, playlistid1)
#     video_details = get_video_details(ytdata, video_ids, channel)
#     video_data = pd.concat([video_data, pd.DataFrame(video_details)], ignore_index=True)
    

In [None]:
#video_details = get_video_details(ytdata, video_ids, channel)

In [None]:
# Build the per-video DataFrame for every channel listed in channel_data.
video_data = get_video_data(channel_data=channel_data, ytdata=ytdata)

In [None]:
## The Published time returned by the Youtube API is in UTC timezone. (there is a Z at the end of the time)


# Parse the publish timestamps returned by the API (UTC, trailing "Z").
video_data['Published_datetime_UTC'] = pd.to_datetime(video_data['Published_time_UTC'])

# Convert to local (US Central) time, then drop the timezone with tz_localize(None)
# so the column holds plain local timestamps without the +/-hh:mm offset.
video_data['Published_datetime'] = (
    video_data['Published_datetime_UTC']
    .dt.tz_convert('America/Chicago')
    .dt.tz_localize(None)
)

video_data['Published_date'] = video_data['Published_datetime'].dt.date

# Counts arrive as strings; convert them to numeric dtypes.
for count_col in ['Subscribers', 'Views', 'Likes', 'Favorites', 'Comments']:
    video_data[count_col] = pd.to_numeric(video_data[count_col])

# Duration is an ISO-8601 string such as PT1H2M3S. Videos lasting a day or more
# are reported as P#DT#H#M#S, which a pattern starting with a literal 'PT' never
# matches (every part would silently become 0). Capture groups, in order:
#   1. days    - digits before 'D'
#   2. hours   - digits before 'H'
#   3. minutes - digits before 'M'
#   4. seconds - digits before 'S'
duration_parts = video_data['Duration'].str.extract(
    r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)
duration_parts.columns = ['Days', 'Hours', 'Minutes', 'Seconds']

# Convert the extracted parts to integers, using 0 where a part is absent (NaN)
for part in duration_parts.columns:
    duration_parts[part] = pd.to_numeric(duration_parts[part], errors='coerce').fillna(0).astype(int)

# Whole days are folded into Hours so the existing three columns stay sufficient.
video_data['Hours'] = duration_parts['Days'] * 24 + duration_parts['Hours']
video_data['Minutes'] = duration_parts['Minutes']
video_data['Seconds'] = duration_parts['Seconds']
video_data

In [None]:
# select specific columns to be used as reference in Visualizations

select_cols = ['Channel', 'Published_date']

# Static list containing the hours of the day (0-23); only needed for the
# time-of-day visualization.
houroftheday = list(range(0, 24))

# Distinct (Channel, Year) combinations for the SQLBI channel.
sqlbi_channel_years = (
    video_data.loc[video_data['Channel'] == 'SQLBI', select_cols]
    .assign(Year=lambda frame: pd.to_datetime(frame['Published_date']).dt.year)
    .drop(columns=['Published_date'])
    .drop_duplicates()
)

# Repeat every (Channel, Year) row once per hour of the day.
video_data_ref = pd.concat(
    [sqlbi_channel_years.assign(HouroftheDay=hour) for hour in houroftheday],
    ignore_index=True,
)

video_data_ref

In [None]:
## folder to which the files will be written to
# Change the process-wide working directory to the sibling "Data" folder and
# print the resolved location. The to_csv calls in the following cells use
# bare file names, so they write relative to this directory.
os.chdir("../Data")
print(os.getcwd()) 

In [None]:
# Write the video details and the hour-of-day reference table to the current
# working directory, overwriting any existing files.
for export_frame, export_name in (
    (video_data, "YTVideoDetails.csv"),
    (video_data_ref, "HourOfDay.txt"),
):
    export_frame.to_csv(export_name, mode='w', index=False)

In [None]:
# stopwords in the nltk library helps to identify the commonly used words like a, the, is, was etc
# The Title of each video is parsed and these commonly used words are removed from the title
# This is then used to identify the text/subject that is most often used in this channel's video topics

# Load the English stop-word list. The corpus is downloaded only when it is
# not installed yet (nltk signals a missing resource with LookupError), so
# re-running this cell does not contact the network every time.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_common_words(text):
    """Return ``text`` with stop words removed.

    The text is split on whitespace and every token whose lower-cased form is
    in the module-level ``stop_words`` set is dropped; the remaining tokens
    are joined with single spaces. Punctuation is left attached to the
    tokens. A missing value (NaN/None) yields an empty string.
    """
    if pd.isna(text):
        return ""

    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [None]:
# Filter channels where word cloud analysis will be used
is_word_cloud_channel = video_data['Channel'] == 'SQLBI'
wc_video_data = video_data.loc[is_word_cloud_channel]

In [None]:
# Keep only the columns needed for the word cloud. The explicit copy makes this
# an independent frame, so adding Cleaned_Title below does not write into a
# slice of video_data (which triggers SettingWithCopyWarning).
wc_video_data = wc_video_data[['Channel', 'Video_id', 'Title', 'Published_datetime']].copy()

# Remove stop words before exploding, and put a space in front of every '#'
# so that hashtags glued to the preceding word become separate tokens.
wc_video_data['Cleaned_Title'] = (
    wc_video_data['Title']
    .apply(remove_common_words)
    .str.replace('#', ' #', regex=False)  # '#' is a literal character, not a pattern
)

# One row per remaining title word.
wc_video_data = (
    wc_video_data
    .assign(Title_words=wc_video_data['Cleaned_Title'].str.split())
    .explode('Title_words')
)

# A word repeated within the same title is kept only once.
wc_video_data = wc_video_data[['Channel', 'Video_id', 'Title', 'Published_datetime', 'Title_words']].drop_duplicates()

wc_video_data

In [None]:
# Write the one-row-per-title-word table to the current working directory,
# overwriting any existing file.
wc_video_data.to_csv(
    "YTVideoTitleWordCloud.csv",
    index=False,
    mode='w',
)