# Exploratory Data Analysing Using Youtube Video Data from Music Channels

In [8]:
import pandas as pd
import numpy as np
from dateutil import parser


# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build

In [9]:
api_key = 'AIzaSyAU_S3vpBrLcrCTHLDT4oq7fdwseGigAfM' 

channel_ids = [
    'UCq-Fj5jknLsUf-MWSy4_brA',
    'UCStaiwu-FAgp_RC_tBiLh9A',
    'UC507uEmHkM4ZRoEYswIqMFQ',
    'UCxtB5XgpGdeDtmLLtC4oI3Q',
    'UCpPwodiYc4ceaqEBB54trHQ',
    'UCe4LM_eKc9ywRmVuBm5pjQg',
    'UCudKvbd6gvbm5UCYRk5tZKA',
    'UC4eYXhJI4-7wSWc8UNRwD4A',
    'UCa10nxShhzNrCE1o2ZOPztg',
    'UCM1VesJtJ9vTXcMLLr_FfdQ'
]

youtube = build('youtube', 'v3', developerKey=api_key)

In [10]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

### Get channel statistics

Using the `get_channel_stats` function defined below, now we are going to obtain the channel statistics for the 9 channels in scope.

In [11]:
channel_data = get_channel_stats(youtube, channel_ids)

Now I can print out the data and take a look at the channel statistics overview.

In [12]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,HollywoodRecordsVEVO,1120000,3439871872,280,UUpPwodiYc4ceaqEBB54trHQ
1,Atlantic Records,14800000,13765827154,1975,UUe4LM_eKc9ywRmVuBm5pjQg
2,Warner Music India,335000,179206015,1275,UU507uEmHkM4ZRoEYswIqMFQ
3,TheSoundYouNeed,5530000,3016118512,474,UUudKvbd6gvbm5UCYRk5tZKA
4,Sony Music Entertainment,43700,28286,16,UUxtB5XgpGdeDtmLLtC4oI3Q
5,NPR Music,9730000,3183695635,2551,UU4eYXhJI4-7wSWc8UNRwD4A
6,YouTube Music,1530000,165536989,64,UUStaiwu-FAgp_RC_tBiLh9A
7,T-Series,271000000,262888132178,21494,UUq-Fj5jknLsUf-MWSy4_brA
8,Trap Nation,30300000,14521116169,2898,UUa10nxShhzNrCE1o2ZOPztg
9,Coke Studio Pakistan,15500000,4810209820,716,UUM1VesJtJ9vTXcMLLr_FfdQ


In [19]:
output_file = 'channel_statistics.xlsx'
channel_data.to_excel(output_file, index=False, engine='openpyxl')

### Get video statistics for all the channels

In [20]:
import pandas as pd

# Assuming get_video_ids and get_video_details are defined

video_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    print(f"Getting video information from channel: {c}")
    playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    print(f"Video IDs: {video_ids}")  # Debug: Print video IDs
    
    if video_ids:
        # Get video data
        video_data = get_video_details(youtube, video_ids)
        
        # Print only the first item of the data if it's a list or a snippet of the dictionary
        if isinstance(video_data, list) and video_data:
            print(f"Sample Video Data: {video_data[0]}")  # Print first item of list for debugging
            data_frame = pd.DataFrame(video_data)
        elif isinstance(video_data, dict) and video_data:
            print(f"Sample Video Data: {video_data}")  # Print dict for debugging
            data_frame = pd.DataFrame([video_data])
        else:
            print(f"Unexpected data format or empty data: {type(video_data)}")
            continue
        
        # Print DataFrame structure and sample data
        print("DataFrame Columns:", data_frame.columns)
        print("DataFrame Shape:", data_frame.shape)
        print("DataFrame Head (Sample):", data_frame.head())
        
        if not data_frame.empty:
            # Append video data to video_df
            video_df = pd.concat([video_df, data_frame], ignore_index=True)
        else:
            print("data_frame is empty. Skipping concatenation.")
    else:
        print("No video IDs found for the given playlist ID.")

# After loop
print("Columns in final video_df:", video_df.columns)
print("DataFrame Shape after concatenation:", video_df.shape)
print("DataFrame Head after concatenation (Sample):", video_df.head())


Getting video information from channel: HollywoodRecordsVEVO
Video IDs: ['WrxrjfcznZg', 'vmiIl45tzWo', '-Qe1QHxGY5w', 'qTzOlky5AIM', 'kpQoNzzzP9A', 'gO_1zd3MflE', 'YI0CmUBplS4', 'JruhYHLpy2I', 'RZRhOELGccM', 'jnH6jRfsshI', 'RE37xDELAMs', 'Ast7PSI8wg0', 'p9PIasSce5c', 'hsy5tt1Hlj8', 'm5WNeDK3xSo', 'hy4VUod2VJw', 'Ec6eWd6R5JA', 'S03TJUhmtwM', '4OyGnqp9g24', 'K1aLrvCDFi4', '8JZg0SCnT_A', 'N0c64InqIbU', '5shsh6Hfp44', 'gpl2ukCZ16c', 'Lr3giSWTYno', '6NAQbNa37lQ', 'BPXx82k0y2I', '_qwJwjxgTzI', 'AXqNMRGVCf4', '8qG4P75acLo', 'zhqdLjNSFyE', 'fUhHxBlRpWU', 'EH6EnXZXxMw', 'hiUZsoL3h-o', 'zJWdmFQSFXo', 'NGgMLCYl2JM', '76_gE-Uh6Mo', '7SalDXjDKsY', '6DTgeEfw2G0', 'QBvH5j5m6U4', 'UuKyfBKby58', 'VZHvBFQfoT0', 'IdYDKFrEG1k', 'X7T6aRHhLzE', '77G7fqzBGvo', '2t_xRByJn0w', 'sK5RxQtCRFw', 'cqt0kDW3XaU', 'lE_BxVXzc34', 'W6j9dJ2nKbw', 'qcYVIkOGC7I', 'BZcgXIpMoqI', 'sHCZ0XoZsxI', 'QszoV30F1yM', '3lzHlDTzk8A', 'dWOEW7kVVxE', '_giX0Cw66P4', 'STJG7b8FGE8', '8-pKuID8I9k', 'i0zPBWxJqUQ', 'SeJXGFl-cn0', '73r8zIP0Lyc

In [33]:
print(f"Video Data Type: {type(video_data)}")
print(f"Sample Video Data: {video_data[:5]}")


Video Data Type: <class 'pandas.core.frame.DataFrame'>
Sample Video Data:       video_id          channelTitle  \
0  OOkqBZDVvhQ  Coke Studio Pakistan   
1  mYiACoAF0ng  Coke Studio Pakistan   
2  nMRgN6bDalE  Coke Studio Pakistan   
3  p0yT_oWWtrA  Coke Studio Pakistan   
4  QsB4oMEnJmE  Coke Studio Pakistan   

                                               title  \
0  Tu Jhoom Rendition | Coke Studio Pakistan | Ab...   
1  Magical Journey of Mehmaan | Coke Studio Pakistan   
2  Coke Studio Pakistan | Season 15 | Mehmaan | S...   
3  Coke Studio Pakistan | Season 15 | Mehmaan | S...   
4  Coke Studio Pakistan | Season 15 | Mehmaan | S...   

                                         description  \
0  A tribute to the song that’s touched millions....   
1  The Magical Journey of Mehmaan #Mehmaan\n#Coke...   
2  The set makers talk about their experience \n#...   
3  Nooremah’s Father shares his experience of Meh...   
4  #Mehmaan #CokeStudio15\n#CokeStudioPakistan #S...   

           