## Scraping video data using YouTube API

#### Set-up

Run the following commands in your terminal:
``` bash
pip install google-api-python-client
pip install --upgrade google-api-python-client
```

In [71]:
from googleapiclient.discovery import build
from IPython.display import JSON
from pprint import pprint
import pandas as pd
import json

In [72]:
# JSON credentials file, located one directory above the working directory
credentials_file_path = "../credentials.json"

# read the whole file and parse its JSON content into `credentials`
with open(credentials_file_path, mode="r") as credentials_file:
    credentials = json.loads(credentials_file.read())

In [73]:
# service object for version 3 of the YouTube API, authenticated with the
# developer key stored under 'youtube_api_key' in the credentials file
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=credentials['youtube_api_key'],
)

## Getting a list of music videos so that we have a list of videos to analyse


In [78]:
# Create the request object


def youtube_search(max_results:int, query:str, region:str, searchtype:str, category:int, orderfilter: str):
    """Search YouTube and collect basic metadata for up to ``max_results`` hits.

    The search endpoint returns a limited number of items per request (50 at
    most), so the results are gathered page by page using ``nextPageToken``
    until ``max_results`` videos have been collected or no further page exists.

    Uses the module-level ``youtube`` service object.

    Args:
        max_results: total number of videos to collect across all pages.
        query: search term, sent as ``q``.
        region: region code, sent as ``regionCode``.
        searchtype: resource type to search for, sent as ``type``.
        category: video category id, sent as ``videoCategoryId``.
        orderfilter: sort order, sent as ``order``.

    Returns:
        A tuple ``(search_data, video_ids)``: ``search_data`` is a list of
        dictionaries with the keys ``video_id``, ``title``, ``channel_id``,
        ``channel_title`` and ``description``; ``video_ids`` is the list of
        video ids in the same order.
    """
    page_size_cap = 50  # largest maxResults value accepted per search request

    search_data = []
    video_ids = []
    page_token = None

    while len(video_ids) < max_results:
        request_kwargs = dict(
            part="snippet",
            # never ask for more than is still missing, nor more than one page can hold
            maxResults=min(page_size_cap, max_results - len(video_ids)),
            q=query,
            regionCode=region,
            type=searchtype,
            videoCategoryId=category,
            order=orderfilter,
            fields="items(id/videoId,snippet(channelId,channelTitle,description,title)),nextPageToken,pageInfo,prevPageToken,regionCode",
        )
        if page_token:
            request_kwargs["pageToken"] = page_token

        # Execute the request and get the response for this page
        youtube_search_response = youtube.search().list(**request_kwargs).execute()
        items = youtube_search_response.get('items', [])

        # iterate through each element in the nested dictionary to get the relevant values of each video
        for item in items:
            if len(video_ids) >= max_results:
                break

            # results that are not videos carry no videoId; skip them instead of failing
            video_id = item.get('id', {}).get('videoId')
            if video_id is None:
                continue

            snippet = item['snippet']

            # append the relevant values to the list of records to save as a dataframe
            search_data.append({
                'video_id': video_id,
                'title': snippet['title'],
                'channel_id': snippet['channelId'],
                'channel_title': snippet['channelTitle'],
                'description': snippet['description'],
            })

            # the video_ids list will be used in the second part of the code involving the .videos() method
            video_ids.append(video_id)

        # stop when the API reports no further page (or returned an empty page)
        page_token = youtube_search_response.get('nextPageToken')
        if not page_token or not items:
            break

    return search_data, video_ids



In [80]:
# collect the search records and the matching list of video ids
youtube_search_data, mv_video_id = youtube_search(
    max_results=100,
    query="official music video",
    region="US",
    searchtype="video",
    category=10,  # presumably YouTube's Music category -- confirm against the API's category list
    orderfilter="viewCount",
)

In [81]:
# write the search records to disk as indented JSON
with open('../data/search_data.json', 'w') as search_out:
    json.dump(obj=youtube_search_data, fp=search_out, indent=4)

# the same records as a dataframe
yt_search_df = pd.DataFrame(data=youtube_search_data)


## From the list of music videos, get statistics on each video


In [76]:

def get_stats(videoId:list):
    """Fetch statistics and topic categories for the given video ids.

    The ids are sent to the ``videos().list`` endpoint in batches, because a
    single request only accepts a limited number of ids (50 is the commonly
    documented ceiling). Fields that the API leaves out of a video's response
    (for example a hidden like count or disabled comments) do not raise; they
    are recorded as ``None`` (counts) or ``[]`` (categories).

    Uses the module-level ``youtube`` service object.

    Args:
        videoId: list of video ids; a single id string is also accepted.

    Returns:
        A list of dictionaries with the keys ``video_id``, ``like_count``,
        ``view_count``, ``comment_count`` and ``wikipedia_category``, one per
        video returned by the API.
    """
    ids_per_request = 50  # keep each request within the per-call id limit

    # accept a bare id string as well as any iterable of ids
    if isinstance(videoId, str):
        all_ids = [videoId]
    else:
        all_ids = list(videoId)

    video_data = []

    for start in range(0, len(all_ids), ids_per_request):
        batch = all_ids[start:start + ids_per_request]

        # create the request object for this batch; ids are sent comma-separated
        video_request = youtube.videos().list(
            part="statistics,id,topicDetails",
            id=",".join(batch),
        )
        video_response = video_request.execute()

        # iterate through each element in the nested dictionary to get the relevant values
        for item in video_response.get('items', []):
            statistics = item.get('statistics', {})
            topic_details = item.get('topicDetails', {})

            # append the relevant values to the list of records to save as a dataframe
            video_data.append({
                'video_id': item['id'],
                'like_count': statistics.get('likeCount'),
                'view_count': statistics.get('viewCount'),
                'comment_count': statistics.get('commentCount'),
                # this output gives us a few wikipedia links of genres that we will have to clean up to get the genre
                'wikipedia_category': topic_details.get('topicCategories', []),
            })

    return video_data


In [77]:
# look up the statistics for every video id returned by the search
mv_video_stats = get_stats(videoId=mv_video_id)

In [82]:
# write the per-video statistics to disk as indented JSON
with open('../data/mv_stats_data.json', 'w') as stats_out:
    json.dump(obj=mv_video_stats, fp=stats_out, indent=4)

# the same records as a dataframe
mv_videostats_df = pd.DataFrame(data=mv_video_stats)



In [83]:
# join the search records with their statistics on the shared video_id column;
# only videos present in both dataframes are kept (inner join)
searchandstats_merged_df = yt_search_df.merge(
    mv_videostats_df,
    how='inner',
    left_on='video_id',
    right_on='video_id',
)