## Scraping video data using YouTube API

#### Set-up

Run the following commands in your terminal:
``` bash
pip install google-api-python-client
pip install --upgrade google-api-python-client
```

In [69]:
from googleapiclient.discovery import build
from IPython.display import JSON
from pprint import pprint
import pandas as pd
import json

In [70]:
# Location of the JSON credentials file, relative to this notebook
credentials_file_path = "../credentials.json"

# Read the file contents as text, then parse the JSON into `credentials`
with open(credentials_file_path, "r") as f:
    credentials = json.loads(f.read())

In [71]:
# Build the YouTube Data API v3 service object, authenticated with the
# 'youtube_api_key' entry of the loaded credentials
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=credentials['youtube_api_key'],
)

## Get a list of music videos to analyse


In [72]:
# Create the request object


def youtube_mv_search(max_results: int, client=None):
    """Search YouTube for "official music video" results ordered by view count.

    Sends a single ``search.list`` request (region "US", type "video",
    videoCategoryId "10") and flattens every returned item into a plain dict.

    Args:
        max_results: Number of videos to ask for. ``search.list`` accepts
            values from 0 to 50, so the request is clamped to that range and
            at most 50 videos are returned by one call.
        client: Optional API client exposing ``search().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` service object.

    Returns:
        A tuple ``(search_data, video_id)``: ``search_data`` is a list of
        dicts with the keys ``video_id``, ``title``, ``channel_id``,
        ``channel_title`` and ``description``; ``video_id`` is the list of
        video ids in the same order. Both are empty when nothing is returned.
    """
    api = youtube if client is None else client
    max_page_size = 50  # documented upper bound of search.list maxResults

    youtube_search_request = api.search().list(
        part="snippet",
        maxResults=max(0, min(max_results, max_page_size)),
        q="official music video",
        regionCode="US",
        type="video",
        videoCategoryId="10",
        order="viewCount",
        fields="items(id/videoId,snippet(channelId,channelTitle,description,title)),nextPageToken,pageInfo,prevPageToken,regionCode"
    )

    # Execute the request and get the response
    youtube_search_response = youtube_search_request.execute()

    search_data = []
    # The ids are collected in a list created *before* the loop. A plain
    # `video_id = item['id']['videoId']` inside the loop is overwritten on
    # every iteration, so only the last id would survive; appending keeps all.
    video_id = []

    # `.get` keeps an empty/absent 'items' entry from raising a KeyError
    for item in youtube_search_response.get('items', []):
        snippet = item['snippet']
        current_id = item['id']['videoId']

        # one flat record per video, ready to be turned into a dataframe
        search_data.append({
            'video_id': current_id,
            'title': snippet['title'],
            'channel_id': snippet['channelId'],
            'channel_title': snippet['channelTitle'],
            'description': snippet['description']
        })
        video_id.append(current_id)

    return search_data, video_id


# A single search.list request returns at most 50 results (asking for 100
# still yielded the 50 ids shown below), so request exactly what is fetched.
mv_search_data, video_id = youtube_mv_search(50)

video_id
 



['kJQP7kiw5Fk',
 'JGwWNGJdvx8',
 'RgKAFK5djSk',
 'OPf0YbXqDm0',
 '09R8_2nJtjg',
 'hT_nvWreIhg',
 'CevxZvSJLk8',
 'pRpeEdMmmQ0',
 'lp-EO5I60KA',
 '0KSOMA3QBU0',
 '2Vv-BfVoq4g',
 '60ItHLz5WEA',
 'aJOTlE1K90k',
 'YqeW9_5kURI',
 'nfWlot6h_JM',
 'e-ORhEE9VVg',
 'YQHsXMglC9A',
 'kffacxfA7G4',
 'k2qgadSvNyU',
 'papuvlVeZg8',
 'pXRviuL6vMY',
 'fLexgOxsZu0',
 '2vjPBrBU-TM',
 '7wtfhZwyrcc',
 'PIh2xe4jnpk',
 'rYEDA3JcQqw',
 'rtOvBOTyX00',
 '34Na4j8AVgA',
 'lWA2pjMjpBs',
 'YBHQbu5rbdQ',
 'PMivT7MJ41M',
 'IHNzOHi8sJs',
 'hLQl3WQQoQ0',
 'q0hyYWKXF0Q',
 'fKopy74weus',
 'YykjpeuMNEk',
 '5qm8PH4xAss',
 'LjhCEhWiKXk',
 'YVkUvmDQ3HY',
 '1G4isv_Fylg',
 'L3wKzyIN1yk',
 'hTWKbfoikeg',
 'gdZLi9oWNZg',
 'NGLxoKOvzu4',
 'qrO4YZeyl0I',
 'XsX3ATc3FbA',
 'Pkh8UtuejGw',
 'CTFtOOh47oo',
 'UqyT8IEBkvY',
 'nCkpzqqog4k']

## From the list of music videos, get statistics on each video


In [78]:

def get_mv_stats(video_ids=None, client=None):
    """Fetch like, view and comment counts for a list of videos.

    Calls ``videos.list`` with ``part="statistics"`` in batches, because a
    single request accepts a limited number of ids (50 per the API docs).

    Args:
        video_ids: Iterable of video ids (or a single id string). Defaults to
            the notebook-level ``video_id`` list produced by the search step.
        client: Optional API client exposing ``videos().list(...).execute()``.
            Defaults to the notebook-level ``youtube`` service object.

    Returns:
        A list of dicts with the keys ``like_count``, ``view_count`` and
        ``comment_count``, one per item returned by the API. Values are
        whatever the API returns (strings in the recorded output); a count
        missing from the response is reported as ``None``.
    """
    api = youtube if client is None else client
    ids = video_id if video_ids is None else video_ids
    if isinstance(ids, str):
        ids = [ids]
    ids = list(ids)

    batch_size = 50  # maximum number of ids per videos.list request
    mv_data = []

    for start in range(0, len(ids), batch_size):
        video_request = api.videos().list(
            part="statistics",
            id=ids[start:start + batch_size]
        )
        video_response = video_request.execute()

        for item in video_response.get('items', []):
            # Some videos omit likeCount / commentCount (presumably when the
            # owner hides likes or disables comments), so avoid direct indexing.
            statistics = item.get('statistics', {})

            # one flat record per video, ready to be turned into a dataframe
            mv_data.append({
                'like_count': statistics.get('likeCount'),
                'view_count': statistics.get('viewCount'),
                'comment_count': statistics.get('commentCount')
            })

    return mv_data


In [79]:
mv_stats = get_mv_stats()
mv_stats

[{'like_count': '52529155',
  'view_count': '8326377906',
  'comment_count': '4227763'},
 {'like_count': '32315795',
  'view_count': '6147983351',
  'comment_count': '1146007'},
 {'like_count': '42253455',
  'view_count': '6107224160',
  'comment_count': '2176847'},
 {'like_count': '21082365',
  'view_count': '5097209667',
  'comment_count': '605099'},
 {'like_count': '15904563',
  'view_count': '3972031256',
  'comment_count': '418253'},
 {'like_count': '17487076',
  'view_count': '3926338554',
  'comment_count': '488572'},
 {'like_count': '16593041',
  'view_count': '3917768529',
  'comment_count': '774275'},
 {'like_count': '21931186',
  'view_count': '3794414517',
  'comment_count': '1314100'},
 {'like_count': '14909884',
  'view_count': '3699231129',
  'comment_count': '372087'},
 {'like_count': '17279907',
  'view_count': '3634618694',
  'comment_count': '801277'},
 {'like_count': '20492901',
  'view_count': '3610215968',
  'comment_count': '508190'},
 {'like_count': '27559989',


In [None]:
# Ruikai, you can ignore this part; I haven't figured it out yet
# mv_df[''] = mv_df.apply(lambda row: generate_song_url(row['Artist'], row['Title']), axis=1)