# Get Data

Before getting started, ensure that you have installed the dependencies listed in requirements.txt with

`pip install -r requirements.txt`

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.request
import json
import pandas as pd


## Function definitions

In [15]:
# This function takes the ID of a YouTube video and returns a list of tags.
def get_youtube_tags(ID):
    ''' Fetch the watch page of a YouTube video and collect its tags.
    (Input): Video ID (String). This is the string that appears at the end of the URL of the video.
    (Output): A list of tags (Strings), read from the page's
              <meta property="og:video:tag"> elements. Empty if the page has none.'''

    page = requests.get("https://www.youtube.com/watch?v=" + ID)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Every tag is exposed as the "content" attribute of its own meta element.
    tag_elements = soup.find_all('meta', attrs={'property': 'og:video:tag'})
    return [element['content'] for element in tag_elements]


In [27]:
# Function to return all channel data from a channel id
def get_channel_data(CHANNEL_ID):
    """Collect the search results for every video on a YouTube channel.

    Pages through the YouTube Data API search endpoint (25 results per page,
    ordered by date) and keeps only the items whose kind is "youtube#video".

    (Input): Channel ID (String). This is the string that appears at the end of the URL of the channel.
    (Output): List of Dicts. Each dict is one search result item for a video on the channel.
    (Raises): Errors raised by urllib.request.urlopen are not caught here, so the
              caller can react to them (the main driver expects a 403 once the
              API quota is used up).
    (Note): This function requires a Google API key. Replace 'YOUR_API_KEY' with your API key as a string"""

    API_KEY = YOUR_API_KEY  # Replace with your API key

    BASE_SEARCH_URL = 'https://www.googleapis.com/youtube/v3/search?'

    FIRST_URL = BASE_SEARCH_URL + \
        'key={}&channelId={}&part=snippet,id&order=date&maxResults=25'.format(
            API_KEY, CHANNEL_ID)

    channel_data = []
    url = FIRST_URL

    while True:
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(url) as response:
            resp = json.load(response)

        channel_data.extend(
            item for item in resp['items']
            if item['id']['kind'] == "youtube#video")

        # A missing token marks the last page. Looking it up explicitly
        # (instead of a bare except) keeps unrelated errors visible.
        next_page_token = resp.get('nextPageToken')
        if not next_page_token:
            break
        url = FIRST_URL + '&pageToken={}'.format(next_page_token)

    return channel_data


In [5]:
# Function to extract the information that we want from the channel/video data.
def soup_to_df(CHANNEL_DATA, DF):
    """Append one row per new, tagged video in the channel data to the DataFrame.

    A video is skipped when its ID is already present in DF['VIDEO_ID'] or
    when get_youtube_tags returns no tags for it.

    (Input): Channel data (List of Dicts), and the DataFrame to append to (videos_df).
             Each row is written as [channel id, channel name, video id,
             video title, video tags] - assumes DF's columns are in that order.
    (output): Updates the DataFrame in place, no return."""

    # Build the set of known video IDs once instead of re-scanning the
    # column for every video.
    seen_ids = set(DF['VIDEO_ID'])

    for item in CHANNEL_DATA:
        video_id = item['id']['videoId']

        # Check for duplicates BEFORE fetching tags: the tag lookup is an
        # HTTP request and is wasted on videos that are already stored.
        if video_id in seen_ids:
            continue

        video_tags = get_youtube_tags(video_id)
        if not video_tags:
            continue

        snippet = item['snippet']
        DF.loc[len(DF.index)] = [snippet['channelId'], snippet['channelTitle'],
                                 video_id, snippet['title'], video_tags]
        seen_ids.add(video_id)


In [4]:
# Load a previously saved DataFrame from a pickle file.
def load_df(filename):
    """Read a pickled DataFrame from disk.
    (Input): Path of the pickle file (String).
    (Output): The object stored in the file (a DataFrame when written by save_df).
    (Note): The file must already exist; nothing is created when it is missing.
            Pickle files can execute code when loaded, so only load trusted files."""
    return pd.read_pickle(filename)


# Save the dataframe to a pickle file
def save_df(DF, filename):
    """Write the DataFrame to disk in pickle format.
    (Input): The DataFrame to store, and the path of the file to write (String).
    (Output): No return; the file at the given path is written."""
    pd.to_pickle(DF, filename)


In [32]:
# Function used to print a progress check on the channels DataFrame
def channel_counts(df):
    """Print how many channels are logged and how many remain.
    (Input): DataFrame with a 'LOGGED' column of True/False values.
    (Output): No return; prints the two counts."""
    logged = df['LOGGED']

    # Count by comparison rather than value_counts()[True] / [False]: the
    # lookup raises KeyError when one of the two values does not occur
    # (for example once every channel has been logged).
    count_true = int(logged.eq(True).sum())
    count_false = int(logged.eq(False).sum())

    print(count_true, ' channels logged')
    print(count_false, ' channels remaining')

## Main driver

In [7]:
# Load the working data: the list of channels to process (channels_df) and the
# video data collected so far (videos_df).
# Only the automotive channels are loaded due to time limitations in the project.
# Future work: finish collecting the auto channel data, then collect the other categories.
channels_df = load_df('Data_In/auto_channels_list.pkl')
videos_df = load_df('Data_Out/auto_channels_data.pkl')


In [None]:
# Main driver
# 1. Find the next unlogged channel in the channel DF.
# 2. Get the channel data and send it to the soup_to_df function.
# 3. Once a channel is complete, mark it as logged and print the channel name.
# 4. Once the Google API limit is reached, a 403 error will be thrown, which ends the run.
# 5. Always save the progress (also when every channel was processed without
#    an error), then call the channel_counts function for a progress check.

try:
    for row in channels_df.itertuples():
        if row.LOGGED:
            continue
        soup_to_df(get_channel_data(row.CHANNEL_ID), videos_df)
        # Single .loc assignment: chained indexing (df['LOGGED'][idx] = ...)
        # may write to a temporary copy and leave channels_df unchanged.
        channels_df.loc[row.Index, 'LOGGED'] = True
        print(row.CHANNEL_NAME + ' logged')
except Exception as err:
    # Expected when the API quota runs out; report it instead of hiding it.
    print('Stopped early: {!r}'.format(err))
finally:
    # Runs on success, on errors and on a manual interrupt, so progress is
    # never lost. The channel list is written back to the file it was loaded
    # from ('Data_In', matching the load path).
    save_df(videos_df, 'Data_Out/auto_channels_data.pkl')
    save_df(channels_df, 'Data_In/auto_channels_list.pkl')
    channel_counts(channels_df)