In [6]:
import pandas as pd
import json
import re
from typing import List, Optional
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import requests

TODO:
- get all videos on the Neo4j youtube channel
- upload list of videos to gcp bucket
- pull video titles and get transcripts of each
- chunk transcripts
- upload to graph

In [2]:
import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

In [5]:
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

api_service_name = "youtube"
api_version = "v3"
client_secrets_file = os.environ.get("GCP_SERVICE_ACCOUNT_KEY_PATH")
api_key = os.environ.get("YOUTUBE_API_KEY")

# Get credentials and create an API client
flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)
credentials = flow.credentials
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key, credentials=credentials)

request = youtube.search().list(
    part="snippet",
    q="@neo4j"
)
response = request.execute()

print(response['items'])

ValueError: There is no access token for this session, did you call fetch_token?

In [19]:
response['items'][0]

{'kind': 'youtube#searchResult',
 'etag': 'PUwUneO0Y0_0iXRhKY_QcU7u54g',
 'id': {'kind': 'youtube#channel', 'channelId': 'UCvze3hU6OZBkB1vkhH2lH9Q'},
 'snippet': {'publishedAt': '2014-12-03T23:18:57Z',
  'channelId': 'UCvze3hU6OZBkB1vkhH2lH9Q',
  'title': 'Neo4j',
  'description': 'Neo4j is the Graph Database & Analytics leader. This channel features videos by our Developer Relations, Engineering and ...',
  'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/BEHbbtI42GHOisSFC2P0SDkPlBO9eofY_vle616iX0ze76HT3_P40AzE9TjGQH2nxsGaWyey9g=s88-c-k-c0xffffffff-no-rj-mo'},
   'medium': {'url': 'https://yt3.ggpht.com/BEHbbtI42GHOisSFC2P0SDkPlBO9eofY_vle616iX0ze76HT3_P40AzE9TjGQH2nxsGaWyey9g=s240-c-k-c0xffffffff-no-rj-mo'},
   'high': {'url': 'https://yt3.ggpht.com/BEHbbtI42GHOisSFC2P0SDkPlBO9eofY_vle616iX0ze76HT3_P40AzE9TjGQH2nxsGaWyey9g=s800-c-k-c0xffffffff-no-rj-mo'}},
  'channelTitle': 'Neo4j',
  'liveBroadcastContent': 'upcoming',
  'publishTime': '2014-12-03T23:18:57Z'}}

In [21]:
channel_request = youtube.channels().list(
    part='snippet',
    id="UCvze3hU6OZBkB1vkhH2lH9Q" # Neo4j channel id
)

channel_response = channel_request.execute()

print(channel_response)

{'kind': 'youtube#channelListResponse', 'etag': 'sM6Mjxe01RLMdKH0-NW7RFKHa38', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': 'C3w1MrDZesb176j1Xt4dk6DUGOo', 'id': 'UCvze3hU6OZBkB1vkhH2lH9Q', 'snippet': {'title': 'Neo4j', 'description': 'Neo4j is the Graph Database & Analytics leader.  This channel features videos by our Developer Relations, Engineering and Product teams about best practices using Neo4j. Learn more at https://neo4j.com/ If you have technical questions or want to build a local community join https://community.neo4j.com', 'customUrl': '@neo4j', 'publishedAt': '2014-12-03T23:18:57Z', 'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/BEHbbtI42GHOisSFC2P0SDkPlBO9eofY_vle616iX0ze76HT3_P40AzE9TjGQH2nxsGaWyey9g=s88-c-k-c0x00ffffff-no-rj', 'width': 88, 'height': 88}, 'medium': {'url': 'https://yt3.ggpht.com/BEHbbtI42GHOisSFC2P0SDkPlBO9eofY_vle616iX0ze76HT3_P40AzE9TjGQH2nxsGaWyey9g=s240-c-k-c0x00ffffff-no-rj', 'width': 2

In [27]:
videos_request = youtube.search().list(
    part="id",
    channelId="UCvze3hU6OZBkB1vkhH2lH9Q",
    type="video",
    maxResults=50
)

videos_response = videos_request.execute()

In [33]:
videos_response

{'kind': 'youtube#searchListResponse',
 'etag': 'pO2-wbJUXTvOOvRwgU0spVgwxYc',
 'nextPageToken': 'CDIQAA',
 'regionCode': 'US',
 'pageInfo': {'totalResults': 1973, 'resultsPerPage': 50},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'nxlahA9AHrmQu88pjCG-56TrBec',
   'id': {'kind': 'youtube#video', 'videoId': 'urO5FyP9PoI'}},
  {'kind': 'youtube#searchResult',
   'etag': 'f4qSWTBGM9I4Zy2ujR-gT-cvyK0',
   'id': {'kind': 'youtube#video', 'videoId': 'REVkXVxvMQE'}},
  {'kind': 'youtube#searchResult',
   'etag': 'CxjrsQeRbkzD4XK2WC91uegrfEw',
   'id': {'kind': 'youtube#video', 'videoId': 'mMkXnmueasA'}},
  {'kind': 'youtube#searchResult',
   'etag': 'LlzR-X3sD8-sfcnCCgcVLnfxaF8',
   'id': {'kind': 'youtube#video', 'videoId': 'nYQLp7itZx8'}},
  {'kind': 'youtube#searchResult',
   'etag': '4R2co9P2GJjdRx_2Bod-51WfQ94',
   'id': {'kind': 'youtube#video', 'videoId': 'o2yujqUGCTw'}},
  {'kind': 'youtube#searchResult',
   'etag': 'vetKh5-czmh9VSY3pY5ka8_rLSw',
   'id': {'kind': 'youtube#

In [44]:
def get_videos() -> List[str]:

    videos = []
    total_results = 0
    next_page_token = None

    def get_next_videos(next_page_token: str = None, total_results: int = None, videos: List = None) -> None:

        if not next_page_token:

            videos_request = youtube.search().list(
                part="id",
                channelId="UCvze3hU6OZBkB1vkhH2lH9Q",
                type="video",
                maxResults=50
            )
        else:
            videos_request = youtube.search().list(
                part="id",
                channelId="UCvze3hU6OZBkB1vkhH2lH9Q",
                type="video",
                maxResults=50,
                pageToken=next_page_token
            )

        videos_response = videos_request.execute()

        if total_results == 0:
            total_results = videos_response['pageInfo']['totalResults']

        next_page_token = videos_response['nextPageToken']
        print(next_page_token)

        videos += [x['id']['videoId'] for x in videos_response['items']]


    while total_results >= len(videos):

        get_next_videos(next_page_token=next_page_token, total_results=total_results, videos=videos)
        print("total results: ", total_results)
        print("ids retrieved: ", len(videos))

    return videos

In [45]:
videos = get_videos()

total results:  0
ids retrieved:  50


In [56]:
def get_next_videos(next_page_token: str = None, total_results: int = -1, videos: List = []) -> List[str]:

    if total_results == len(videos):
        print("complete")
    
    if not next_page_token:

        videos_request = youtube.search().list(
            part="id",
            channelId="UCvze3hU6OZBkB1vkhH2lH9Q",
            type="video",
            maxResults=50
        )

    else:
        videos_request = youtube.search().list(
            part="id",
            channelId="UCvze3hU6OZBkB1vkhH2lH9Q",
            type="video",
            maxResults=50,
            pageToken=next_page_token
        )

    videos_response = videos_request.execute()

    if total_results == -1:
        total_results = videos_response['pageInfo']['totalResults']
        print('total results set: ', total_results)

    next_page_token = videos_response['nextPageToken']
    # print(next_page_token)

    videos += [x['id']['videoId'] for x in videos_response['items']]

    print("total results: ", total_results)
    print("ids retrieved: ", len(videos), "\n")

    get_next_videos(next_page_token=next_page_token, total_results=total_results, videos=videos)


    return videos

In [57]:
vids = get_next_videos()

total results set:  1983
total results:  1983
ids retrieved:  50 

total results:  1983
ids retrieved:  100 

total results:  1983
ids retrieved:  150 

total results:  1983
ids retrieved:  200 

total results:  1983
ids retrieved:  250 

total results:  1983
ids retrieved:  300 

total results:  1983
ids retrieved:  350 

total results:  1983
ids retrieved:  400 

total results:  1983
ids retrieved:  450 

total results:  1983
ids retrieved:  500 



KeyError: 'nextPageToken'

In [28]:
channel_id = "UCvze3hU6OZBkB1vkhH2lH9Q"
api_key = os.environ.get("YOUTUBE_API_KEY")
uploads_id = "UUvze3hU6OZBkB1vkhH2lH9Q"

In [9]:
address = f"https://www.googleapis.com/youtube/v3/channels?id={channel_id}&key={api_key}&part=contentDetails"
req = requests.get(address)
data = req.json()

In [10]:
data

{'kind': 'youtube#channelListResponse',
 'etag': 'GAjRI1ohn2N7TwCkG6zy3YoEgkg',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'ShFBCyiIcf4MH1THCp3SFC3cjg0',
   'id': 'UCvze3hU6OZBkB1vkhH2lH9Q',
   'contentDetails': {'relatedPlaylists': {'likes': '',
     'uploads': 'UUvze3hU6OZBkB1vkhH2lH9Q'}}}]}

In [15]:
uploads_id = data['items'][0]['contentDetails']['relatedPlaylists']['uploads']
uploads_id

'UUvze3hU6OZBkB1vkhH2lH9Q'

In [29]:
address = f"https://www.googleapis.com/youtube/v3/playlistItems?playlistId={uploads_id}&key={api_key}&part=snippet&maxResults=50"
vid_req = requests.get(address)
vids = vid_req.json()

In [32]:
vids

{'kind': 'youtube#playlistItemListResponse',
 'etag': 'xLN2HUF0l28UphoKOZRhfYbH8dg',
 'nextPageToken': 'EAAaIVBUOkNESWlFRFF4TlRJMVJEWkNSakE0TWtSRlJqZ29BUQ',
 'items': [{'kind': 'youtube#playlistItem',
   'etag': 'AHfzWRLa5mpZFHyj7yDIPPX3RLY',
   'id': 'VVV2emUzaFU2T1pCa0IxdmtoSDJsSDlRLkJQOW5BVFczeV9N',
   'snippet': {'publishedAt': '2023-11-02T13:55:45Z',
    'channelId': 'UCvze3hU6OZBkB1vkhH2lH9Q',
    'title': 'NODES 2023 - Fluffy and Fido on the Go: Applying Graph Data and AI to Hack Pet Travel',
    'description': 'Ever grappled with the difficulties of traveling with your cherished pet? Discovering the perfect location can require considerable research as you scour the web for pet-friendly hotels, restaurants, green spaces, and more. Furthermore, the urgency of finding an available veterinarian nearby in the event of a pet medical emergency can add to the stress.\n\nIn this session, the presenters will guide you on how to leverage publicly-available data to locate pet-friendly acc

In [31]:
len(vids['items'])

50

In [30]:
vids['items'][0]['snippet']['resourceId']['videoId']

'BP9nATW3y_M'

In [38]:


def get_video_addresses(next_page_token: str = None, total_results: int = -1, videos: List[str] = []) -> List[str]:
        
    channel_id = "UCvze3hU6OZBkB1vkhH2lH9Q"
    api_key = os.environ.get("YOUTUBE_API_KEY")
    uploads_id = "UUvze3hU6OZBkB1vkhH2lH9Q"

    address = f"https://www.googleapis.com/youtube/v3/playlistItems?playlistId={uploads_id}&key={api_key}&part=snippet&maxResults=50"
    
    if not next_page_token:
      vid_req = requests.get(address)

    else:
        vid_req = requests.get(address+f'&pageToken={next_page_token}')
        
    vids = vid_req.json()

    if total_results == -1:
        total_results = vids['pageInfo']['totalResults']
        print('total results set: ', total_results)

    videos += [x['snippet']['resourceId']['videoId'] for x in vids['items']]

    print("total results: ", total_results)
    print("ids retrieved: ", len(videos), "\n")

    if "nextPageToken" not in vids.keys():
        print("complete")
        return videos
    
    next_page_token = vids['nextPageToken']

    get_video_addresses(next_page_token=next_page_token, total_results=total_results, videos=videos)

    return videos


In [39]:
video_ids = get_video_addresses()

total results set:  1897
total results:  1897
ids retrieved:  50 

total results:  1897
ids retrieved:  100 

total results:  1897
ids retrieved:  150 

total results:  1897
ids retrieved:  200 

total results:  1897
ids retrieved:  250 

total results:  1897
ids retrieved:  300 

total results:  1897
ids retrieved:  350 

total results:  1897
ids retrieved:  400 

total results:  1897
ids retrieved:  450 

total results:  1897
ids retrieved:  500 

total results:  1897
ids retrieved:  550 

total results:  1897
ids retrieved:  600 

total results:  1897
ids retrieved:  650 

total results:  1897
ids retrieved:  700 

total results:  1897
ids retrieved:  750 

total results:  1897
ids retrieved:  800 

total results:  1897
ids retrieved:  850 

total results:  1897
ids retrieved:  900 

total results:  1897
ids retrieved:  950 

total results:  1897
ids retrieved:  1000 

total results:  1897
ids retrieved:  1050 

total results:  1897
ids retrieved:  1100 

total results:  1897
ids re

In [42]:
video_ids[:10]

['BP9nATW3y_M',
 'MQTVDGOiWBY',
 'vvn1WUgqxqM',
 '70V9TIYvqrw',
 'pQWEgpNwY2w',
 'DQ-SShCGVzs',
 'Znd4k64JRFc',
 '9DxwgIKVSHY',
 'm51Dtppb2h0',
 '_cC2PoOJHQc']

In [43]:
full_video_addresses = ["https://www.youtube.com/watch?v="+id for id in video_ids]

In [44]:
full_video_addresses[:10]

['https://www.youtube.com/watch?v=BP9nATW3y_M',
 'https://www.youtube.com/watch?v=MQTVDGOiWBY',
 'https://www.youtube.com/watch?v=vvn1WUgqxqM',
 'https://www.youtube.com/watch?v=70V9TIYvqrw',
 'https://www.youtube.com/watch?v=pQWEgpNwY2w',
 'https://www.youtube.com/watch?v=DQ-SShCGVzs',
 'https://www.youtube.com/watch?v=Znd4k64JRFc',
 'https://www.youtube.com/watch?v=9DxwgIKVSHY',
 'https://www.youtube.com/watch?v=m51Dtppb2h0',
 'https://www.youtube.com/watch?v=_cC2PoOJHQc']

In [46]:
df = pd.DataFrame({"YouTube_Address": full_video_addresses})

In [47]:
df.to_csv("YouTube_Addresses.csv")