In [1]:
import requests
import json
import polars as pl
import matplotlib.pyplot as plt
from youtube_transcript_api import YouTubeTranscriptApi
from os import getenv
import dotenv
from flask import Flask
dotenv.load_dotenv()
YOUTUBE_API_KEY = getenv("YOUTUBE_API_KEY")


In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import DistanceMetric
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# functions

In [4]:
def getVideoRecords(response: requests.models.Response) -> list:
    """
        Function to extract YouTube video data from GET request response
    """

    video_record_list = []
    
    for raw_item in json.loads(response.text)['items']:
    
        # only execute for youtube videos
        if raw_item['id']['kind'] != "youtube#video":
            continue
        
        video_record = {}
        video_record['video_id'] = raw_item['id']['videoId']
        video_record['datetime'] = raw_item['snippet']['publishedAt']
        video_record['title'] = raw_item['snippet']['title']
        
        video_record_list.append(video_record)

    return video_record_list

In [5]:
def extract_text(transcript: list) -> str:
    """
        Function to extract text from transcript dictionary
    """
    
    text_list = [transcript[i]['text'] for i in range(len(transcript))]
    return ' '.join(text_list)

# get data

In [4]:
# define channel ID
channel_id = 'UCsXVk37bltHxD1rDPwtNM8Q'

# define url for API
url = 'https://www.googleapis.com/youtube/v3/search'

# initialize page token
page_token = None

# intialize list to store video data
video_record_list = []

In [12]:
%%time
# extract video data across multiple search result pages
while page_token != 0:
    # define parameters for API call
    params = {
        "key": YOUTUBE_API_KEY, 
        'channelId': channel_id, 
        'part': ["snippet","id"], 
        'maxResults':50, 
        'order': 'date',
        'pageToken': page_token
    }
    # make get request
    response = requests.get(url, params=params)

    # check for errors
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print("were here")
        break


    # append video records to list
    video_record_list += getVideoRecords(response)

    try:
        # grab next page token
        page_token = json.loads(response.text)['nextPageToken']
    except:
        # if no next page token kill while loop
        page_token = 0

CPU times: total: 46.9 ms
Wall time: 3.37 s


write data to file

In [13]:
# write data to file
pl.DataFrame(video_record_list).write_parquet('data/video-ids.parquet')
pl.DataFrame(video_record_list).write_csv('data/video-ids.csv')

# get Transcripts

In [5]:
df = pl.read_parquet('data/video-ids.parquet')
print(df.head())

shape: (5, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ dIpttD7b8B4 ┆ 2024-09-12T14:00:39Z ┆ Why Do Puddles Disappear but L… │
│ vSSkDos2hzo ┆ 2024-09-12T14:01:11Z ┆ We Need to Rethink Exercise (U… │
│ _kelDJFjhOo ┆ 2024-09-05T14:00:37Z ┆ The Perfect Sofa – According t… │
│ 49ApsH6lzk0 ┆ 2024-08-29T14:00:05Z ┆ This Happens When You Get a Ta… │
│ dBxxi5XAm3U ┆ 2024-08-27T14:00:00Z ┆ We Traveled Back in Time. Now … │
└─────────────┴──────────────────────┴─────────────────────────────────┘


In [6]:
%%time
transcript_text_list = []

for i in range(len(df)):

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(df['video_id'][i])
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    except:
        transcript_text = "n/a"
    
    transcript_text_list.append(transcript_text)

CPU times: total: 5.72 s
Wall time: 4min 21s


In [7]:
# add transcripts to dataframe
df = df.with_columns(pl.Series(name="transcript", values=transcript_text_list))
print(df.head())

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ transcript                   │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ dIpttD7b8B4 ┆ 2024-09-12T14:00:39Z ┆ Why Do Puddles Disappear but ┆ n/a                          │
│             ┆                      ┆ L…                           ┆                              │
│ vSSkDos2hzo ┆ 2024-09-12T14:01:11Z ┆ We Need to Rethink Exercise  ┆ Losing weight is hard and    │
│             ┆                      ┆ (U…                          ┆ unfo…                        │
│ _kelDJFjhOo ┆ 2024-09-05T14:00:37Z ┆ The Perfect Sofa – According ┆ n/a    

In [8]:
# shape + unique values
print("shape:", df.shape)
print("n unique rows:", df.n_unique())
for j in range(df.shape[1]):
    print("n unique elements (" + df.columns[j] + "):", df[:,j].n_unique())

shape: (244, 4)
n unique rows: 242
n unique elements (video_id): 242
n unique elements (datetime): 242
n unique elements (title): 242
n unique elements (transcript): 216


In [10]:
filtered_df = df.filter(~pl.col('title').str.contains("#shorts"))

In [11]:
filtered_df

video_id,datetime,title,transcript
str,str,str,str
"""vSSkDos2hzo""","""2024-09-12T14:01:11Z""","""We Need to Rethink Exercise (U…","""Losing weight is hard and unfo…"
"""dBxxi5XAm3U""","""2024-08-27T14:00:00Z""","""We Traveled Back in Time. Now …","""You’re going forward through t…"
"""cRZOUcpiOxY""","""2024-08-13T14:00:01Z""","""Fever Feels Horrible, but is A…","""Fever feels bad. So we take me…"
"""fa8k8IQ1_X0""","""2024-08-06T14:00:09Z""","""A.I. ‐ Humanity&#39;s Final In…","""humans rule Earth without comp…"
"""GFLb5h2O2Ww""","""2024-06-25T14:00:01Z""","""This Disease is Deadlier Than …","""Hello, Steve here. Today I am …"
…,…,…,…
"""F3QpgXBtDeo""","""2013-11-28T17:03:32Z""","""How The Stock Exchange Works (…","""What is the Stock Exchange an…"
"""UuGrBhK2c7U""","""2013-10-11T19:11:39Z""","""The Gulf Stream Explained""","""The ocean conveyor belt and th…"
"""Uti2niW2BRA""","""2013-09-03T09:12:24Z""","""Fracking explained: opportunit…","""What is hydraulic fracturing –…"
"""KsF_hdjWJjo""","""2013-08-22T13:24:56Z""","""The Solar System -- our home i…","""the solar system our home in s…"


In [12]:
# shape + unique values
print("shape:", filtered_df.shape)
print("n unique rows:", filtered_df.n_unique())
for j in range(filtered_df.shape[1]):
    print("n unique elements (" + filtered_df.columns[j] + "):", filtered_df[:,j].n_unique())

shape: (199, 4)
n unique rows: 197
n unique elements (video_id): 197
n unique elements (datetime): 197
n unique elements (title): 197
n unique elements (transcript): 197


In [13]:
filtered_df.filter(pl.col("transcript") == "n/a")

video_id,datetime,title,transcript
str,str,str,str


In [14]:
# write data to file
filtered_df.write_parquet('data/video-transcripts.parquet')
filtered_df.write_csv('data/video-transcripts.csv')

# validate data

In [15]:
df = pl.read_parquet('data/video-transcripts.parquet')

In [16]:
# change datetime to Datetime dtype
df = df.with_columns(pl.col('datetime').cast(pl.Datetime))
print(df.head())

shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime            ┆ title                         ┆ transcript                   │
│ ---         ┆ ---                 ┆ ---                           ┆ ---                          │
│ str         ┆ datetime[μs]        ┆ str                           ┆ str                          │
╞═════════════╪═════════════════════╪═══════════════════════════════╪══════════════════════════════╡
│ vSSkDos2hzo ┆ 2024-09-12 14:01:11 ┆ We Need to Rethink Exercise   ┆ Losing weight is hard and    │
│             ┆                     ┆ (U…                           ┆ unfo…                        │
│ dBxxi5XAm3U ┆ 2024-08-27 14:00:00 ┆ We Traveled Back in Time. Now ┆ You’re going forward through │
│             ┆                     ┆ …                             ┆ t…                           │
│ cRZOUcpiOxY ┆ 2024-08-13 14:00:01 ┆ Fever Feels Horrible, but is  ┆ Fever f

In [5]:
special_strings = ['&#39;', '&amp;']
special_string_replacements = ["'", "&"]

for i in range(len(special_strings)):
    df = df.with_columns(df['title'].str.replace(special_strings[i], special_string_replacements[i]).alias('title'))
    df = df.with_columns(df['transcript'].str.replace(special_strings[i], special_string_replacements[i]).alias('transcript'))

In [6]:
df.write_parquet('data/video-transcripts.parquet')
df.write_csv('data/video-transcripts.csv')

# load data

In [6]:
df = pl.read_parquet('data/video-transcripts.parquet')

In [7]:
print(df.head())

shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime            ┆ title                         ┆ transcript                   │
│ ---         ┆ ---                 ┆ ---                           ┆ ---                          │
│ str         ┆ datetime[μs]        ┆ str                           ┆ str                          │
╞═════════════╪═════════════════════╪═══════════════════════════════╪══════════════════════════════╡
│ vSSkDos2hzo ┆ 2024-09-12 14:01:11 ┆ We Need to Rethink Exercise   ┆ Losing weight is hard and    │
│             ┆                     ┆ (Upd…                         ┆ unfort…                      │
│ dBxxi5XAm3U ┆ 2024-08-27 14:00:00 ┆ We Traveled Back in Time. Now ┆ You’re going forward through │
│             ┆                     ┆ Ph…                           ┆ tim…                         │
│ cRZOUcpiOxY ┆ 2024-08-13 14:00:01 ┆ Fever Feels Horrible, but is  ┆ Fever f

# Encode

In [3]:
model_name = 'all-MiniLM-L6-v2'
%time model = SentenceTransformer(model_name)

CPU times: total: 203 ms
Wall time: 4.06 s




In [4]:
df = pl.read_parquet('data/video-transcripts.parquet')
df.head()

video_id,datetime,title,transcript
str,datetime[μs],str,str
"""vSSkDos2hzo""",2024-09-12 14:01:11,"""We Need to Ret…","""Losing weight …"
"""dBxxi5XAm3U""",2024-08-27 14:00:00,"""We Traveled Ba…","""You’re going f…"
"""cRZOUcpiOxY""",2024-08-13 14:00:01,"""Fever Feels Ho…","""Fever feels ba…"
"""fa8k8IQ1_X0""",2024-08-06 14:00:09,"""A.I. ‐ Humanit…","""humans rule Ea…"
"""GFLb5h2O2Ww""",2024-06-25 14:00:01,"""This Disease i…","""Hello, Steve h…"


In [10]:
column_name_list = ['title', 'transcript']

for column_name in column_name_list:
    # generate embeddings
    embedding_arr = model.encode(df[column_name].to_list())

    # store embeddings in a dataframe
    schema_dict = {column_name+'_embedding-'+str(i): float for i in range(embedding_arr.shape[1])}
    df_embedding = pl.DataFrame(embedding_arr, schema=schema_dict)

    # append embeddings to video index
    df = pl.concat([df, df_embedding], how='horizontal')

In [16]:
df.write_parquet('data/video-index.parquet')

# Transform

In [5]:
%time df = pl.scan_parquet('data/video-index.parquet')

CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
dist_name = 'manhattan'
%time dist = DistanceMetric.get_metric(dist_name)

CPU times: total: 0 ns
Wall time: 0 ns


In [7]:
def returnSearchResults(query: str, index: pl.lazyframe.frame.LazyFrame) -> np.ndarray:
    """
        Function to return indexes of top search results
    """
    
    # embed query
    query_embedding = model.encode(query).reshape(1, -1)
    
    # compute distances between query and titles/transcripts
    dist_arr = dist.pairwise(df.select(df.columns[4:388]).collect(), query_embedding) + dist.pairwise(df.select(df.columns[388:]).collect(), query_embedding)

    # search paramaters
    threshold = 40 # eye balled threshold for manhatten distance
    top_k = 5

    # evaluate videos close to query based on threshold
    idx_below_threshold = np.argwhere(dist_arr.flatten()<threshold).flatten()
    # keep top k closest videos
    idx_sorted = np.argsort(dist_arr[idx_below_threshold], axis=0).flatten()

    # return indexes of search results
    return idx_below_threshold[idx_sorted][:top_k]

In [8]:
query = "Time Travel"
idx_result = returnSearchResults(query, df)

print(df.select(['video_id', 'title']).collect()[idx_result])

shape: (5, 2)
┌─────────────┬───────────────────────────────────┐
│ video_id    ┆ title                             │
│ ---         ┆ ---                               │
│ str         ┆ str                               │
╞═════════════╪═══════════════════════════════════╡
│ dBxxi5XAm3U ┆ We Traveled Back in Time. Now Ph… │
│ wwSzpaTHyS8 ┆ Did The Future Already Happen? -… │
│ 2XkV6IpV2Y0 ┆ The History and Future of Everyt… │
│ CWu29PRCUvQ ┆ When Time Became History - The H… │
│ 5TbUxGZtwGI ┆ Time: The History & Future of Ev… │
└─────────────┴───────────────────────────────────┘


In [9]:
df.select(['title', 'video_id']).collect()[idx_result].to_dict(as_series=False)


{'title': ['We Traveled Back in Time. Now Physicists Are Angry.',
  'Did The Future Already Happen? - The Paradox of Time',
  'The History and Future of Everything -- Time',
  'When Time Became History - The Human Era',
  'Time: The History & Future of Everything – Remastered'],
 'video_id': ['dBxxi5XAm3U',
  'wwSzpaTHyS8',
  '2XkV6IpV2Y0',
  'CWu29PRCUvQ',
  '5TbUxGZtwGI']}

# Interface

In [22]:
def generate_response(question, df):

    idx_result = returnSearchResults(question, df)
    result = df.select(['title', 'video_id']).collect()[idx_result]
    
    return result