In [None]:
%%bash
# Step up one directory from where this cell starts and print the directory
# tree from there (needs the `tree` command to be installed).
cd ..
tree

#### **`Dependencies`**

In [None]:
import os

import pandas as pd
import polars as pl

from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from src.paths import PathConfig, load_config
from src.utils import embed_transcripts, transcribe_videos

In [None]:
# Pandas display limits: never truncate columns, show at most 100 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Polars display limits, gathered in one mapping and applied in a single call.
polars_display_options = {
    "tbl_rows": 100,  # max number of rows to display
    "tbl_cols": 1000,  # max number of columns to display
    "tbl_width_chars": 10_000,  # max table width, in characters
    "fmt_str_lengths": 100,  # max characters shown for a pl.Utf8 (str) column
    "fmt_table_cell_list_len": 20,  # max items shown for a pl.List column
}
pl.Config(**polars_display_options)

#### **`extract-transform-load`**

In [None]:
# Read the project config, pull out the configured YouTube channel IDs,
# and preview the first five of them.
channel_config = load_config()
youtube_channel_ids: list[str] = channel_config.get("youtube_channel_ids")
youtube_channel_ids[:5]

In [None]:
# Build one deferred transcribe_videos call per channel ID (tqdm wraps the
# iteration over the IDs), then execute them with joblib using n_jobs=-1.
# The results are gathered into a list of pl.LazyFrames, one per channel ID.
transcription_tasks = (
    delayed(transcribe_videos)(youtube_channel_id)
    for youtube_channel_id in tqdm(youtube_channel_ids)
)
lfs: list[pl.LazyFrame] = Parallel(n_jobs=-1)(transcription_tasks)

In [None]:
# Stack the per-channel LazyFrames vertically, keep one row per video_id,
# order by creation_date, and materialize everything as a single pl.DataFrame.
stacked_lf = pl.concat(lfs, how="vertical")
deduplicated_lf = stacked_lf.unique(subset="video_id")
df: pl.DataFrame = deduplicated_lf.sort(by="creation_date").collect()
print(df)

- [**`Hugging Face Embedding Models Leaderboard`**](https://huggingface.co/spaces/mteb/leaderboard)

- [**`15 Best Open Source Text Embedding Models`**](https://www.graft.com/blog/open-source-text-embedding-models#15-open-source-text-embedding-models-updated-april-2024)

- [**`Alibaba-NLP/gte-large-en-v1.5 Embedding Model`**](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5)

In [None]:
# look up the embedding model ID in the project config
model_id: str = load_config().get("embedding_model_id")

# Fail fast with a clear message when the ID is absent: .get() yields None for a
# missing key, and handing None to SentenceTransformer would not load the
# intended model (the constructor treats the model name as optional — see the
# sentence-transformers docs), so the problem would only surface later.
if not model_id:
    raise ValueError("`embedding_model_id` is missing or empty in the project config")

# fetch the embedding model from Hugging Face
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally — confirm the configured model source is trusted.
embedding_model: SentenceTransformer = SentenceTransformer(model_id, trust_remote_code=True)

# show the loaded model together with its sentence-embedding dimension
embedding_model, embedding_model.get_sentence_embedding_dimension()

In [None]:
# Run embed_transcripts on the transcripts DataFrame with the embedding model,
# parse the string creation_date column into a datetime, and write the
# resulting pl.DataFrame as Parquet to PathConfig.PROCESSED_DATA_PATH.
creation_datetime = pl.col("creation_date").str.to_datetime()
embed_transcripts(df, embedding_model).with_columns(creation_datetime).write_parquet(
    PathConfig.PROCESSED_DATA_PATH
)

In [None]:
# Load the Parquet file at PathConfig.PROCESSED_DATA_PATH back into a
# pl.DataFrame and print it.
processed_df = pl.read_parquet(PathConfig.PROCESSED_DATA_PATH)
print(processed_df)