In [1]:
%%bash
# print the directory tree, starting one level above the notebook's working directory
cd ../
tree .

[01;34m.[0m
├── [00mLICENSE[0m
├── [00mMakefile[0m
├── [00mREADME.md[0m
├── [01;34martifacts[0m
│   └── [01;34membedding_model[0m
│       ├── [01;34m1_Pooling[0m
│       │   └── [00mconfig.json[0m
│       ├── [00mREADME.md[0m
│       ├── [00mconfig.json[0m
│       ├── [00mconfig_sentence_transformers.json[0m
│       ├── [00mmodel.safetensors[0m
│       ├── [00mmodules.json[0m
│       ├── [00msentence_bert_config.json[0m
│       ├── [00mspecial_tokens_map.json[0m
│       ├── [00mtokenizer.json[0m
│       ├── [00mtokenizer_config.json[0m
│       └── [00mvocab.txt[0m
├── [00mconfig.yaml[0m
├── [01;34mdata[0m
│   └── [00myoutube_transcripts.parquet[0m
├── [00mdata.dvc[0m
├── [01;34mdocker[0m
├── [01;34mnotebooks[0m
│   └── [00myoutube-semantic-search.ipynb[0m
├── [00mpoetry.lock[0m
├── [00mpyproject.toml[0m
└── [01;34msrc[0m
    ├── [00m__init__.py[0m
    ├── [00mapp.py[0m
    ├── [00metl.py[0m
    ├── [00mfrontend.py[0m
    ├

#### **`Dependencies`**

In [2]:
# load IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
# mode 2: reload all modules before executing each cell
%autoreload 2

In [4]:
import warnings

from pathlib import Path, PosixPath

import pandas as pd
import polars as pl

from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from src.paths import PathConfig, load_config
from src.utils import embed_transcripts, transcribe_videos

warnings.filterwarnings("ignore")

In [5]:
# configure how Pandas renders frames
pd.options.display.max_rows = 100  # show at most 100 rows
pd.options.display.max_columns = None  # None = no cap on the number of columns shown

# configure how Polars renders tables; the options are collected first, then applied in one call
polars_display_options: dict[str, int] = {
    "tbl_rows": 10,  # max number of rows to display
    "tbl_cols": 100,  # max number of columns to display
    "tbl_width_chars": 1000,  # max table width, in characters
    "fmt_str_lengths": 50,  # max number of characters to display for a pl.Utf8 (str) dtype column
    "fmt_table_cell_list_len": 20,  # max number of items to display for a pl.List dtype column
}
pl.Config(**polars_display_options)

<polars.config.Config at 0x103b39ba0>

#### **`extract-transform-load`**

In [None]:
# read the YouTube channel IDs from the project config and preview the first five
project_config = load_config()
youtube_channel_ids: list[str] = project_config.get("youtube_channel_ids")
youtube_channel_ids[:5]

In [None]:
# run transcribe_videos once per YouTube channel ID through joblib (n_jobs=-1),
# with tqdm tracking the channel IDs as they are handed to the workers;
# the result is a list of pl.LazyFrames, one per channel ID
lfs: list[pl.LazyFrame] = Parallel(n_jobs=-1)(
    map(delayed(transcribe_videos), tqdm(youtube_channel_ids))
)

In [None]:
# build the lazy query first: stack the per-channel pl.LazyFrames vertically,
# keep a single row per video_id, and order the rows by creation_date
transcripts_lf: pl.LazyFrame = (
    pl.concat(lfs, how="vertical").unique(subset="video_id").sort(by="creation_date")
)

# materialize the lazy query into a single pl.DataFrame and show it
df: pl.DataFrame = transcripts_lf.collect()
print(df)

- [**`Hugging Face Embedding Models Leaderboard`**](https://huggingface.co/spaces/mteb/leaderboard)

- [**`15 Best Open Source Text Embedding Models`**](https://www.graft.com/blog/open-source-text-embedding-models#15-open-source-text-embedding-models-updated-april-2024)

- [**`Alibaba-NLP/gte-large-en-v1.5 Embedding Model`**](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5)

In [None]:
# look up the embedding model ID in the project config and fetch that model from Hugging Face
# NOTE(review): trust_remote_code=True presumably lets the model repository run its own code — confirm the model ID is trusted
model_id: str = load_config().get("embedding_model_id")
embedding_model: SentenceTransformer = SentenceTransformer(
    model_name_or_path=model_id,
    trust_remote_code=True,
)

# display the loaded model together with the dimension of its sentence embeddings
(embedding_model, embedding_model.get_sentence_embedding_dimension())

In [None]:
# save the embedding model locally to <ARTIFACTS_DIR>/embedding_model/
# (artifacts/embedding_model/ under the repository root, per the tree listing at the top)
# annotated as Path, not PosixPath: the concrete pathlib subclass depends on the OS,
# so the platform-neutral base class is the accurate type — presumably PathConfig builds it with pathlib
artifacts_dir: Path = PathConfig.ARTIFACTS_DIR

# create the artifacts directory (and any missing parents); a no-op if it already exists
artifacts_dir.mkdir(parents=True, exist_ok=True)

# the save path is passed as a plain string
embedding_model.save(str(artifacts_dir / "embedding_model"))

In [None]:
# add the embeddings for the video transcripts to the frame
transcripts_embedded = embed_transcripts(df, embedding_model)

# parse creation_date from string to datetime, then write the frame to
# PathConfig.PROCESSED_DATA_PATH (data/youtube_transcripts.parquet in the repository tree)
transcripts_embedded.with_columns(
    pl.col("creation_date").str.to_datetime()
).write_parquet(PathConfig.PROCESSED_DATA_PATH)

In [6]:
# load the parquet file at PathConfig.PROCESSED_DATA_PATH back into memory and show it
transcripts_df: pl.DataFrame = pl.read_parquet(PathConfig.PROCESSED_DATA_PATH)
print(transcripts_df)

shape: (1_357, 5)
┌─────────────┬─────────────────────┬─────────────────────────────────────────────────────┬─────────────────────────────────────────────────────┬─────────────────────────────────────────────────────┐
│ video_id    ┆ creation_date       ┆ title                                               ┆ transcript                                          ┆ embedding                                           │
│ ---         ┆ ---                 ┆ ---                                                 ┆ ---                                                 ┆ ---                                                 │
│ str         ┆ datetime[μs]        ┆ str                                                 ┆ str                                                 ┆ binary                                              │
╞═════════════╪═════════════════════╪═════════════════════════════════════════════════════╪═════════════════════════════════════════════════════╪═════════════════════════════════════