In [1]:
from youtube_transcript import fetch_youtube_transcript
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [5]:
url = "https://www.youtube.com/watch?v=PSs6nxngL6k"

In [6]:
data = fetch_youtube_transcript(url)

In [9]:
full_transcript = " ".join([d["transcript"] for d in data])

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=300,
    chunk_overlap=0,
)

In [14]:
documents = text_splitter.split_text(full_transcript)

In [18]:
docs = [Document(page_content=doc) for doc in documents]

In [45]:
docs = []
for i in range(len(data)):
    for idx in range(len(documents)):
        doc = documents[idx]
        if data[i]['transcript'] in doc:
            # print(doc)
            docs.append(Document(page_content = doc, metadata = data[i]))

In [2]:
import os
import shutil

# Remove the existing chroma_db directory if it exists (to fix corrupted DB)
if os.path.exists('chroma_db'):
    shutil.rmtree('chroma_db')

os.makedirs('chroma_db', exist_ok=True)

vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory='chroma_db',
    collection_name='youtube_transcripts'
)

In [59]:
vector_store.add_documents(docs)

['cd51b381-ed3f-4463-ab35-035e74796a51',
 'd1e36a68-5b99-43ce-8be0-f5a40229f8fb',
 '3440c309-cd47-4ff1-9469-4c86ce22a525',
 'a9c7ae91-eb8b-457e-816a-3015b9879582',
 '81c6ec2f-660e-4e72-9379-df7fabeb7f71',
 '6b4d5647-e111-4084-9315-57f87574dc90',
 '7fa5d85f-c792-4a57-ad6c-95da59e2b15d',
 '98e16045-93b1-4313-9e73-ff06d21a3398',
 '8b37061d-d95c-4713-bc90-5b2e9fb105ea',
 '521cdecb-c090-429a-84e1-027dca9ee1b8',
 '28f59704-d563-4a90-8900-05709b3220a9',
 'd97a5451-5581-4f17-9977-7e5d6366a5c3',
 'f261c1de-1008-45c0-9524-a164e50b6b97',
 'f64414cf-dc5f-4e17-b723-da9a625a77e5',
 '6dde4e9a-3757-446f-a013-1a6e8ec4b378',
 '669f3a4e-9948-402e-880a-53f1eb52d2be',
 '19afb3ce-f136-4d44-9bd0-721d33d888c0',
 'd4774d03-a72c-4fdf-8d04-b4d48f235fde',
 '0313d4d6-e942-4b30-aa2e-2d5488f0f627',
 'b498bcdd-6c25-4fe5-8cd5-0a5ace0c10c8',
 'ce662231-470d-490b-9b54-83773cd1456b',
 '9d599322-1031-4c2e-ba62-a9fc629d86c1',
 '36912977-6370-4b49-a241-dc966235c62c',
 'addf11d4-2d9b-426e-84d9-97a87ee51c7a',
 'bc866a04-1591-

In [4]:
vector_store.get(include=['embeddings','documents', 'metadatas'])

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['embeddings', 'documents', 'metadatas'],
 'data': None,
 'metadatas': []}

In [61]:
# search documents
vector_store.similarity_search(
    query='What are decision trees?',
    k=2
)

[Document(metadata={'seconds': 16, 'minutes': 2, 'transcript': 'love stack quest you are well on your'}, page_content="this decision tree classifies a person as either someone who loves stack quest or someone who doesn't since decision trees are a type of machine learning then if you understand how we use this tree to predict or classify if someone would love stack quest you are well on your way to understanding"),
 Document(metadata={'seconds': 26, 'minutes': 2, 'transcript': 'learning'}, page_content="this decision tree classifies a person as either someone who loves stack quest or someone who doesn't since decision trees are a type of machine learning then if you understand how we use this tree to predict or classify if someone would love stack quest you are well on your way to understanding")]