# About
- Load transcript data
- Create and persist a chromadb

# Imports and Settings

In [49]:
import pandas as pd
import os
from dotenv import load_dotenv

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Load variables defined in a .env file into the process environment
load_dotenv()

# Fail fast with an actionable message if the key is missing, instead of a bare
# KeyError('OPENAI_APIKEY'). The exception type is unchanged.
try:
    OPENAI_APIKEY = os.environ['OPENAI_APIKEY']
except KeyError as exc:
    raise KeyError(
        "OPENAI_APIKEY is not set. Add it to a .env file or export it "
        "in the environment before starting the kernel."
    ) from exc

# Create Vector Store

In [50]:

# Embeddings client shared by the indexing step and the query step.
# OpenAI embeddings models: https://platform.openai.com/docs/guides/embeddings
# OpenAI pricing: https://openai.com/api/pricing/
# Documentation on LangChain: https://python.langchain.com/docs/modules/data_connection/text_embedding/ ; https://python.langchain.com/docs/integrations/text_embedding/openai/
embeddings_model = OpenAIEmbeddings(
    api_key=OPENAI_APIKEY,
    model='text-embedding-3-large',
    max_retries=100,          # retry generously on API errors
    chunk_size=16,            # texts per embedding batch (per the LangChain docs)
    show_progress_bar=False,
)


In [51]:
# Read the cleaned transcripts export into a dataframe and preview the first rows
transcripts_path = '../data/transcripts_all_2024-04-10_cleaned'
df_transcripts = pd.read_csv(transcripts_path)
df_transcripts.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,videoId,title,description,publishedAt,date,year,transcript
0,0,0,r3PuCQ8CxTc,Benefits and Side Effects of the Pneumonia Vac...,Randomized controlled trials have found that p...,2024-04-10T11:59:52Z,2024-04-10,2024,"""Benefits and Side Effects of the Pneumonia Va..."
1,2,2,oa6UtySJKFE,Benefits and Side Effects of the Flu Vaccine,Flu shots can prevent more than just the flu. ...,2024-04-08T11:59:55Z,2024-04-08,2024,"In this 3-video series, I show the science beh..."
2,3,3,wZLgy4dvM1Y,New Sponsorship: Big Broccoli,Help keep us ad- and sponsorship-free by makin...,2024-04-07T15:59:56Z,2024-04-07,2024,I’m Dr. Michael Greger and ever since I starte...


In [52]:
# Keep only the transcript text plus the fields we want carried along as metadata
cols_keep = [
    'videoId',
    'title',
    'description',
    'publishedAt',
    'date',
    'year',
    'transcript',
]
df_transcripts = df_transcripts.loc[:, cols_keep]
df_transcripts.head(n=3)

Unnamed: 0,videoId,title,description,publishedAt,date,year,transcript
0,r3PuCQ8CxTc,Benefits and Side Effects of the Pneumonia Vac...,Randomized controlled trials have found that p...,2024-04-10T11:59:52Z,2024-04-10,2024,"""Benefits and Side Effects of the Pneumonia Va..."
1,oa6UtySJKFE,Benefits and Side Effects of the Flu Vaccine,Flu shots can prevent more than just the flu. ...,2024-04-08T11:59:55Z,2024-04-08,2024,"In this 3-video series, I show the science beh..."
2,wZLgy4dvM1Y,New Sponsorship: Big Broccoli,Help keep us ad- and sponsorship-free by makin...,2024-04-07T15:59:56Z,2024-04-07,2024,I’m Dr. Michael Greger and ever since I starte...


In [55]:
# Instantiate and run the dataframe loader to turn each row into a document.
# You must pass the name of the column that holds the text ('transcript'); the other
# columns are carried along as document metadata.
# https://python.langchain.com/docs/modules/data_connection/document_loaders/
loader = DataFrameLoader(df_transcripts, 'transcript')
docs = loader.load()

# Split the transcripts into smaller, overlapping chunks
# https://python.langchain.com/docs/modules/data_connection/document_transformers/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Create and persist a Chroma vector store
# https://python.langchain.com/docs/modules/data_connection/vectorstores/
# Chroma.from_documents embeds every chunk and ADDS it to whatever is already stored in
# the persist directory, so re-running this cell would pay for all embeddings again and
# store duplicate chunks. Reuse an existing store instead; delete the directory to force
# a full rebuild (e.g. after the transcripts or the chunking settings change).
persist_dir = "../chroma_db"
if os.path.isdir(persist_dir) and os.listdir(persist_dir):
    vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embeddings_model)
else:
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings_model, persist_directory=persist_dir)



# Test it out with some queries!

In [60]:
# Reopen the persisted store from disk to confirm it was written and loads correctly
vectorstore2 = Chroma(
    embedding_function=embeddings_model,
    persist_directory="../chroma_db",
)


In [75]:
# Sample query and the number of hits to return
query = "eggs and diabetes"
k = 10
results = vectorstore2.similarity_search_with_relevance_scores(query, k=k)
# Show each hit as (video title, relevance score)
[(doc.metadata['title'], score) for doc, score in results]

[('Eggs and Diabetes', 0.6322850575728234),
 ('Eggs and Diabetes', 0.5619138976070329),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.4246064193885445),
 ('Debunking Egg Industry Myths', 0.41186146527994116),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.40046226004429075),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy', 0.397414874309559),
 ('Whose Health Unaffected by Eggs', 0.39514518222355044),
 ('Debunking Egg Industry Myths', 0.38059743881674213),
 ('How Not To Die: The Role of Diet in Preventing, Arresting, and Reversing Our Top 15 Killers',
  0.3653613530800539),
 ('Flashback Friday: How Not to Die', 0.3652545951127758)]

In [80]:
# Build the "retriever" on top of the vector store, telling it how many results to return.
# It will be plugged into the RAG "chain" later, so check here that it behaves as expected.
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore/
retriever = vectorstore2.as_retriever(search_kwargs=dict(k=k))

In [81]:
# Sanity-check the retriever with the test query and list the titles it returns
results = retriever.invoke(query)
[doc.metadata['title'] for doc in results]


['Eggs and Diabetes',
 'Eggs and Diabetes',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Debunking Egg Industry Myths',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Whose Health Unaffected by Eggs',
 'Debunking Egg Industry Myths',
 'How Not To Die: The Role of Diet in Preventing, Arresting, and Reversing Our Top 15 Killers',
 'Flashback Friday: How Not to Die']

# Discussion

This is very, very basic! You may also want to do the following (which we will cover in the future):
- Search on the small chunks but retrieve the full "parent" document to include in the context of the LLM query
- Vectorize summaries of each document and then retrieve the summary or the full parent document
- Vectorize questions a person might have about the documents and then retrieve the full parent document
- Optimize the chunk size
- Use different types of retrievers
- ...and more!!