# About
- Load transcript data
- Create and persist a chromadb

# Imports and Settings

In [21]:
import pandas as pd
import os
from dotenv import load_dotenv

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Load variables from a .env file into the process environment, then read the
# OpenAI key from it. Indexing os.environ directly means a missing
# OPENAI_APIKEY raises KeyError here instead of failing later.
load_dotenv()
OPENAI_APIKEY = os.environ['OPENAI_APIKEY']

# Create Vector Store

In [22]:

# Embeddings client used both to build the vector store and to embed queries.
# OpenAI embeddings models: https://platform.openai.com/docs/models/embeddings
# OpenAI pricing:           https://openai.com/pricing
# LangChain documentation:  https://python.langchain.com/docs/modules/data_connection/text_embedding/
#                           https://python.langchain.com/docs/integrations/text_embedding/openai/
embeddings_model = OpenAIEmbeddings(
    model='text-embedding-3-large',
    api_key=OPENAI_APIKEY,
    chunk_size=16,  # batch size for embedding requests
    max_retries=100,
    show_progress_bar=False,
)


In [23]:
# Read the cleaned transcript export and preview the first few rows
transcripts_csv = '../data/transcripts_all_2024-04-18_cleaned.csv'
df_transcripts = pd.read_csv(transcripts_csv)
df_transcripts.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,videoId,title,description,publishedAt,transcript,num_tokens
0,0,0,7niWR9iARhE,Podcast: How to Die a Good Death,"Retaining dignity, privacy, and relief from pa...",2024-04-18T12:59:47Z,I hear from lots of people every day who are c...,2412
1,1,1,E-72zyo4wRg,Three Reasons Why Fruits and Vegetables May Re...,Osteoporosis and diet: Even just a single extr...,2024-04-17T11:59:47Z,"""Three Reasons Fruits and Vegetables May Reduc...",590
2,2,2,Rr0rKZ3M-CA,Dark green leafy vegetables are packed with lu...,New subscribers to our e-newsletter always rec...,2024-04-16T18:00:14Z,here's the rate of cognitive decline in elderl...,141


In [24]:
# Number of rows (one per transcript) that were loaded
len(df_transcripts)

2577

In [26]:
# Narrow the frame to the transcript text plus the fields that should be
# carried along with each document as metadata.
cols_keep = ['videoId', 'title', 'description', 'publishedAt', 'transcript']
df_transcripts = df_transcripts.loc[:, cols_keep]
df_transcripts.head(3)

Unnamed: 0,videoId,title,description,publishedAt,transcript
0,7niWR9iARhE,Podcast: How to Die a Good Death,"Retaining dignity, privacy, and relief from pa...",2024-04-18T12:59:47Z,I hear from lots of people every day who are c...
1,E-72zyo4wRg,Three Reasons Why Fruits and Vegetables May Re...,Osteoporosis and diet: Even just a single extr...,2024-04-17T11:59:47Z,"""Three Reasons Fruits and Vegetables May Reduc..."
2,Rr0rKZ3M-CA,Dark green leafy vegetables are packed with lu...,New subscribers to our e-newsletter always rec...,2024-04-16T18:00:14Z,here's the rate of cognitive decline in elderl...


In [27]:
# Instantiate and run the DataFrame loader to turn each row into a document.
# The column that holds the text must be named explicitly; here it is 'transcript'.
# LangChain documentation:
# https://python.langchain.com/docs/modules/data_connection/document_loaders/
loader = DataFrameLoader(df_transcripts, page_content_column='transcript')
docs = loader.load()

In [28]:
# Spot-check: the title stored in the first document's metadata
docs[0].metadata['title']

'Podcast: How to Die a Good Death'

In [29]:
# List which fields ended up in the metadata of the first document
docs[0].metadata.keys()

dict_keys(['videoId', 'title', 'description', 'publishedAt'])

In [30]:
# Break each transcript into smaller, overlapping chunks
# (chunk_size and chunk_overlap are the splitter's size and overlap settings)
# https://python.langchain.com/docs/modules/data_connection/document_transformers/
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)


In [31]:
# Number of documents before splitting
len(docs)

2577

In [32]:
# Number of chunks after splitting
len(splits)

17857

In [33]:
# Inspect the first chunk produced by the splitter
splits[0]

Document(page_content='I hear from lots of people every day who are concerned about how their diet is affecting their health. They need answers based on facts. In other words, in the peer-reviewed medical literature, and that is what I’m here for. Welcome to the Nutrition Facts Podcast. I’m your host, Dr. Michael Greger. Today we explore the possibilities of how we might have control over our own death— retaining dignity, privacy, and pain relief. And we start with a close look at hospice. We have all sorts of detailed stats about dying, but little about the experience of death. For the minority who die under the care of palliative care teams one’s death could probably be described as good, but there’s a suspicion that for the majority who die in hospitals or nursing homes the experience is bad. And that’s where most people die. In spite of widespread preference to die at home, in almost all populations, most deaths occur in institutions. And this doesn’t just have consequences for the

In [34]:
# Create and persist a Chroma vector store
# https://python.langchain.com/docs/modules/data_connection/vectorstores/
#
# Chroma.from_documents() adds to whatever is already stored in the persist
# directory. Re-running this cell against an existing store would therefore
# insert every chunk a second time (duplicate search hits) and pay for all of
# the embeddings again. Build the store only when the directory is missing or
# empty; otherwise reopen the one on disk.
# To force a full rebuild, delete the ../chroma_db directory and re-run.
persist_directory = "../chroma_db"
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings_model)
else:
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings_model, persist_directory=persist_directory)


# Test it out with some queries!

In [15]:
# Re-open the persisted store from disk to confirm it was saved correctly
vectorstore2 = Chroma(
    embedding_function=embeddings_model,
    persist_directory="../chroma_db",
)


In [16]:
# Test query: fetch the top-k chunks together with their relevance scores
query = "eggs and diabetes"
k = 10
results = vectorstore2.similarity_search_with_relevance_scores(query, k=k)
# Show only the video title and score of each hit
[(doc.metadata['title'], score) for doc, score in results]

[('Eggs and Diabetes', 0.6323911411905252),
 ('Eggs and Diabetes', 0.6323001039977417),
 ('Eggs and Diabetes', 0.5619727767544027),
 ('Eggs and Diabetes', 0.5618286345324954),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.4246965714975093),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.4243575843949362),
 ('Debunking Egg Industry Myths', 0.41192523346173815),
 ('Debunking Egg Industry Myths', 0.41148863425801385),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.4008350489193696),
 ('Bacon, Eggs, and Gestational Diabetes During Pregnancy',
  0.40055894491477473)]

In [17]:
# Create the "retriever", including how many results it should return.
# It will be used in the RAG "chain" built later, so check here that it works as expected.
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore/
retriever_kwargs = {"k": k}
retriever = vectorstore2.as_retriever(search_kwargs=retriever_kwargs)

In [18]:
# Run the same test query through the retriever and list the titles it returns
results = retriever.invoke(query)
[hit.metadata['title'] for hit in results]


['Eggs and Diabetes',
 'Eggs and Diabetes',
 'Eggs and Diabetes',
 'Eggs and Diabetes',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Debunking Egg Industry Myths',
 'Debunking Egg Industry Myths',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy',
 'Bacon, Eggs, and Gestational Diabetes During Pregnancy']

In [19]:
# Number of documents the retriever returned
len(results)

10

In [20]:
# Display the retrieved documents in full
results

[Document(page_content='"Eggs and Diabetes" Type 2 diabetes is becoming  a world pandemic. We know the consumption  of eggs is related to the development of some  other chronic diseases... what about diabetes? Researchers found a stepwise  increase in risk the more and more  eggs people ate. Eating just a single  egg a week appeared to increase the odds  of diabetes by 76%. Two eggs a week appeared  to double the odds, and just a single egg  a day tripled the odds. Three times greater  risk of type 2 diabetes, one of the leading causes  of death and amputation, blindness, and kidney failure. This is not the  first time a link between eggs and diabetes  has been reported. In 2009, Harvard researchers found  that a single egg a day or more was associated with  an increased risk of type 2 diabetes  in men and women, and that finding has since  also been confirmed in other populations:  Asia in 2011 and Europe in 2012. And the "high" consumption of eggs associated with diabetes risk  was l

# Discussion

This is very, very basic! You may also want to do the following (which we will cover in the future):
- Search on the small chunks but retrieve the full "parent" document to include in the context of the LLM query
- Vectorize summaries of each document and then retrieve the summary or the full parent document
- Vectorize questions a person might have about the documents and then retrieve the full parent document
- Optimize the chunk size
- Use different types of retrievers
- ...and more!!