In [2]:
import dotenv
dotenv.load_dotenv()

True

## 1. Load YouTube transcript

In [3]:
from langchain.document_loaders import YoutubeLoader

In [21]:
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=AbPQD1evVFM", add_video_info=True
)
docs = loader.load()

In [22]:
docs



In [5]:
len(docs[0].page_content)

121766

In [6]:
docs[0].page_content



## Summarize

In [7]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106")
chain = load_summarize_chain(llm, chain_type="stuff")

# chain.run(docs)

In [8]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

# Define prompt
prompt_template = """Identify angles that a journalist covering this city council meeting might want to write a story about. These do not have to be fully fleshed out stories. Rather, they should be leads that the journalist would follow up on with rigorous reporting.
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Define LLM chain
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")

# docs = loader.load()
# print(stuff_chain.run(docs))

In [9]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import CharacterTextSplitter

llm = ChatOpenAI(temperature=0)

# Map
map_template = """The following is a transcript from a New York City Council Meeting.
{docs}
Based on this transcript, please identify angles that a journalist covering this city council meeting might want to write a story about. These do not have to be fully fleshed out stories. Rather, they should be leads that the journalist would follow up on with rigorous reporting. Please include the most relevant quote from the transcript for each angle.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [10]:
# Reduce
reduce_template = """The following is set of angles that a journalist might wish to pursue in their reporiting:
{docs}
Take these and distill it into a final, consolidated list of angles to follow up on. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [17]:
# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [18]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

In [None]:
# Text splitter that doesnt work
# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=10000, chunk_overlap=2000
# )
# split_docs = text_splitter.split_documents(docs)

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100000, chunk_overlap=2000, add_start_index=True
)
split_docs = text_splitter.split_documents(docs)

In [14]:
len(docs[0].page_content)

121766

In [15]:
len(split_docs)

6

In [20]:
print(map_reduce_chain.run(split_docs))

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 19975 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

## 2. Split

In [78]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [79]:
len(all_splits)

153

In [61]:
all_splits[10].metadata

{'source': 'AbPQD1evVFM',
 'title': 'LIVE: Watch "Oversight - New York City\'s Climate Resiliency Efforts" Hosted Jointly by the Commit…',
 'description': 'Unknown',
 'view_count': 203,
 'thumbnail_url': 'https://i.ytimg.com/vi/AbPQD1evVFM/hq720.jpg?v=6526dc0f',
 'publish_date': '2023-10-11 00:00:00',
 'length': 9704,
 'author': 'NYCCouncil',
 'start_index': 8005}

## 3. Store

In [80]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

## 4. Retrieve

In [81]:
retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={"k": 6})
retrieved_docs = retriever.get_relevant_documents(
    "What is the biggest roadblock to executing new infrastructure projects?"
)

In [67]:
len(retrieved_docs)

6

In [83]:
print(retrieved_docs[0].page_content)

the sewage system that's uh the speed of projects and and um I guess the way I see it the way I see it the way I see it our challenge is not not wanting to do the right thing it is not even a lack of capital because once we have projects identified I have a 30 1 billion 10-year Capital plan at D capital is not really the main constraint um if we choose to prioritize I mean I don't I don't think we've had a lot of instances where or blue belt or cloudburst projects have been delayed I you know they in the popular imagination there's a lot that we blame uh environmental impact statements on uh for which is not necessarily the issue here um it I'm I'm just pointing out I'm I'm just pointing out where's the 22nd project it's how do we push all of them through the pipeline as quickly as possible um some of that is just about having really good management and I could not be more proud of the work that my agency has done in terms of managing its procure in terms of managing its procure in ter

## 5. Generate

In [15]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [16]:
from langchain import hub
prompt = hub.pull('rlm/rag-prompt')

In [17]:
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [22]:
for chunk in rag_chain.stream(""):
    print(chunk, end="", flush=True)

RIT Agerwal, Elijah Hutchinson, and Meritt Larson spoke at the meeting.