In [7]:
#Inspired by https://github.com/svpino/youtube-rag/blob/main/rag.ipynb
#Installing libraries
# %pip install --upgrade openai
# %pip install -U openai-whisper
# %pip install pytube
# %pip install langchain
# %pip install langchain_openai
# %pip install docarray
# %pip install --upgrade --quiet  llmlingua accelerate
# %pip install pinecone-client==3.0.0
# %pip install langchain_pinecone

## Chain Prompt, Model and Parser

Go to [OpenAI](https://openai.com), create your own account, and generate an API key.
Make sure you copy it correctly and save it somewhere safe.
 
 

In [31]:
#Insert openai api key and youtube video link below:
from openai import OpenAI
import os

#Prefer a key supplied through the OPENAI_API_KEY environment variable so the secret
#does not have to be saved inside the notebook; the placeholder below is only used
#when that variable is not set (replace it with your key in that case).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "[ENTER YOUR OPENAI API KEY HERE]")
#Video whose audio is transcribed and used as the knowledge source further down
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=cdiD-9MMpb0"

### Defining the model.

**Cost-Saving Strategy**: Adding the `max_tokens` parameter prevents the model from generating too many tokens.

You can read about Prompt Parameters in my article: ["Understanding Prompt Engineering Parameters for Enhanced Performance of LLMs"](https://www.linkedin.com/pulse/understanding-prompt-parameters-enhanced-performance-llms-chhagani-vfv4c/?trackingId=D6N6tB2%2BRoKmgQ9galckSw%3D%3D) 

In [9]:
#LangChain to use openai model
from langchain_openai.chat_models import ChatOpenAI

#Chat model used by every chain in this notebook: temperature=0, and max_tokens=150
#caps the length of each reply
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
    max_tokens=150,
    temperature=0,
)

In [10]:
#Testing the model: send a plain string prompt straight to the chat model.
#The cell output is the message object returned by the model (content plus response metadata).
model.invoke("How are you?")

AIMessage(content="I'm just a computer program, so I don't have feelings or emotions, but I'm here to help you with anything you need. How can I assist you today?", response_metadata={'finish_reason': 'stop', 'logprobs': None})

### Adding a Parser to the chain

The parser extracts only the string content of the model's response, so `invoke()` returns a plain string.

In [11]:
#Parse only the output string using LangChain: StrOutputParser is placed after the model
#so that the chain returns a plain string
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

#Chain the model and the parser: the result from the model goes into the parser, and the combined runnable is saved in the chain variable

chain = model | parser

In [12]:
#Testing the chain: same prompt as the model test, but the output is now a plain string
chain.invoke("How are you?")

"I'm just a computer program, so I don't have feelings or emotions, but I'm here to help you with anything you need. How can I assist you today?"

### Prompt Template

In [13]:
#Defining prompt template
#ChatPromptTemplate is imported from langchain_core (already used elsewhere in this notebook),
#which is where the class is defined; `langchain.prompts` is only a legacy re-export of it.
from langchain_core.prompts import ChatPromptTemplate

#{context} and {question} are the two placeholders filled in when the chain is invoked
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [14]:
#Adding prompt to the chain: prompt -> model -> parser.
#This rebinds `chain`, replacing the model | parser chain defined earlier.
chain = prompt | model | parser

In [15]:
#Testing the chain: the dict keys fill the {context} and {question} placeholders of the template
chain.invoke({
    "context": "My name is Mahima",
    "question": "What is my name?"
})

'Your name is Mahima.'

## Split Text, Generate Embedding and Pinecone retrieval

### Text from YouTube video

The code below uses OpenAI's Whisper to transcribe the audio of the YouTube video to text and saves the result in `transcription.txt`.

In [16]:
import os
import tempfile
import whisper
from pytube import YouTube

#Transcribe the video only when no transcript exists yet: transcription.txt acts as a cache,
#so delete that file to force a fresh download and transcription.
if not os.path.exists("transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    if audio is None:
        #.first() gives None when the filter matches no stream; fail with a clear message
        #instead of an AttributeError on audio.download() below
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    whisper_model = whisper.load_model("base")

    #The downloaded audio is only needed while transcribing, so it lives in a temporary
    #directory that is removed as soon as the text has been extracted
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    #The path and the file handle use separate names (the handle used to overwrite the path variable)
    with open("transcription.txt", "w") as transcript_file:
        transcript_file.write(transcription)

In [17]:
#Read the transcript back from disk and display its first 100 characters to verify
#that the transcription step above succeeded
with open("transcription.txt") as transcript_file:
    transcription = transcript_file.read()

transcription[:100]

"I think it's possible that physics has exploits and we should be trying to find them. arranging some"

#### Error

Invoking the chain with the whole transcript as context fails: the request is rejected because it exceeds a rate limit of the model being used (for example, the maximum number of tokens per minute).

In [18]:
#Send the whole transcript as context. The failure is the point of this cell, so the
#exception is caught and its message printed instead of stopping the notebook run.
try:
    chain.invoke({"context": transcription, "question": "Is reading papers a good idea?"})
except Exception as api_error:
    print(api_error)

Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-axYGPtzMjNzaBubJMB1oSL34 on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}


In [19]:
#Loading the saved transcript with TextLoader; the loaded documents are kept in text_documents
from langchain_community.document_loaders import TextLoader

loader = TextLoader("transcription.txt")
text_documents = loader.load()

### Text Splitting

Splitting the transcript into chunks of 1000 characters with an overlap of 20 characters between consecutive chunks.

In [20]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

#Split the loaded transcript into chunks, using the sizes given in the description above:
#chunk_size=1000 and chunk_overlap=20 (characters)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)

In [21]:
#Number of chunks produced by the splitter
print(len(documents))

221


### Creating Embeddings

To find the relevant documents among the chunks, we create embeddings. You can get a better feel for embeddings by trying [Cohere's Playground](https://dashboard.cohere.com/playground/embed).

In [22]:
#Embeddings from OpenAI, created with the same API key variable as the chat model
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [23]:
#Saving the chunks and their embeddings to a vector store held in memory
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)



In [24]:
#Using a retriever to fetch data from the vector store. RunnableParallel builds the two prompt
#inputs from a single question string: `context` comes from the retriever and `question`
#is the input passed through the retrieval step unchanged
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

retriever2 = vectorstore2.as_retriever()
setup = RunnableParallel(context=retriever2, question=RunnablePassthrough())
#Inspect the dict produced for a sample question
setup.invoke("What is the future of AI?")

{'context': [Document(page_content="ton of text on the internet is about humans and connection and love and so on. So I think they have a very good understanding in some in some sense of of how people speak to each other about this and they're very capable of creating a lot of that kind of text. There's a lot of like sci-fi from 50s and 60s that imagined AI's in a very different way. They are calculating coal Balkan-like machines. That's not what we're getting today. We're getting pretty emotional AI's that actually are very competent and capable of generating you know possible sounding text with respect to all of these topics. Yeah I'm really hopeful about AI systems that are like companions that help you grow, develop as a human being, help you maximize long term happiness. But I'm also very worried about AI systems that figure out from the internet that humans get attracted to drama. So these would just be like shit talking AI's. Did you hear it? They'll do gossip. They'll do they'l

In [25]:
#Chaining all together: retrieval setup -> prompt -> model -> parser (this rebinds `chain`)
chain = setup | prompt | model | parser
chain.invoke("What is the future of AI?")

'The future of AI could involve AI systems that act as companions to help individuals grow and develop, as well as AI systems that may exploit vulnerabilities in physics to advance their capabilities.'

### Using Pinecone

Set up your [Pinecone account](https://www.pinecone.io) and create an index before running the steps below. We are using Pinecone to store the chunks of data along with their embeddings.

In [26]:
#Insert your Pinecone API key below.
#setdefault keeps a PINECONE_API_KEY that is already exported in the environment, so a real
#key is never overwritten by this placeholder; the value below is used only when the
#variable is not set.
os.environ.setdefault('PINECONE_API_KEY', "[ENTER YOUR PINECONE API KEY HERE]")

In [27]:
#Inserting the documents and their embeddings in the Pinecone index
from langchain_pinecone import PineconeVectorStore

#Name of the target index; per the instructions above it must already exist in your Pinecone account
index_name = "article-rag-index"

#NOTE(review): presumably every run of this cell uploads the chunks again - confirm whether
#re-running it adds duplicate entries to the index
pinecone = PineconeVectorStore.from_documents(
    documents,embeddings,index_name=index_name
)

  from tqdm.autonotebook import tqdm


In [28]:
#Retrieving data from Pinecone instead of system memory: same parallel setup as before,
#but `context` now comes from the Pinecone-backed retriever
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

pinecone_retriever = pinecone.as_retriever()
pinecone_setup = RunnableParallel(context=pinecone_retriever, question=RunnablePassthrough())
#Inspect the dict produced for a sample question
pinecone_setup.invoke("What is the future of AI?")

{'context': [Document(page_content="space but in the digital space it just feels like it's going to be very tricky. Very tricky to out because it seems to be pretty low cost to fake stuff. What are you going to put an AI in jail for like trying to use a fake personhood proof? I mean okay fine you'll put a lot of AI in jail but there'll be more AI's like exponentially more. The cost of creating bought is very low. Unless there's some kind of way to track accurately like you're not allowed to create any program without showing tying yourself to that program. Like any program that runs on the internet you'll be able to trace every single human program that was involved with that program. Yeah maybe you have to start declaring when you know we have to start drawing those boundaries and keeping track of okay what are digital entities versus human entities and what is the ownership of human entities and digital entities and something like that. I don't know but I think I'm optimistic that th

In [29]:
#Chaining the Pinecone setup: pinecone_setup -> prompt -> model -> parser (this rebinds `chain`)
chain = pinecone_setup | prompt | model | parser
chain.invoke("What is the future of AI?")

'The future of AI involves the challenge of distinguishing between digital entities and human entities, establishing boundaries, and tracking ownership. It may require implementing measures to accurately trace the origin of programs running on the internet.'

## Wrap retriever with LLMLingua

**Cost-Saving Strategy**: LLMLingua helps with prompt compression and thus reduces the input token cost. Below is an example of using [LLMLingua with LangChain](https://python.langchain.com/docs/integrations/retrievers/llmlingua).

You can read more about LLMLingua in my article: ["Cost-Saving Strategies for Large Language Models(LLMs) - Part 1"](https://www.linkedin.com/pulse/cost-reduction-strategies-large-language-modelsllms-mahima-chhagani-7642c/?trackingId=WmayODJxThKUsQWc1jiGqw%3D%3D)

In [32]:
#Wrapping our base retriever with a ContextualCompressionRetriever, using LLMLinguaCompressor as a compressor
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors.llmlingua_filter import LLMLinguaCompressor

#The compressor is given the base retriever's documents; the wrapped retriever is what the
#later cells query
compressor = LLMLinguaCompressor(model_name="openai-community/gpt2", device_map="cpu")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=pinecone_retriever #Passing pinecone_retriever in base_retriever
)

#Retrievers are runnables (they are already used that way inside RunnableParallel above),
#so query through invoke(); get_relevant_documents() is the deprecated form of this call
compressed_docs = compression_retriever.invoke(
    "What is synthetic intelligence?"
)
print(compressed_docs)

[Document(page_content="I think it's possible that physics has exploits and we should be trying to find them arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you a rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to. Like at some point, I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Kappathi, previously the director of AI at Tesla. And before that, at OpenAI and Stanford, he is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors and now to your friends. Here's Andre Kappathi. What is a neural network? And what does it seem to do such a surprisingly good job of learning? What is a neural network? It's a mathemat

In [33]:
#Testing compression_retriever with a RetrievalQA chain built from the model and the retriever.
#The cell output is a dict with the 'query' and the 'result'.
from langchain.chains import RetrievalQA

compressed_chain = RetrievalQA.from_chain_type(llm=model, retriever=compression_retriever)
compressed_chain.invoke("What is synthetic intelligence?")

{'query': 'What is synthetic intelligence?',
 'result': 'Synthetic intelligence refers to artificial intelligence systems that are designed to mimic human-like cognitive functions and behaviors. These systems are created using algorithms and computational models to perform tasks that typically require human intelligence, such as learning, problem-solving, and decision-making.'}

In [34]:
#Creating a setup for the LLMLingua retriever: `context` comes from the compression retriever
#and `question` is the input passed through unchanged
LLMLingua_setup = RunnableParallel(context=compression_retriever, question=RunnablePassthrough())
#Inspect the dict produced for a sample question
LLMLingua_setup.invoke("What is the future of AI?")

{'context': [Document(page_content="space but in the digital space it just feels like it's going to be very tricky. Very tricky to out because it seems to be pretty low cost to fake stuff. What are you going to put an AI in jail for like trying to use a fake personhood proof? I mean okay fine you'll put a lot of AI in jail but there'll be more AI's like exponentially more. The cost of creating bought is very low. Unless there's some kind of way to track accurately like you're not allowed to create any program without showing tying yourself to that program. Like any program that runs on the internet you'll be able to trace every single human program that was involved with that program. Yeah maybe you have to start declaring when you know we have to start drawing those boundaries and keeping track of okay what are digital entities versus human entities and what is ownership of human entities and digital entities and something like that. I don't know but I think I'm optimistic that this i

In [35]:
#Chaining LLMLingua_setup instead of the Pinecone setup; stored as chain2, so `chain` is left as it was
chain2 = LLMLingua_setup | prompt | model | parser

In [36]:
#Testing the chain that retrieves through the LLMLingua compression retriever
chain2.invoke("What is the future of AI?")

'The future of AI is uncertain and it is believed to be very tricky due to the ease of creating fake content. There is a concern about the potential for AI to be used for illegal activities, but there is also optimism that it is possible to track and regulate AI effectively.'

## LLM Caching with LangChain

**Cost-Saving Strategy**: Caching the results avoids generating an answer for the same question multiple times. This reduces the number of queries and hence saves cost.

In [37]:
#InMemory Caching
#Imported from langchain_core / langchain_community (both already used in this notebook):
#`langchain.globals` and `langchain.cache` are legacy import paths for the same objects.
from langchain_core.globals import set_llm_cache
from langchain_community.cache import InMemoryCache

#Register a process-wide LLM cache that lives only as long as the kernel
set_llm_cache(InMemoryCache())

chain2.invoke("What is synthetic intelligence?")

'Synthetic intelligence is described as the next stage of development, similar to artificial intelligence but with the potential to uncover and solve the puzzle of the universe.'

#### SQLite Caching

We create a SQLite database file named `.cache.db` that stores the cached data.

In [38]:
#Caching using SQLite
#SQLiteCache is imported from langchain_community (already used in this notebook);
#`langchain.cache` is a legacy import path for the same class.
from langchain_community.cache import SQLiteCache

#Replace the in-memory cache with one persisted in the .cache.db file
set_llm_cache(SQLiteCache(database_path=".cache.db"))

In [39]:
import sqlalchemy
from sqlalchemy import create_engine, text

#SQLAlchemy engine on the same .cache.db file that SQLiteCache writes to,
#used further down to inspect the cached rows directly
engine = create_engine("sqlite:///.cache.db")

In [40]:
#Testing: first time this question is asked since the SQLite cache was configured
chain2.invoke("What is the document about?")

'The document is about the challenges and complexities of working with computers, setting up developer environments, hardware, environmental variables, scripts, and automation processes. It also mentions the concept of archive as a pre-print server for academic research publishing.'

In [41]:
#Testing with the same question, character for character, as the previous cell
chain2.invoke("What is the document about?")

'The document is about the challenges and complexities of working with computers, setting up developer environments, hardware, environmental variables, scripts, and automation processes. It also mentions the concept of archive as a pre-print server for academic research publishing.'

In [42]:
#Testing with the same question but one extra space (two spaces before "document")
chain2.invoke("What is the  document about?")

'The document is about the challenges and complexities of working with computers and setting up developer environments, as well as discussing the concept of archive as a pre-print server for academic research publishing.'

**Note**: Running exactly the same question twice does not add a second entry to the cache (the stored answer is reused), but running the same question with one extra space creates another row in the database, because the prompt text no longer matches exactly. To resolve this, we can use semantic caching using [redis or gptcache](https://github.com/sugarforever/LangChain-Tutorials/blob/main/LangChain_Caching.ipynb).

In [43]:
#Checking the cache: print the column names of the full_llm_cache table in .cache.db,
#then every row stored in it
with engine.connect() as conn:
    cached_rows = conn.exec_driver_sql('select * from full_llm_cache')
    print(cached_rows.keys())
    for cached_row in cached_rows:
        print(cached_row)

RMKeyView(['prompt', 'llm', 'idx', 'response'])
('[{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "\\nAnswer the question based on t ... (1342 characters truncated) ... t about it three minutes\', metadata={\'source\': \'transcription.txt\'})]\\n\\nQuestion: What is the document about?\\n", "additional_kwargs": {}}}]', '{"lc": 1, "type": "constructor", "id": ["langchain", "chat_models", "openai", "ChatOpenAI"], "kwargs": {"temperature": 0.0, "max_tokens": 150, "opena ... (10168 characters truncated) ... ": "string"}}, "required": ["content", "tool_call_id"]}}}}], "edges": [{"source": 0, "target": 1}, {"source": 1, "target": 2}]}}---[(\'stop\', None)]', 0, '{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "output", "ChatGeneration"], "kwargs": {"message": {"lc": 1, "type": "constructor", "i ... (303 characters truncated) ... as a pre-print server for academic research publishing.", "additional_kwargs": {}}}