In [10]:
import os
import logging
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [4]:
# Setting the environment path and autheticating the OpenAI API

os.environ["OPENAI_API_KEY"] = "Your API key"

In [2]:
# Loading the data

data_loader =  WikipediaLoader(query="Sachin Tendulkar")
data = data_loader.load()
data

[Document(page_content='Sachin Tendulkar, ( ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket. Hailed as the world\'s most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively. He also holds the record for receiving the most player of the match awards in international cricket. Tendulkar was a Member of Parliament, Rajya Sabha by nomination from 2012 to 2018.\nTendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years. In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all time, behind Don Bradman, and the second

In [5]:
# Now we need to split the data 
text_splitter = CharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)
docs

Created a chunk of size 2783, which is longer than the specified 1000
Created a chunk of size 1096, which is longer than the specified 1000
Created a chunk of size 2453, which is longer than the specified 1000
Created a chunk of size 1707, which is longer than the specified 1000
Created a chunk of size 2197, which is longer than the specified 1000
Created a chunk of size 2097, which is longer than the specified 1000
Created a chunk of size 2554, which is longer than the specified 1000
Created a chunk of size 2018, which is longer than the specified 1000
Created a chunk of size 1141, which is longer than the specified 1000
Created a chunk of size 2000, which is longer than the specified 1000
Created a chunk of size 2537, which is longer than the specified 1000
Created a chunk of size 1606, which is longer than the specified 1000
Created a chunk of size 1320, which is longer than the specified 1000
Created a chunk of size 1422, which is longer than the specified 1000
Created a chunk of s

[Document(page_content='Sachin Tendulkar, ( ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket. Hailed as the world\'s most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively. He also holds the record for receiving the most player of the match awards in international cricket. Tendulkar was a Member of Parliament, Rajya Sabha by nomination from 2012 to 2018.\nTendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years. In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all time, behind Don Bradman, and the second

In [6]:
print(len(data))
print(len(docs))

25
75


In [9]:
# Embedding function used in this scenario
# Storeing the embedded vector in Chroma

embedding_function = OpenAIEmbeddings()
DB = Chroma.from_documents(docs, embedding=embedding_function, persist_directory='./Sachin_db')
DB.persist()

In [11]:
# We need the instance of LLM used for compression

llm  = ChatOpenAI(temperature=0)

In [13]:
# Now we need to compress the document (defining the compressor)
compressor = LLMChainExtractor.from_llm(llm)

# and then retrieve the result from the compressor
compressor_retriver =  ContextualCompressionRetriever(base_compressor=compressor, 
                                                      base_retriever=DB.as_retriever())


In [14]:
compressor_retriver

ContextualCompressionRetriever(base_compressor=LLMChainExtractor(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=NoOutputParser(), template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:'), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x16b6454d0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x169497c50>, temperature=0.0, openai_api_key='sk-proj-vhZsCqXIvjGZkwwbkKRYT3BlbkFJto7ELDnJAifqCspv5g1g', openai_proxy='')), get_input=<function default_get_input at 0x1117b44a0>), base_retriever=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma 

In [19]:
# we can see the difference between the ordinary retriver with help of similarity serach and compressor retriever

# Ordinary one shows its a huge document (contains both relevant and irrelavant text as per query)
docs = DB.similarity_search(query="When does Sachin scored his first century")
print(docs[0].page_content)
print(docs[1].page_content)
print(docs[2].page_content)
print(docs[3].page_content)

This page presents some of the notable achievements of Sachin Tendulkar, a former Indian cricketer, universally regarded as one of the best batsmen of all time. Debates on Tendulkar's precise rank amongst his predecessors are unlikely to conclude soon. He was the sport's first batsman to score a double century (200 runs not out) in a single One Day International match, and is so far the only player to have scored 100 centuries in internationals. He played first-class cricket for 26 years and one day, whilst his international career spanned exactly 24 years from 15 November 1989 to 16 November 2013.


== Honours ==
Sachin Tendulkar is a retired Indian cricketer who is widely acknowledged as one of the greatest batsmen of all time, he is the most prolific run-scorer in international cricket. Tendulkar has scored the highest number of centuries (100 or more runs) in Test matches and One Day International (ODI) matches organised by the International Cricket Council. His total of 51 centuri

In [20]:
compressed_docs = compressor_retriver.get_relevant_documents(query="When does Sachin scored his first century")
compressed_docs



[Document(page_content='After making his Test debut in 1989, Tendulkar achieved his first century against England at Old Trafford, Manchester in 1990; he made 119 not out.', metadata={'source': 'https://en.wikipedia.org/wiki/List_of_international_cricket_centuries_by_Sachin_Tendulkar', 'summary': "Sachin Tendulkar is a retired Indian cricketer who is widely acknowledged as one of the greatest batsmen of all time, he is the most prolific run-scorer in international cricket. Tendulkar has scored the highest number of centuries (100 or more runs) in Test matches and One Day International (ODI) matches organised by the International Cricket Council. His total of 51 centuries in Test matches is a world record for highest number of centuries by a batsman and his 49 centuries in ODI matches are the second highest number of centuries after Virat Kohli. He became the first and only cricketer to score 100 international centuries when he made 114 against Bangladesh in March 2012.\nAfter making hi

In [27]:
# Precise information can be found on page_content

compressed_docs[0].page_content

'After making his Test debut in 1989, Tendulkar achieved his first century against England at Old Trafford, Manchester in 1990; he made 119 not out.'

In [29]:
# We can also find the summary for the compression
compressed_docs[0].metadata

{'source': 'https://en.wikipedia.org/wiki/List_of_international_cricket_centuries_by_Sachin_Tendulkar',
 'summary': "Sachin Tendulkar is a retired Indian cricketer who is widely acknowledged as one of the greatest batsmen of all time, he is the most prolific run-scorer in international cricket. Tendulkar has scored the highest number of centuries (100 or more runs) in Test matches and One Day International (ODI) matches organised by the International Cricket Council. His total of 51 centuries in Test matches is a world record for highest number of centuries by a batsman and his 49 centuries in ODI matches are the second highest number of centuries after Virat Kohli. He became the first and only cricketer to score 100 international centuries when he made 114 against Bangladesh in March 2012.\nAfter making his Test debut in 1989, Tendulkar achieved his first century against England at Old Trafford, Manchester in 1990; he made 119 not out. In Test matches, Tendulkar has scored centuries a

In [30]:
compressed_docs[0].metadata['summary']

"Sachin Tendulkar is a retired Indian cricketer who is widely acknowledged as one of the greatest batsmen of all time, he is the most prolific run-scorer in international cricket. Tendulkar has scored the highest number of centuries (100 or more runs) in Test matches and One Day International (ODI) matches organised by the International Cricket Council. His total of 51 centuries in Test matches is a world record for highest number of centuries by a batsman and his 49 centuries in ODI matches are the second highest number of centuries after Virat Kohli. He became the first and only cricketer to score 100 international centuries when he made 114 against Bangladesh in March 2012.\nAfter making his Test debut in 1989, Tendulkar achieved his first century against England at Old Trafford, Manchester in 1990; he made 119 not out. In Test matches, Tendulkar has scored centuries against all the Test cricket playing nations, and is the second batsman to score 150 against each of them. He has sco