# Similarity search

The `BaseRetriever` class in LangChain is defined as follows:

In [1]:
from abc import ABC, abstractmethod
from typing import List
from langchain.schema import Document

class BaseRetriever(ABC):
    """Abstract retriever interface: map a query string to a list of documents.

    The single method is abstract, so a concrete subclass has to implement
    `get_relevant_documents` before it can be instantiated.
    """

    @abstractmethod
    def get_relevant_documents(self, query: str) -> List[Document]:
        """Get texts relevant for a query.

        Args:
            query: string to find relevant texts for

        Returns:
            List of relevant documents
        """

In [2]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
# Create a loader for the speech transcript, decoding the file as UTF-8.
loader = TextLoader('state_of_the_union.txt', encoding='utf8')

In [3]:
from langchain.indexes import VectorstoreIndexCreator
# Build an index from the loader using VectorstoreIndexCreator with no arguments (its defaults).
index = VectorstoreIndexCreator().from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


In [4]:
# Inspect the vector store held by the index wrapper.
index.vectorstore

<langchain.vectorstores.chroma.Chroma at 0x198facdadc0>

In [5]:
# Wrap the vector store in a retriever; its repr shows the search_type and search_kwargs in use.
index.vectorstore.as_retriever()

VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x00000198FACDADC0>, search_type='similarity', search_kwargs={})

# Walkthrough

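The cells below reproduce the logic of `VectorstoreIndexCreator` step by step: load the document, split it into chunks, embed the chunks, and store them in a vector store.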

In [74]:
# Step 1: load the raw document(s) from `loader`.
document = loader.load()

In [76]:
# Peek at the text of the first loaded document.
document[0].page_content

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\n\nLast year COVID-19 kept us apart. This year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined.\n\nHe met the Ukrainian people.\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n\nGroups of citizens blocking tanks with their bodi

In [77]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Step 2: split the loaded document into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(document)

In [78]:
# Show the first chunk produced by the splitter.
texts[0]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\n\nLast year COVID-19 kept us apart. This year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined.\n\nHe met the Ukrainian people.\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source': 'state_o

In [79]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Step 3: create the embedding model used to vectorise the chunks.
embeddings = OpenAIEmbeddings()

# Step 4: build a Chroma vector store from the chunks and the embedding model.
db = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [145]:
# Display the Chroma vector store object built from the chunks.
db

<langchain.vectorstores.chroma.Chroma at 0x21973706cd0>

In [83]:
# Step 5: expose the vector store through the retriever interface.
retriever = db.as_retriever()

In [84]:
# Show the retriever's configuration (search type and kwargs appear in its repr).
retriever

VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000021973706CD0>, search_type='similarity', search_kwargs={})

In [19]:
# Build a question-answering chain from an OpenAI LLM and the retriever, using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

In [20]:
# Ask a question about the speech; the chain's answer is the cell output.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

"The President said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, and a consensus builder that has received support from the Fraternal Order of Police and former judges appointed by Democrats and Republicans."

In [143]:
# Spell out the three components explicitly: the vector store class, the embedding
# model, and the text splitter (the same pieces used in the manual walkthrough).
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=OpenAIEmbeddings(),
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ),
)

In [144]:
# Build the index with the explicitly configured creator and inspect its vector store.
index_creator.from_loaders([loader]).vectorstore

Using embedded DuckDB without persistence: data will be transient


<langchain.vectorstores.chroma.Chroma at 0x21972cdfeb0>

### Try with Pinecone

In [15]:
import pinecone 
import os
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm

# Initialise the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable (find it at app.pinecone.io); the environment name is shown
# next to the API key in the console.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment="us-east4-gcp",
)

# Open the "test" index and display its statistics.
index_name = "test"
# NOTE(review): this rebinds `index`, which earlier held the VectorstoreIndexCreator result.
index = pinecone.Index(index_name)
index_stats_response = index.describe_index_stats()
index_stats_response

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 924}},
 'total_vector_count': 924}

### PDF file: Pinecone

In [19]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [18]:
import os
from pathlib import Path

import pinecone
from langchain.vectorstores import Pinecone
from tqdm.autonotebook import tqdm

# Build the PDF path from its components so the cell is not tied to Windows path
# separators; on Windows this still renders as ..\..\Docs\PDFs\2022-Annual-Review.pdf.
pdf_path = Path("..", "..", "Docs", "PDFs", "2022-Annual-Review.pdf")

# Load the PDF and split it into chunks (chunk_size=1000, chunk_overlap=200).
loader = PyMuPDFLoader(str(pdf_path))
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()

# initialize pinecone
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment="us-east4-gcp",  # next to api key in console
)

index_name = "test"

# Create a Pinecone-backed vector store from the chunks, under the "annual_report"
# namespace of the index.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name, namespace="annual_report")

In [17]:
# Retrieve the chunks most similar to the question from the vector store.
# (The query previously contained the typo "prifit".)
query = "What is the profit margin?"
search_docs = docsearch.similarity_search(query)
search_docs

[Document(page_content="customers’ future demand. We place non-cancellable inventory orders for certain products in advance of our historical \nlead times, pay premiums and provide deposits to secure future supply and capacity. We also adjust to other market \nfactors, such as product offerings and pricing actions by our competitors, new product transitions, and macroeconomic \nconditions - all of which may impact demand for our products.\nRefer to the Gross Profit and Gross Margin discussion below in this Management's Discussion and Analysis for further \ndiscussion.\nRevenue Recognition\nWe derive our revenue from product sales, including hardware and systems, license and development arrangements, \nsoftware licensing, and cloud services. We determine revenue recognition through the following steps: (1) identification \nof the contract with a customer; (2) identification of the performance obligations in the contract; (3) determination of the", metadata={'author': '', 'creationDate':

In [22]:
# Rebind `retriever` and `qa` so the QA chain now reads from `docsearch` instead of the earlier Chroma store.
retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

In [23]:
# Ask the QA chain about the annual report.
# (The query previously contained the typo "margine".)
query = "What is the profit margin?"
qa.run(query)

' The gross margin of our Graphics segment increased during fiscal year 2022 when compared to fiscal year 2021, primarily due to higher-end mix within GeForce GPUs. The gross margin of our Compute & Networking segment decreased during fiscal year 2022 when compared to fiscal year 2021, primarily due to a shift in product mix and partially offset by a reduced contribution from Automotive solutions. As a result, the overall net effect on our gross margin was an unfavorable impact of 0.9% in fiscal year 2022 and insignificant in fiscal year 2021.'

## From Mayo's course: (23:00)

Convert texts to vectors

In [113]:
# Pull the raw text out of every chunk, then embed all of the texts in one call.
# NOTE(review): `texts` now holds plain strings, not the Document chunks it held earlier.
texts = [chunk.page_content for chunk in docs]
vectors = embeddings.embed_documents(texts)

In [114]:
# How many embedding vectors were returned.
len(vectors)

924

In [115]:
# How many document chunks there are, for comparison with the vector count.
len(docs)

924

In [116]:
# Length (dimensionality) of a single embedding vector.
len(vectors[0])

1536

In [121]:
# Inspect the first chunk, including its metadata.
docs[0]

Document(page_content='2022\nNVIDIA CORPORATION \nANNUAL REVIEW\nNOTICE OF ANNUAL MEETING\nPROXY STATEMENT \nFORM 10-K', metadata={'source': '..\\..\\Docs\\PDFs\\2022-Annual-Review.pdf', 'file_path': '..\\..\\Docs\\PDFs\\2022-Annual-Review.pdf', 'page_number': 1, 'total_pages': 207, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 17.2 (Windows)', 'producer': 'Adobe PDF Library 16.0.7', 'creationDate': "D:20220419103350-07'00'", 'modDate': "D:20220419114447-07'00'", 'trapped': ''})

## Same thing with a CSV file

In [146]:
from langchain.document_loaders import CSVLoader
from pathlib import Path

from langchain.vectorstores import Chroma

# Build the CSV path from its components so the cell is not tied to Windows path
# separators; on Windows this still renders as ..\..\Docs\CSV_files\nvda_data.csv.
csv_path = Path("..", "..", "Docs", "CSV_files", "nvda_data.csv")

# Load the CSV and split it into small chunks (chunk_size=100, chunk_overlap=20).
loader = CSVLoader(str(csv_path))
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()

# Store the chunks in a Chroma vector store.
docsearch = Chroma.from_documents(docs, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [138]:
# Retrieve the chunks from `docsearch` that are most similar to the question.
query = "Do you see any pattern (Relationship with Adj Close) on this data?"
searchDocs = docsearch.similarity_search(query)

In [139]:
# Show the retrieved chunks.
searchDocs

[Document(page_content='Close: 162.6999969482422\nAdj Close: 162.63076782226562\nVolume: 16793400.0', metadata={'source': '..\\..\\Docs\\CSV_files\\nvda_data.csv', 'row': 4}),
 Document(page_content='Close: 153.1699981689453\nAdj Close: 153.10482788085938\nVolume: 40473900.0', metadata={'source': '..\\..\\Docs\\CSV_files\\nvda_data.csv', 'row': 1}),
 Document(page_content='Close: 167.64999389648438\nAdj Close: 167.62152099609375\nVolume: 45293200.0', metadata={'source': '..\\..\\Docs\\CSV_files\\nvda_data.csv', 'row': 40}),
 Document(page_content='Close: 153.38999938964844\nAdj Close: 153.36395263671875\nVolume: 56504500.0\nChange: -11.6199951171875', metadata={'source': '..\\..\\Docs\\CSV_files\\nvda_data.csv', 'row': 23})]

In [148]:
# Question to put to the QA chain about the CSV data.
query = "Do you see any pattern on price movements?"

# Build a retriever over `docsearch` and a "stuff" QA chain on top of it, then run the query.
retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
)
qa.run(query)

" No, I don't see any pattern in the price movements."

## Use DirectoryLoader to load all the documents inside the directory

## `UnstructuredURLLoader` can load the news articles I get from the financial API:

In [27]:
from langchain.document_loaders import UnstructuredURLLoader

# Article URL(s) to load; add more entries to load several pages at once.
urls = [
    "https://finance.yahoo.com/news/nvidia-nvda-outpaces-stock-market-214509412.html",
]
loader = UnstructuredURLLoader(urls=urls)

In [28]:
# Load the configured URL(s) into a list of Document objects.
data = loader.load()

In [29]:
# Display the loaded documents.
data

[Document(page_content="Zacks\n\nNvidia (NVDA) Outpaces Stock Market Gains: What You Should Know\n\nOops!Something went wrong.Please try again later.More content below\n\nNVDA\n\n\n\nZacks Equity Research\n\n3 min read\n\nIn this article:\n\nOops!Something went wrong.Please try again later.More content below\n\nNVDAWatchlist\n\nNvidia (NVDA) closed the most recent trading day at $270.02, moving +0.91% from the previous trading session. This move outpaced the S&P 500's daily gain of 0.33%. Meanwhile, the Dow gained 0.3%, and the Nasdaq, a tech-heavy index, added 1.92%.\n\nHeading into today, shares of the maker of graphics chips for gaming and artificial intelligence had gained 4.02% over the past month, lagging the Computer and Technology sector's gain of 7.45% and the S&P 500's gain of 5.67% in that time.\n\nWall Street will be looking for positivity from Nvidia as it approaches its next earnings report date. The company is expected to report EPS of $0.92, down 32.35% from the prior-y