# RAG Using LangChain & SingleStore

### Install libraries & dependencies

In [8]:
!pip install langchain --quiet
!pip install --upgrade openai==0.28.1 --quiet
!pip install pdf2image --quiet
!pip install pdfminer.six --quiet
!pip install singlestoredb --quiet
!pip install tiktoken --quiet
!pip install --upgrade unstructured==0.10.14 --quiet
!pip install -qU pypdf langchain_community

### Import the libraries

In [12]:
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
import os

### Load your custom document

In [None]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "https://unctad.org/system/files/official-document/wesp2023_en.pdf"
loader = PyPDFLoader(file_path)

data = loader.load()

### Split the document into chunks

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000, chunk_overlap = 0)
texts = text_splitter.split_documents(data)

print(f"You have {len(texts)} pages")

You have 178 document(s) in your data
There are 44 characters in your document
You have 379 pages


### Useing OpenAI API to generate embeddings for the document chunks

In [22]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key: ")

OpenAI API Key:  ········


### Let's store our document embeddings into SingleStore database

In [23]:
from langchain.embeddings import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

#from langchain.vectorstores.singlestoredb as s2
from langchain.vectorstores import SingleStoreDB
#from langchain.vectorstores.utils import DistanceStrategy

#s2.ORDERING_DIRECTIVE["DOT_PRODUCT"] = s2.ORDERING_DIRECTIVE[DistanceStrategy.DOT_PRODUCT]

docsearch = SingleStoreDB.from_documents(
    texts,
    embedding,
    table_name = "pdf_wes",
    #distance_strategy = "DOT_PRODUCT"
)

  embedding = OpenAIEmbeddings()


### Let us check the text chunks and associated embeddings stored inside our database

In [25]:
%%sql
select * from pdf_wes limit 1;

id,content,vector,metadata
1125899906842631,IMF International Monetary Fund NAIRU Non-accelerating inflation rate of unemployment ODA Official development assistance OECD Organisation for Economic Co-operation and Development OPEC Plus Organization of the Petroleum Exporting Countries Plus PPP purchasing power parity SDGs Sustainable Development Goals SDRs Special drawing rights TRIPS Trade-related aspects of intellectual property rights UNCTAD United Nations Conference on Trade and Development UN DESA United Nations Department of Economic and Social Affairs UNWTO United Nations World Tourism Organization WHO World Health Organization,"b'\xa5~\\\xbcQe0\xbc\xe6g\xdb<3\x89\xa9\xbb\xc3\xf7e\xbc\x94\xfb#\xba\xd0P4\xbc\xfd\x8d\x82<\x97\xea\x96\xbcr\x0f\xab\xbc\xa7\x86\xd5;\xdcF\x85\xb8Tu\xa2\xbc\x9c<\x07=\xfd\xae\x81\xbbqI\xb0\xbc\xf8\x1a\x93\xbcoA\xb7;\xfb\x85\t\xbd\xdc\x04\x07<\x83q\xe4\xbc\xb3[\'<\xbe!\xf9:\x12\x84*<3\xaa\xa8;\x88\x05S;\x8f\xe38=\xb8\x10\x15=\xee,\xbb;xi\x94:\x8d\xdb?=]:\x02\xbd-P\xbf\xbc\x89\xaaN\xbcP\x023\xbcs\xb4\xa6;}\x17{<;\x0c\x8b\xbct\xd5\xa5\xbc\xb1\xf0\xb0\xbc\xf5\n\xa1\xba\xdcg\x04\xbc\xd4\xe4""<\xd6\r\x1b=\xbd\x9d|\xbbi\xe7\xcd\xbb\xeb^\xc7;DN\xe0\x91\xf9\xbb\xd2\x9a\xab<\xcc\xfeC\xbb9F\x10\xbc\x19 \x92\xbc\xf9\\\x11\xbc>\xb2\xf8\xbc~Yy<\xc0k\xf0\x8c<\xb5B!=O~\xb6\xbc\x8b\xf4\xc5\xbbzP\x0e\xbbu\xdd\x1e\xbc\x00\x13\xf1\xbb\x03\xe1d\xbb\\\xd7\x04\xbcM\x13@<\x97\xa8\x98:\x9d#\x81<\xee,;\xbc\xb9\xb5\x10=\xfc\xe8\x06=\x1bj\t\xbc\xf3\xc0)\xbb\xc3\xf7\xe5\xbb/y\xb7\xbc~zx\xbc\x05\n\xdd\xbc\xb9\xb5\x90<\xfa\xe0\r\xbd\xdb\xc2\x08\xbc\xb9\x94\x11\xbdy\x0e\x10=*\x82K<\xbd|};i\xe7M\xbc\t;N\xbc\xb4\x00\xa3\xbc\x0e\x11;\xbc+i\xc5\x91\xf9:\xab\xb7F<8\xc2\x93\xbc\x01\xfa\xea\xba\xb3:(<\x18\xde\x93\xbc\xbecw\xbb<\xf3\x04\xba\xc7\x8bT;\x97\xea\x96\x91y\xbc=\x98\x80<\x1c\xee\x05=Xd\x15\xbc\x91\x0c\xb1;\xdd\xeb\x809\x86\xdcZ<*\xc4\xc9\xba\x1c\xee\x05\xbcA\xa1\xeb9y/\x8f\xbcXC\x96:N\xb8;\xbb\xfe+\xf7\xbc\xa9m\xcf\xba\x15\xef\xa0\xbbv\x1f\x9d<\xd1\xd40<\xd5\x89\x9e\xbc8_\x96;\xc1\xef\xec<}\xb4}<\x16\xb5\x9b;\xc6\xe6\xd841`1\xbcP\xe13\xbc\xfc#~=\x8ap\xc9<\xf2<-\xbc\xef\x8f\xb8<\xc8\x0fQ\xbb\x86?X\xbc\xb5\xa5\x1e\xbc\x82\xed\xe7\xbc\x01\x97m;\xbd(\x00;=p\xfa;E\xb1]:\x10\xf8\xb4<\x1bI\n<\xb4\x00#<\xb81\x94\xbb\xe5\xe3^;H=\xd3\xbc\x12B,\x91\xf9\xbasr\xa8\xbc\x1b\x8b\x88\xbaI$M<:\xa9\r\xbcQe0=[2\t\xbd\xba{\x0b\xbdU\x96\xa1<\xba\x18\x8e<\xf1\xb8\xb0;3\xec\xa6;\xcd\x06=;v@\x9c<\xfa""\x0c\xbcG\x1c\xd4\xbcV;\x9d\xb9W\xe0\x98;\xca\xf6J\xbc\xc4\x9ca\xbb\x16R\x9e;\x8a\rL\xbc\x9c<\x87\xb2\xf8\xbc\xbd|}\xbc\x004p<}`\x80\xbc4\x91\xa2<\xc2\x94h\xbc\xb3[\'<\xfdD\xfd;\x88hP\xba\x08\xb7\xd1<\xdd\x0c\x80\xbb\x12!\xad\xbb\x00\x13q<\x00U\xef<1`1<=O\xfb;{\x16\x89\xbb\x80\xe5n<\x00Uo\xbbJE\xcc\xbc\xd4\xe4""\xbc\xc5\xff\xde;P#\xb2;\xd0\x922\xbazP\x8e;\xbb\xbd\t\xbc\x18\x9c\x95\xbc\x80\xe5n<\xf3\x02\xa8\xbc\x97\xea\x16n<\x01\xd9k\xbc\xd3\xdc\xa9;B\xe3\xe9;@\xdbp\x8c;\xbd\xc5\x02=>\xf4v\xb9\xb3:(\xbd\x84\x16\xe09\x11\x00.<\x0b""H\xbc\xfb\xc7\x87<\xd3`&<\xfc#~<\x0e\xf0;\xbd\xb6\x08\x9cr0\xaa\xbcW\x9e\x9a<\x90\xeb\xb1<}\x17\xfb\xbc\x08\x96\xd2<2&,=\xe7NU\xbc\x8c\x99\xc1;\xe95\xcf;\xb5\x84\x1f\xbd66\x1e=\xf7u\x97\xbc\xbf\xc6\xf4;\xd8\x15\x94\xbc\x91N\xaf\xbcK\x0bG\xbd\x8aO\xca\xbcDN\xe0\xbc\x1b\x8b\x08=\x17\x18\x19\xbc:\xeb\x0b\xbc\xfc\xe1\x7f\xbc\xfbd\x8a\xbb\xf4\xa7#<\x9a\x13\x0f;\x8b\x15\xc5\xbb]|\x00\xd3\xf7\xbc\x0e\xcf<:\xec\xe2C\xbc\x80\xe5n\xbbmZ\xbd=\xb6\xe7\x1c<\xd3`\xa6;\xfb\x85\t\xbcTT#\xbc\xfeLv\xba\x8fF\xb6;\xca\xd5\xcb<\x9c\xc0\x03\xbd\x80a\xf2;\x8b\xf4E\xbc;\x90\x07<\xb0\xae2\xbch\x84P\xbaV\x1a\x9e;\x91N/\xbc>\xf4v\xbc\xc4\x9ca;\xb0\xcf1<\x87\xa2U<<\xcb\xfe\xbcA\xa1\xeb\xbc\xd7p\x18;\xbf\xe7\xf3\xbc\xf4\xc8\xa2<\x03\x02\xe4\xbb\xf2\x9f\xaa;\xdc\x04\x07\x91\xf9\xbbH\xa0P\x8c;\x82\x0e\xe7\xbc\xba\x18\x8e\xbc%\xcd\xdd\xbd\xeb\xa0\xc5\xbc\x95^\xa1<\xa7eV<\xca\xf6\xca\xba\xc1\xce\xed\xb2\xf8\xbcn{\xbc;\xd9\x99\x10:\x9c\x9f\x04;\x1b(\x8b;\n\x01\xc9;\xbcA\x86\xbbj\x8cI\xbb\x10\xf8\xb4;S\xd0\xa6<\x13)\xa6\xbc\x04\x86`<\xa8\xe9R\xbc}?\x81\xbd\xbf\xc6\xf4  {'page': 5, 'source': 'https://unctad.org/system/files/official-document/wesp2023_en.pdf'}",


### Ask a query against your custom data (the pdf that you loaded) using just similarity search to retrieve the top k closest content

In [26]:
query = "What India's GDP growth is projected to be?"
docs = docsearch.similarity_search(query)
print(docs[0].page_content)

99cHAPtER  iii . REGion Al DEvEloPmEnt S AnD outlooK
South Asia: A challenging road 
ahead amid global headwinds
 ● South Asia’s outlook has deteriorated 
amid challenging domestic and 
global conditions.
 ● Rising global food and energy prices are 
intensifying pressure on food security 
and undermining progress on the SDGs.
 ● The economic impact of the conflict 
in Ukraine is exacerbating existing 
vulnerabilities across the region.
The outlook for South Asia has deteriorated and 
is subject to multiple downside risks amid global 
monetary tightening, fiscal vulnerabilities, rising 
inflation and extreme weather events. Regional 
GDP growth is expected to slow to 4.8 per cent 
in 2023 from an estimated 5.6 per cent expansion 
in 2022. Overall, weaker global demand, tighter 
monetary policy, additional supply disruptions, 
further escalation in commodity prices and 
the emergence of new COVID-19 variants pose 
significant risks in 2023.
India’s GDP growth rate is projected to moderat

### Here is the augmented response to the user query

In [29]:
import openai

prompt = f"The user asked: {query}. The most similar text from the document is: {docs[0].page_content}"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)

print(response['choices'][0]['message']['content'])

India's GDP growth rate is projected to moderate to 5.8 per cent in 2023 from an estimated 6.4 per cent in 2022 as higher interest rates and a global economic slowdown will weigh on investment and export performance.


### Let’s test when knowledge base (custom documents like pdf) is not provided

In [30]:
from langchain.llms import OpenAI
llm = OpenAI(temperature=0.8)

  llm = OpenAI(temperature=0.8)


In [32]:
llm.predict("What India's GDP growth is projected to be in 2024?")

"\n\nIt is difficult to accurately predict India's GDP growth in 2024 as it is dependent on various factors such as economic policies, global trends, and domestic and international events. However, according to the Economic Survey 2020-2021, India's real GDP is projected to grow at a rate of 11% in the financial year 2021-2022 and return to its pre-pandemic growth trajectory of 6-7% in the following years. Therefore, it is possible that India's GDP growth in 2024 may be around 6-7%, but it could vary depending on future developments."