<a href="https://colab.research.google.com/github/nandyc/langchain-rag-basics/blob/main/langchain_rag_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the example repository; it contains the PDF under ./data that a later cell loads.
!git clone https://github.com/nandyc/langchain-rag-basics.git

Cloning into 'langchain-rag-basics'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 20 (delta 6), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (20/20), 162.29 KiB | 3.25 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [2]:
# Work from inside the cloned repository so the relative ./data path used later resolves.
%cd langchain-rag-basics

/content/langchain-rag-basics


In [9]:
%%capture
### Install required modules and set the envvar for Gemini API Key
!pip install pypdf
!pip install chromadb
!pip install google.generativeai
!pip install langchain-google-genai
!pip install langchain
!pip install langchain_community

In [4]:
#Import Python modules
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma
from google.colab import userdata

In [5]:
# The Gemini API key is looked up at runtime through google.colab's `userdata`
# helper under the name GEMINI_APIKEY, so the key itself never appears in the notebook.
GOOGLE_API_KEY = userdata.get("GEMINI_APIKEY")

In [6]:
# Instantiate the two Gemini-backed models used below: an embedding model for
# vectorising the PDF chunks and a chat model that writes the answers.
# Both are given the same API key.
# NOTE(review): the model names are hard-coded; confirm they are still available.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=GOOGLE_API_KEY,
)

In [25]:
# Display the embedding client's repr as a quick check of how it was configured.
embeddings

GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x79d0bc19b8e0>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [11]:
# Set up PDF ingestion: a splitter that cuts text at "." into chunks aiming for
# 250 characters (measured with len) with a 50-character overlap, and a loader
# pointed at the paper stored in the repository's data folder.
# NOTE(review): "." also matches periods that are not sentence ends (e.g. the
# "cs.HC" tag in the arXiv header), and a span containing no "." can come out
# longer than chunk_size — confirm this is acceptable for retrieval quality.
text_splitter = CharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=50,
    separator=".",
    is_separator_regex=False,
    length_function=len,
)
loader = PyPDFLoader(
    "./data/Reimagining Data Visualization to Address Sustainability Goals.pdf"
)

In [12]:
# Read the PDF into documents, then cut them into chunks with `text_splitter`.
# Written as explicit load() + split_documents() because the loader's
# load_and_split() convenience method is documented by LangChain as one that
# should be considered deprecated; the result is the same list of chunks.
pages = text_splitter.split_documents(loader.load())



In [26]:
# Number of chunks produced by the splitter (sanity check before embedding).
len(pages)

163

In [27]:
# Preview only the first few chunks; echoing the whole `pages` list floods the
# cell output and bloats the saved notebook.
pages[:3]

[Document(metadata={'source': './data/Reimagining Data Visualization to Address Sustainability Goals.pdf', 'page': 0}, page_content='arXiv:2409.03611v1  [cs'),
 Document(metadata={'source': './data/Reimagining Data Visualization to Address Sustainability Goals.pdf', 'page': 0}, page_content='HC]  5 Sep 2024Reimagining Data Visualization to Address Sustainability Goals\nNarges Mahyar *\nManning College of Information and Computer Sciences, Univ ersity of Massachusetts Amherst\nABSTRACT\nInformation visualization holds signiﬁcant potential to s upport sus-\ntainability goals such as environmental stewardship, and c limate\nresilience by transforming complex data into accessible vi sual for-\nmats that enhance public understanding of complex climate c hange\ndata and drive actionable insights'),
 Document(metadata={'source': './data/Reimagining Data Visualization to Address Sustainability Goals.pdf', 'page': 0}, page_content='While the ﬁeld has predo mi-\nnantly focused on analytical orie

In [13]:
# Embed every chunk with the Gemini embedding model and index the vectors in Chroma.
# NOTE(review): no collection name or persist directory is passed here; check whether
# re-running this cell in the same session adds the chunks a second time.
vectordb = Chroma.from_documents(
    documents=pages,
    embedding=embeddings,
)

In [14]:
# Expose the Chroma index as a retriever; k=5 requests the top five chunks per query.
retriever = vectordb.as_retriever(
    search_kwargs=dict(k=5),
)

In [15]:
# Prompt text for the answering step. It has two placeholders: {context} for the
# retrieved chunks and {input} for the user's question. The leading and trailing
# newlines are part of the prompt and are kept as-is.
template = (
    "\n"
    "You are a helpful AI assistant.\n"
    "Answer based on the context provided.\n"
    "context: {context}\n"
    "input: {input}\n"
    "answer:\n"
)

In [16]:
# Assemble the RAG pipeline in three steps:
#   1. turn the template string into a prompt object,
#   2. build the chain that passes documents plus the question to the LLM via that prompt,
#   3. put the retriever in front of it so documents are fetched per question.
prompt = PromptTemplate.from_template(template)
combine_docs_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [23]:
# Run the whole pipeline for one question. The result is indexed by key below;
# the generated reply is stored under "answer".
response = retrieval_chain.invoke(
    {"input": "what is NIMBY "},
)

In [24]:
# Print only the generated answer text from the chain's response.
print(response["answer"])

NIMBY is an acronym that stands for "Not In My Backyard." It refers to a common phenomenon in urban planning where residents oppose new developments or changes that they perceive as negative, even if they may benefit the community as a whole.
