<a href="https://colab.research.google.com/github/pinilDissanayaka/American-History-RAG-Application/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -r requirements.txt

Collecting langchain (from -r /content/requirements.txt (line 1))
  Downloading langchain-0.2.10-py3-none-any.whl (990 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.0/990.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community (from -r /content/requirements.txt (line 2))
  Downloading langchain_community-0.2.9-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_google_genai (from -r /content/requirements.txt (line 3))
  Downloading langchain_google_genai-1.0.8-py3-none-any.whl (38 kB)
Collecting sentence-transformers (from -r /content/requirements.txt (line 4))
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting langchain_core (from -r /content/requirements.txt (lin

In [41]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as PineconeStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from google.colab import userdata

In [10]:
os.environ['PINECONE_API_KEY']=userdata.get('PINECORN_API_KEY')
os.environ['GROQ_API_KEY']=userdata.get('GROQ_API_KEY')

In [11]:
INDEX_NAME="american-history"
DIMENSIONS=512

In [13]:
pinecone=Pinecone(api_key=os.environ['PINECONE_API_KEY'])

if not INDEX_NAME in pinecone.list_indexes().names():
  pinecone.create_index(
    name=INDEX_NAME,
    dimension=DIMENSIONS,
    metric="euclidean",
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)
else:
  print("Index already exists")

Index already exists


In [14]:
pinecone.list_indexes()

{'indexes': [{'dimension': 512,
              'host': 'american-history-4myrn7y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'american-history',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 384,
              'host': 'pdf-rag-4myrn7y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'pdf-rag',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [15]:
llm = ChatGroq(
    temperature=0.8,
    model="llama3-70b-8192"
)

In [16]:
loader=PyPDFLoader("/content/history_outline.pdf")

docs=loader.load()

In [17]:
docs[10]

Document(metadata={'source': '/content/history_outline.pdf', 'page': 10}, page_content='OUTLINE OF U.S. HISTORY9\ngroups and strong evidence exists \nthat neighboring tribes maintained \nextensive and formal relations — \nboth friendly and hostile  .\nTHE FIRST EUROPEANS\nThe first Europeans to arrive in \nNorth America — at least the first \nfor whom there is solid evidence \n— were Norse, traveling west from \nGreenland, where Erik the Red \nhad founded a settlement around \nthe year 985  . In 1001 his son Leif is \nthought to have explored the north -\neast coast of what is now Canada and \nspent at least one winter there  .\nWhile Norse sagas suggest that \nViking sailors explored the Atlan -\ntic coast of North America down \nas far as the Bahamas, such claims \nremain unproven  . In 1963, however, \nthe ruins of some Norse houses dat -\ning from that era were discovered at \nL’Anse-aux-Meadows in northern \nNewfoundland, thus supporting at \nleast some of the saga claims  .\nIn 1

In [18]:
textSplitter=RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=1000,
    length_function=len
)

textChunks=textSplitter.split_documents(docs)

textChunks[10]

Document(metadata={'source': '/content/history_outline.pdf', 'page': 12}, page_content='OUTLINE OF U.S. HISTORY11\nof a few hundred English colonists \nto a flood of millions of newcomers  . \nImpelled by powerful and diverse \nmotivations, they built a new civi -\nlization on the northern part of the \ncontinent  .\nThe first English immigrants \nto what is now the United States \ncrossed the Atlantic long after thriv -\ning Spanish colonies had been estab -\nlished in Mexico, the West Indies, \nand South America  . Like all early \ntravelers to the New World, they \ncame in small, overcrowded ships  . \nDuring their six- to 12-week voy -\nages, they lived on meager rations  . \nMany died of disease, ships were \noften battered by storms, and some \nwere lost at sea  .\nMost European emigrants left \ntheir homelands to escape political \noppression, to seek the freedom to \npractice their religion, or to find op -\nportunities denied them at home  . \nBetween 1620 and 1635, economic \

In [19]:
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/distiluse-base-multilingual-cased-v1",
                                 model_kwargs=model_kwargs,
                                 encode_kwargs=encode_kwargs)

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [21]:
vectorstore=PineconeStore.from_documents(
    textChunks,
    embeddings,
    index_name=INDEX_NAME
)

In [31]:
vectorstoreRetriever=vectorstore.as_retriever(search_kwargs={"k": 5})

In [30]:
keywordRetriever=BM25Retriever.from_documents(textChunks, k=3)

In [33]:
retriever=EnsembleRetriever(retrievers=[vectorstoreRetriever, keywordRetriever],
                            weights=[0.8, 0.2])

In [48]:
promptTemplate = """Given the following context and a question, generate an answer based on this context only.
In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

CONTEXT: {context}

QUESTION: {question}"""

prompt = PromptTemplate.from_template(promptTemplate)

In [49]:
outputParser = StrOutputParser()

In [50]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | outputParser
)

In [51]:
print(chain.invoke('When France’s King Louis XVI had sent to America an expe ditionary force of 6,000 men under the Comte Jean de Rochambeau ?'))

In July 1780, France's King Louis XVI had sent to America an expeditionary force of 6,000 men under the Comte Jean de Rochambeau.
