<a href="https://colab.research.google.com/github/pinilDissanayaka/American-History-RAG-Application/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as PineconeStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from google.colab import userdata

In [28]:
# Copy the API keys out of Colab's secret store into process environment
# variables so the client libraries used below can read them.
# NOTE(review): the Colab secret is looked up as 'PINECORN_API_KEY' (sic) —
# confirm that spelling matches the name stored in the Colab secrets panel.
os.environ.update({
    'PINECONE_API_KEY': userdata.get('PINECORN_API_KEY'),
    'GROQ_API_KEY': userdata.get('GROQ_API_KEY'),
})

In [8]:
# Pinecone index configuration shared by the cells below.
INDEX_NAME: str = "american-history"  # name of the Pinecone index to create / reuse
DIMENSIONS: int = 512                 # vector size the index is created with

In [9]:
# Connect to Pinecone. The key is read from PINECONE_API_KEY, which is the
# environment variable the secrets cell actually exports; reading any other
# name raises KeyError on a fresh kernel.
pinecone = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Create the serverless index only when it does not exist yet, so this cell
# can be re-run without failing on an already-provisioned index.
if INDEX_NAME not in pinecone.list_indexes().names():
  pinecone.create_index(
    name=INDEX_NAME,
    dimension=DIMENSIONS,
    metric="euclidean",
    spec=ServerlessSpec(
      cloud='aws',
      region='us-east-1'
    )
  )
else:
  print("Index already exists")

In [10]:
# Show the indexes visible to this Pinecone client (cell output) to confirm
# that the index configured above is present.
pinecone.list_indexes()

{'indexes': [{'dimension': 512,
              'host': 'american-history-4myrn7y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'american-history',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 384,
              'host': 'pdf-rag-4myrn7y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'pdf-rag',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [11]:
# Groq-hosted chat model used as the LLM for this notebook.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.8)

In [15]:
# Read the source PDF from the Colab working directory; `load()` returns a
# list of Document objects (the sample output shows per-page metadata).
loader = PyPDFLoader("/content/history_outline.pdf")
docs = loader.load()

In [17]:
# Display one loaded Document (index 10) to check its text and metadata.
docs[10]

Document(metadata={'source': '/content/history_outline.pdf', 'page': 10}, page_content='OUTLINE OF U.S. HISTORY9\ngroups and strong evidence exists \nthat neighboring tribes maintained \nextensive and formal relations — \nboth friendly and hostile  .\nTHE FIRST EUROPEANS\nThe first Europeans to arrive in \nNorth America — at least the first \nfor whom there is solid evidence \n— were Norse, traveling west from \nGreenland, where Erik the Red \nhad founded a settlement around \nthe year 985  . In 1001 his son Leif is \nthought to have explored the north -\neast coast of what is now Canada and \nspent at least one winter there  .\nWhile Norse sagas suggest that \nViking sailors explored the Atlan -\ntic coast of North America down \nas far as the Bahamas, such claims \nremain unproven  . In 1963, however, \nthe ruins of some Norse houses dat -\ning from that era were discovered at \nL’Anse-aux-Meadows in northern \nNewfoundland, thus supporting at \nleast some of the saga claims  .\nIn 1

In [18]:
# Split the loaded documents into overlapping chunks, measured in characters
# (length_function=len).
# NOTE(review): 10,000-character chunks are probably far larger than the
# input window of the sentence-transformers model used for embedding —
# confirm the model's max sequence length and consider smaller chunks.
textSplitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=10_000,
    chunk_overlap=1_000,
)
textChunks = textSplitter.split_documents(docs)

# Display one resulting chunk as a spot check.
textChunks[10]

Document(metadata={'source': '/content/history_outline.pdf', 'page': 12}, page_content='OUTLINE OF U.S. HISTORY11\nof a few hundred English colonists \nto a flood of millions of newcomers  . \nImpelled by powerful and diverse \nmotivations, they built a new civi -\nlization on the northern part of the \ncontinent  .\nThe first English immigrants \nto what is now the United States \ncrossed the Atlantic long after thriv -\ning Spanish colonies had been estab -\nlished in Mexico, the West Indies, \nand South America  . Like all early \ntravelers to the New World, they \ncame in small, overcrowded ships  . \nDuring their six- to 12-week voy -\nages, they lived on meager rations  . \nMany died of disease, ships were \noften battered by storms, and some \nwere lost at sea  .\nMost European emigrants left \ntheir homelands to escape political \noppression, to seek the freedom to \npractice their religion, or to find op -\nportunities denied them at home  . \nBetween 1620 and 1635, economic \

In [21]:
# Embedding model configuration: run on CPU and leave the output vectors
# un-normalized.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# NOTE(review): presumably this model emits 512-dimensional vectors, matching
# DIMENSIONS used when creating the Pinecone index — confirm.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/distiluse-base-multilingual-cased-v1",
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [29]:
# Embed every chunk and write the vectors into the Pinecone index, returning
# a vector-store handle.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# (duplicate vectors) — verify before re-executing against a populated index.
vectorstore = PineconeStore.from_documents(
    documents=textChunks,
    embedding=embeddings,
    index_name=INDEX_NAME,
)