<a href="https://colab.research.google.com/github/pinilDissanayaka/Psychology-RAG-Fusion/blob/main/Notebook_RAG_Fusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -r requirements.txt

Collecting langchain (from -r requirements.txt (line 1))
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community (from -r requirements.txt (line 2))
  Downloading langchain_community-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_huggingface (from -r requirements.txt (line 3))
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting sentence-transformers (from -r requirements.txt (line 4))
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pinecone (from -r requirements.txt (line 6))
  Downloading pinecone-5.1.0-py3-none-any.whl.metadata (19 kB)
Collecting langchain_core (from -r requirements.txt (line 7))
  Downloading langchain_core-0.2.36-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain_pinecone (from -r requirements.txt (line 9))
  Downloading langchain_pinecone-0.1.3-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_experimental (from -r requirem

In [1]:
import os
from langchain_groq import ChatGroq
from google.colab import userdata
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.documents import Document

In [2]:
def getLlm():
  os.environ['GROQ_API_KEY']=userdata.get('GROQ_API_KEY')
  llm=ChatGroq(model_name='llama-3.1-70b-versatile',
              temperature=0.6)
  return llm

In [3]:
llm=getLlm()

In [4]:
os.environ['PINECONE_API_KEY']=userdata.get('PINECORN_API_KEY')
pinecone=Pinecone()

In [9]:
pineconeIndexNames=pinecone.list_indexes().names()

print('Available Indexes  :')
print(f'\t {pineconeIndexNames}')

Available Indexes  :
	 ['multi-rag', 'constitution']


In [10]:
INDEX_NAME='rag-fusion'
DIMENSIONS=512

if not INDEX_NAME in pineconeIndexNames:
  pinecone.create_index(
          name=INDEX_NAME,
          dimension=DIMENSIONS,
          metric="euclidean",
          spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
          )
)
else:
  print("Index already exists")

In [11]:
pinecone.describe_index(INDEX_NAME)

{
    "name": "rag-fusion",
    "dimension": 512,
    "metric": "euclidean",
    "host": "rag-fusion-4myrn7y.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

In [12]:
template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question:

{question}"""


multiQyeryPrompt = ChatPromptTemplate.from_template(template)

print(multiQyeryPrompt)

input_variables=['question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='You are an AI language model assistant. Your task is to generate five\ndifferent versions of the given user question to retrieve relevant documents from a vector\ndatabase. By generating multiple perspectives on the user question, your goal is to help\nthe user overcome some of the limitations of the distance-based similarity search.\nProvide these alternative questions separated by newlines. Original question:\n\n{question}'))]


In [13]:
def separateQuestions(question):
  return question.split('\n')

In [14]:
multiQyeryChain=multiQyeryPrompt | llm | StrOutputParser() | RunnableLambda(separateQuestions)

In [15]:
multiQyeryChain.invoke('how are you?')

["what's your status?",
 'can you tell me about your condition?',
 'how would you describe your current state?',
 "what's going on with you?",
 'could you give me an update about yourself?']

In [16]:
from urllib import request

file_name="Introducing Psychology"
url="https://ocw.mit.edu/ans7870/9/9.00SC/MIT9_00SCF11_text.pdf"

request.urlretrieve(url, file_name)

('Introducing Psychology', <http.client.HTTPMessage at 0x7d8aeacc6620>)

In [17]:
loader=PyPDFLoader(file_name)

data=loader.load()

In [18]:
pageContent=list()

for page_content in data:
  pageContent.append(page_content.page_content)

print(f'Total pages: {len(pageContent)}')

Total pages: 783


In [20]:
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [21]:
chunker=SemanticChunker(embeddings)

In [None]:
docs=chunker.split_documents(data)

In [None]:
print(f'Total documents: {len(docs)}')

In [22]:
vectoreStore=PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=INDEX_NAME,
    text_key='text'
)

NameError: name 'docs' is not defined

In [None]:
retrever=vectoreStore.as_retriever()