#**Install Required Packages**

In [None]:
!pip install --upgrade --quiet  pinecone-client pinecone-text pinecone-notebooks langchain_community langchain_huggingface datasets

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:

In [None]:
import os
from pinecone import Pinecone,ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import PineconeHybridSearchRetriever

#**Dataset**

In [None]:
from datasets import load_dataset

dataset = load_dataset("quora", split="train[0:5000]", trust_remote_code=True)

Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 5000
})

In [None]:
dataset[:4]

{'questions': [{'id': [1, 2],
   'text': ['What is the step by step guide to invest in share market in india?',
    'What is the step by step guide to invest in share market?']},
  {'id': [3, 4],
   'text': ['What is the story of Kohinoor (Koh-i-Noor) Diamond?',
    'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']},
  {'id': [5, 6],
   'text': ['How can I increase the speed of my internet connection while using a VPN?',
    'How can Internet speed be increased by hacking through DNS?']},
  {'id': [7, 8],
   'text': ['Why am I mentally very lonely? How can I solve it?',
    'Find the remainder when [math]23^{24}[/math] is divided by 24,23?']}],
 'is_duplicate': [False, False, False, False]}

In [None]:
df = dataset.to_pandas()

In [None]:
df.head()

Unnamed: 0,questions,is_duplicate
0,"{'id': [1, 2], 'text': ['What is the step by s...",False
1,"{'id': [3, 4], 'text': ['What is the story of ...",False
2,"{'id': [5, 6], 'text': ['How can I increase th...",False
3,"{'id': [7, 8], 'text': ['Why am I mentally ver...",False
4,"{'id': [9, 10], 'text': ['Which one dissolve i...",False


In [None]:
questions = []

for question in dataset['questions']:
    questions.extend(question['text'])

questions = list(set(questions)) # Remove duplicates
print('\n'.join(questions[:5]))
print(len(questions))

What's the difference between the iPhone 6 and the iPhone 6s?
Can you use EBT cards at Costco or Sam's Club?
Solve the case: A man got killed in his office. The suspects are Edison, Maxis, Jason, Janna, and Sofia. A calendar near the man has 6, 4, 9, 10, 11 written in blood. Who is the killer?
How do I Find a persons real name with a picture?
What is the best film of 2016?
9859


In [None]:
questions[:10]

["What's the difference between the iPhone 6 and the iPhone 6s?",
 "Can you use EBT cards at Costco or Sam's Club?",
 'Solve the case: A man got killed in his office. The suspects are Edison, Maxis, Jason, Janna, and Sofia. A calendar near the man has 6, 4, 9, 10, 11 written in blood. Who is the killer?',
 'How do I Find a persons real name with a picture?',
 'What is the best film of 2016?',
 'What would happen if your entire city suddenly decided to listen to you?',
 "Why don't some peoples answers have a place to comment?",
 'In what sectors is it best to live in Gurgaon (Gurugram)?',
 'What is difference between immortality and eternity?',
 'How often is it healthy to masturbate?']

In [None]:
questions = questions[:200]

In [None]:
len(questions)

200

#**Pinecone**

In [None]:
# api_key=
# HF_TOKEN = 

In [None]:
index_name="hybrid-search-langchain-pinecone"

## initialize the Pinecone client
pc=Pinecone(api_key=api_key)

#create the index
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensionality of dense model all-MiniLM-L6-v2
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"), #use the free tier
    )

In [None]:
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x7d63c2f0c430>

##**Dense Retrival using HF embeddings**

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

##**Sparse Retrival BM25**

In [None]:
from pinecone_text.sparse import BM25Encoder

bm25_encoder=BM25Encoder().default()
bm25_encoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7d63c31c1c90>

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
bm25_encoder.fit(questions)

## store the values to a json file
bm25_encoder.dump("bm25_values.json")

# load to your BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values.json")

  0%|          | 0/200 [00:00<?, ?it/s]

##**Hybrid Search Retrival**

##**Reciprocal Rank Fusion**

# Reciprocal Rank Fusion (RRF) Explained

"""
Reciprocal Rank Fusion (RRF) is an ensemble-based ranking method used in information retrieval
to combine ranked lists from multiple retrieval models or sources. It is simple, robust, and
often outperforms more complex fusion methods when combining results.

How RRF Works:
RRF assigns a score to each document based on its rank in the ranked lists. The fused score for
a document is computed using the formula:

    RRF(d) = sum(1 / (k + r_k(d))) for k = 1 to n

Where:
- n: The number of ranked lists being fused.
- r_k(d): The rank of document d in the k-th ranked list (1-based rank).
- k: A small constant (usually set to 60) to avoid division by zero and reduce the impact of low ranks.

Key Points:
- Documents appearing higher in the rankings of multiple lists get higher fused scores.
- Lower-ranked documents contribute less to the overall score due to the reciprocal nature of the scoring function.
- The parameter k controls how steeply the contribution of lower ranks diminishes.

Example:

Input Ranked Lists:
1. BM25:
   - Doc1: Rank 1
   - Doc2: Rank 2
   - Doc3: Rank 3
2. Dense Retrieval:
   - Doc2: Rank 1
   - Doc1: Rank 2
   - Doc4: Rank 3

Compute RRF Scores (k = 60):
- For Doc1:
  RRF = 1/(60+1) + 1/(60+2) = 0.01639 + 0.01613 = 0.03252
- For Doc2:
  RRF = 1/(60+2) + 1/(60+1) = 0.01613 + 0.01639 = 0.03252
- For Doc3:
  RRF = 1/(60+3) = 0.01587
- For Doc4:
  RRF = 1/(60+3) = 0.01587

Fused Ranking:
1. Doc1 and Doc2 (tie): 0.03252
2. Doc3 and Doc4 (tie): 0.01587

Advantages of RRF:
1. Simplicity: RRF is computationally inexpensive and straightforward to implement.
2. Robustness: It works well even when the models being combined have widely varying performance.
3. No Need for Parameter Tuning: The small constant k is fixed and doesn't require significant optimization.

Use Cases:
1. Hybrid Search: Combining dense and sparse retrieval results.
"""


**Text Explanation for PineconeHybridSearchRetriever Weight Adjustment**

"""
The PineconeHybridSearchRetriever allows hybrid search by combining dense vector-based retrieval
(e.g., from a transformer embedding) and sparse vector-based retrieval (e.g., from BM25).
You can adjust the weights for dense and sparse retrieval components to balance their contribution
during the hybrid search.

Steps to Adjust Weights:
1. You can modify the weights using the `alpha` parameter. Here's how it works:

- `alpha` determines the relative weight given to the dense and sparse components.
- If `alpha = 0.5`, the dense and sparse scores are weighted equally.
- If `alpha = 0.7`, the dense component gets more weight (70%), while the sparse component gets less (30%).

"""


In [None]:
retriever =PineconeHybridSearchRetriever(embeddings=embeddings, sparse_encoder=bm25_encoder, index=index, alpha=0.7)

In [None]:
retriever.add_texts(questions)


  0%|          | 0/7 [00:00<?, ?it/s]

#**Testing**

In [38]:
questions[:20]

["What's the difference between the iPhone 6 and the iPhone 6s?",
 "Can you use EBT cards at Costco or Sam's Club?",
 'Solve the case: A man got killed in his office. The suspects are Edison, Maxis, Jason, Janna, and Sofia. A calendar near the man has 6, 4, 9, 10, 11 written in blood. Who is the killer?',
 'How do I Find a persons real name with a picture?',
 'What is the best film of 2016?',
 'What would happen if your entire city suddenly decided to listen to you?',
 "Why don't some peoples answers have a place to comment?",
 'In what sectors is it best to live in Gurgaon (Gurugram)?',
 'What is difference between immortality and eternity?',
 'How often is it healthy to masturbate?',
 "What are good gifts for a foreign visitor to bring when they're invited to someone's home in Martinique for the first time?",
 'What is the best thing I can buy for 2€ on Amazon?',
 'How can a teacher maintain discipline?',
 'Why do young women sext older men?',
 'How does the rate of student happiness

In [None]:
retriever.invoke("What is the difference between iphones?")

[Document(metadata={'score': 0.709553838}, page_content="What's the difference between the iPhone 6 and the iPhone 6s?"),
 Document(metadata={'score': 0.413765669}, page_content='How much would it cost to build your own iPhone?'),
 Document(metadata={'score': 0.320991904}, page_content='Which phone do you use and why?'),
 Document(metadata={'score': 0.285753608}, page_content='Is it possible to install iOS to an Android phone?')]

In [None]:
# Can you use EBT cards at Costco or Sam's Club?
retriever.invoke("where can i use EBT cards?")

[Document(metadata={'score': 0.641137123}, page_content="Can you use EBT cards at Costco or Sam's Club?"),
 Document(metadata={'score': 0.187404364}, page_content='Which phone do you use and why?'),
 Document(metadata={'score': 0.182535231}, page_content='Which US colleges do not accept IELTS?'),
 Document(metadata={'score': 0.180081889}, page_content="What app/website let's me use a security camera filter?")]

In [None]:
retriever.invoke("i want to order pizza")

[Document(metadata={'score': 0.557736635}, page_content="Which pizzas are the most popularly ordered pizzas on Domino's menu?"),
 Document(metadata={'score': 0.220222935}, page_content='How can I buy Tesla shares / stock?'),
 Document(metadata={'score': 0.17836906}, page_content="What are good gifts for a foreign visitor to bring when they're invited to someone's home in Martinique for the first time?"),
 Document(metadata={'score': 0.169184312}, page_content='How can I get satisfied sex?')]

In [39]:
retriever.invoke("i want to recover my password")

[Document(metadata={'score': 0.607464969}, page_content='How can I recover my Hotmail password?'),
 Document(metadata={'score': 0.20484142}, page_content='Where can I download a cracked version of Mathematica?'),
 Document(metadata={'score': 0.149258584}, page_content="Is there any way to get a user's email address through Twitter OAuth API?"),
 Document(metadata={'score': 0.130314425}, page_content='How can I forget my wife?')]

In [40]:
retriever.invoke("i am better")

[Document(metadata={'score': 0.434283495}, page_content='Why do people think that they are better than everyone and everything else?'),
 Document(metadata={'score': 0.306995034}, page_content='Who are better drivers woman or man?'),
 Document(metadata={'score': 0.272710681}, page_content='Who is the better actress between Priyanka Chopra and Deepika Padukone?'),
 Document(metadata={'score': 0.228032082}, page_content='How can I get better at building in Minecraft? Other than using Grian.')]

#**References**

1. **SentenceTransformers Documentation**:  
   For more information on using SentenceTransformers, refer to the official documentation:
   [SentenceTransformers](https://www.sbert.net/)

2. **Hybrid Search with Pinecone**:  
   Learn about hybrid search in Pinecone, combining dense and sparse retrieval methods:
   [Hybrid Search in Pinecone](https://docs.pinecone.io/docs/hybrid-search)

3. **LangChain Documentation**:  
   Explore LangChain for building applications with LLMs, handling chains, agents, and retrieval:
   [LangChain Docs](https://docs.langchain.com/)

4. **Hands-On Large Language Models (Book)**:  
   A great resource for understanding and working with large language models:
   [Hands-On Large Language Models](https://www.oreilly.com/library/view/hands-on-large-language/9781098122233/) see ch8
