## Install Dependencies

In [86]:
#!pip install pandas torch datasets transformers sentence-transformers requests tqdm nltk

In [87]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from collections import Counter
from tqdm.auto import tqdm
import mmh3
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

## Connect to Pinecone

The hybrid vector index is currently not available in Pinecone python client. So, we will use the Pinecone REST API to communicate with the new index. The ```HybridPinecone``` class below gives you a similar interface to the python-client to communicate with the new index.

In [88]:
class HybridPinecone:
    # initializes the HybridPinecone object
    def __init__(self, api_key, environment):
        # make environment, headers and project_id available across all the function within the class
        self.environment = environment
        self.headers = {'Api-Key': api_key}
        # get project_id
        res = requests.get(
            f"https://controller.{self.environment}.pinecone.io/actions/whoami",
            headers=self.headers
        )
        self.project_id = res.json()['project_name']
        self.host = None

    # creates an index in pinecone vector database
    def create_index(self, index_name, dimension, metric, pod_type):
        # index specification
        params = {
            'name': index_name,
            'dimension': dimension,
            'metric': metric,
            'pod_type': pod_type
        }
        # sent a post request with the headers and parameters to pinecone database
        res = requests.post(
            f"https://controller.{self.environment}.pinecone.io/databases",
            headers=self.headers,
            json=params
        )
        # return the creation status
        return res
    
    # get the project_id for the index and update self.host variable
    def connect_index(self, index_name):
        # set the self.host variable
        self.host = f"{index_name}-{self.project_id}.svc.{self.environment}.pinecone.io"
        res = self.describe_index_stats()
        # return index related information as json
        return res
    
    def describe_index(self, index_name):
        # send a get request to pinecone database to get index description
        res = requests.get(
            f"https://controller.{self.environment}.pinecone.io/databases/{index_name}",
            headers=self.headers
        )
        return res.json()

    # returns description of the index
    def describe_index_stats(self):
        # send a get request to pinecone database to get index description
        res = requests.get(
            f"https://{self.host}/describe_index_stats",
            headers=self.headers
        )
        # return the index description as json
        return res.json()

    # uploads the documents to pinecone database
    def upsert(self, vectors):
        # send a post request with vectors to pinecone database
        res = requests.post(
            f"https://{self.host}/hybrid/vectors/upsert",
            headers=self.headers,
            json={'vectors': vectors} #TODO this is probably the issue
        )
        # return the http response status
        return res

    # searches pinecone database with the query
    def query(self, query):
        # sends a post request to hybrib vector index with the query dict
        res = requests.post(
            f"https://{self.host}/hybrid/query",
            headers=self.headers,
            json=query
        )
        # returns the result as json
        return res.json()

    # deletes an index in pinecone database
    def delete_index(self, index_name):
        # sends a delete request
        res = requests.delete(
            f"https://controller.{self.environment}.pinecone.io/databases/{index_name}",
            headers=self.headers
        )
        # returns the http response status
        return res

## Create index

In [89]:
# initialize an instance of HybridPinecone class
pinecone = HybridPinecone(
    api_key = "1f136ea0-a50c-4af1-a9ad-93de37970fab",  # app.pinecone.io
    environment = "us-west1-gcp"
)

# choose a name for your index
index_name = "hybrid-search-demo"

In [90]:
# # create the index
# pinecone.create_index(
#     index_name = index_name,
#     dimension = 384,
#     metric = "dotproduct",
#     pod_type = "s1h"
# )

Now we have created the hybrid vector index using the `"s1h"` hybrid `pod_type`. To connect to the index we must `wait until it is ready`, we can check it's status like so:

In [91]:
pinecone.describe_index(index_name)

{'database': {'name': 'hybrid-search-demo',
  'index_type': 'approximated',
  'metric': 'dotproduct',
  'dimension': 384,
  'replicas': 1,
  'shards': 1,
  'pods': 1,
  'pod_type': 's1h',
  'index_config': {'approximated': {'k_bits': 512}}},
 'status': {'waiting': [],
  'crashed': [],
  'host': 'hybrid-search-demo-94a860f.svc.us-west1-gcp.pinecone.io',
  'port': 433,
  'state': 'Ready',
  'ready': True}}

If the `state` is `'Ready'` we can continue and connect to the index like so:

In [92]:
pinecone.connect_index(index_name)

{'namespaces': {}, 'dimension': 384, 'indexFullness': 0, 'totalVectorCount': 0}

## Load Dataset

We will use a json file containing a web crawl of pinecone.io

In [93]:
df = pd.read_json("Pinecone_io_Webcrawl.json")

ids = [] # id
titles = [] # title
contents = [] # content
urls = [] # url
boosts = [] # boost
blob_texts = [] # derived from titles, bodies, and urls

# Process fields to be used as metadata later
for doc in df.response.docs:

    title = str(doc.get('title'))
    titles.append(title)

    content = str(doc.get('content'))
    contents.append(content[:9000])

    url = str(doc.get('url'))
    urls.append(url)

    boost = str(doc.get('boost'))
    boosts.append(boost)

    # Blob text will be used to generate the sparse and dense vectors
    blob_text =  str(doc.get('title')) + ' ' + str(doc.get('content')) + ' ' + str(doc.get('url'))
    blob_texts.append(blob_text)


## Sparse and Dense Vector Creation

In [94]:
# load a sentence transformer model from huggingface
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# create a tokenizer
class Tokenizer:
  def __init__(self):
    self.stemmer = SnowballStemmer('english')

  def encode(self, text):
    words = [self.stemmer.stem(word) for word in word_tokenize(text)]
    ids = [mmh3.hash(word, signed=False) for word in words]
    return dict(Counter(ids))

tokenizer = Tokenizer()

In [95]:
sparse_vectors = []
dense_vectors = []

for blob in blob_texts:
    sparse_vectors.append(tokenizer.encode(str(blob)))
    dense_vectors.append(model.encode([blob], normalize_embeddings=True).tolist())

print(sparse_vectors[0])
print(dense_vectors[0])

{3753613260: 7, 711529291: 8, 2751991620: 9, 3888873998: 3, 3584054563: 5, 2650797237: 6, 2580097488: 1, 3357039338: 2, 2852256900: 4, 649407353: 4, 4037162160: 3, 1928848833: 2, 1280338847: 3, 619178129: 3, 4210889658: 3, 1811268606: 2, 131900689: 2, 1622730881: 1, 1720810685: 1, 1363043438: 2, 958659146: 2, 2223247000: 2, 2816322140: 1, 1101351995: 1, 1793137844: 1, 152217691: 8, 3162218338: 3, 1955147705: 2, 553238108: 4, 4222409465: 1, 1647276219: 1, 3286166600: 3, 4243668012: 3, 3997133275: 1, 117848935: 4, 569047032: 1, 1491351846: 1, 2941713443: 11, 286431791: 3, 3981855590: 5, 3688822001: 1, 3658264083: 1, 1009084850: 3, 2236453805: 2, 2779594451: 4, 1181836714: 1, 2319041398: 2, 2641553256: 2, 814527388: 7, 3396792551: 1, 806976768: 2, 2522961926: 2, 2733467792: 1, 2367602973: 1, 1682104160: 1, 1026658409: 1, 1396724889: 1, 723937430: 5, 1717991430: 1, 1447422055: 2, 2399171853: 2, 3194111925: 2, 3449948193: 2, 270731446: 2, 3397902157: 2, 3176141921: 2, 21141582: 2, 549248996

In [96]:
# Build metadata
meta = []

for url, title, content, boost in zip(urls,titles,contents,boosts):
    metavalue = {'url':url,'title':title,'content':content,'boost':boost}
    meta.append(metavalue)

print(meta[0])

{'url': 'https://www.pinecone.io/contact/', 'title': 'Contact | Pinecone', 'content': 'Contact | Pinecone\nPricing\nDocs\nLearn\nLearning Center\nCommunity\nCompany\nAbout\nCareers\nPartners\nTrust & Security\nContact\nLog In\nSign Up Free\nPricing\nDocs\nLearn\nCommunity\nCompany\nCareers\nPartners\nTrust & Security\nContact\nLog In\nCreate Account\nToggle menu\nTalk to the Vector Search Experts\nSend us your questions about Pinecone or details about your vector search needs. We’ll schedule a time to learn and share more with you.\nWant to try Pinecone? Start here.\nPress and general inquiries: info@pinecone.io\nSupport: support@pinecone.io Read the Docs\nDon’t fill this out if you’re human:\nFirst Name\nLast Name\nWork Email\nJob Title\nCompany\nPhone\nComments\nContact Us\nBy submitting you agree to our privacy policy and to receive information from Pinecone related to our products. You can unsubscribe at any time.\nMessage received! We will reply as soon as we can. For now you can 

## Upsert Documents

Now we can go ahead and generate sparse and dense vectors for the full dataset and upsert them along with the metadata to the new hybrid index. We can do that easily as follows:

In [97]:
batch_size = 1
f = open("output.txt", "a")

for i in tqdm(range(0, len(blob_texts), batch_size)):
    i_end = min(i+batch_size, len(blob_texts))
    ids = [str(x) for x in range(i, i_end)]

    vectors = []
    
    for id, sparse, dense, metadata in zip(ids, sparse_vectors[i:i_end], dense_vectors[i:i_end], meta[i:i_end]):
        vectors.append({
            'id': id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': metadata
        })

    pinecone.upsert(vectors)
    f.write(str(vectors))


f.close()
# show index description after uploading the documents
#pinecone.describe_index_stats()


  0%|          | 0/171 [00:00<?, ?it/s]

## Querying

Now we can query the index, providing the sparse and dense vectors of a question, along with a weight for keyword relevance (“alpha”). `Alpha=1` will provide a purely semantic-based search result and `alpha=0` will provide a purely keyword-based result equivalent to BM25. The default value is `0.5`.

Let's write a helper function to execute queries and after that run some queries.

In [98]:
def hybrid_query(question, top_k, alpha):
    # convert the question into a sparse vector
    sparse_vec = [{1: 1}]#tokenizer.encode([question])
    # convert the question into a dense vector
    dense_vec = model.encode([question]).tolist()
    # set the query parameters to send to pinecone
    query = {
      "topK": top_k,
      "vector": dense_vec,
      "sparseVector": sparse_vec[0],
      "alpha": alpha,
      "includeMetadata": True,
      "includeValues": True
    }
    # query pinecone with the query parameters
    result = pinecone.query(query)
    # return search results as json
    return result

In [99]:
question = "are you in us-east-2"

First, we will do a pure semantic search by setting the alpha value as 1.

In [100]:
hybrid_query(question, top_k=1, alpha=1)

{'matches': [{'id': '44',
   'score': 0.171690628,
   'values': [-0.0130419694,
    -0.119259268,
    0.0268057585,
    0.0701151416,
    0.0145442355,
    -0.0241187084,
    -0.0400342084,
    0.0324219167,
    0.0145818349,
    0.0965186805,
    -0.0239014979,
    -0.0130719217,
    -0.0569431819,
    0.0501514859,
    0.0353316069,
    -0.0234582,
    0.0565207303,
    -0.0812524334,
    0.00649508648,
    0.0329303369,
    -0.0449211076,
    -0.00126629241,
    -0.0452318043,
    0.0151787875,
    0.0236369614,
    -0.107929364,
    -0.0532954782,
    0.029236123,
    -0.0184621308,
    -0.0123242801,
    -0.00563243171,
    0.0338342786,
    0.0905211419,
    0.0618571676,
    -0.00266061607,
    0.022274958,
    0.0294739306,
    -0.106270447,
    -0.0709137544,
    0.042845767,
    -0.000541055575,
    0.00618083356,
    -0.116268687,
    0.0242140666,
    -0.013110701,
    0.016263647,
    0.0229357444,
    0.0263115093,
    0.0742606148,
    0.00452383654,
    0.04444848,
    

The most relevant result from above is the second document with id 711. Now let's try with an alpha value of 0.3.

In [101]:
hybrid_query(question, top_k=1, alpha=0.3)

{'matches': [{'id': '44',
   'score': 0.0515071899,
   'values': [-0.0130419694,
    -0.119259268,
    0.0268057585,
    0.0701151416,
    0.0145442355,
    -0.0241187084,
    -0.0400342084,
    0.0324219167,
    0.0145818349,
    0.0965186805,
    -0.0239014979,
    -0.0130719217,
    -0.0569431819,
    0.0501514859,
    0.0353316069,
    -0.0234582,
    0.0565207303,
    -0.0812524334,
    0.00649508648,
    0.0329303369,
    -0.0449211076,
    -0.00126629241,
    -0.0452318043,
    0.0151787875,
    0.0236369614,
    -0.107929364,
    -0.0532954782,
    0.029236123,
    -0.0184621308,
    -0.0123242801,
    -0.00563243171,
    0.0338342786,
    0.0905211419,
    0.0618571676,
    -0.00266061607,
    0.022274958,
    0.0294739306,
    -0.106270447,
    -0.0709137544,
    0.042845767,
    -0.000541055575,
    0.00618083356,
    -0.116268687,
    0.0242140666,
    -0.013110701,
    0.016263647,
    0.0229357444,
    0.0263115093,
    0.0742606148,
    0.00452383654,
    0.04444848,
   

The most relevant document is now ranked the highest.

# Delete the Index

In [102]:
#pinecone.delete_index("hybrid-search-demo")