## Install Dependencies

In [20]:
#!pip install torch datasets transformers sentence-transformers requests tqdm

In [21]:
import os
from collections import Counter
from getpass import getpass

import pandas as pd
import requests
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

## Connect to Pinecone

The hybrid vector index is not yet available in the Pinecone Python client, so we will use the Pinecone REST API to communicate with the new index. The `HybridPinecone` class below provides an interface similar to the Python client's for working with the new index.

In [22]:
class HybridPinecone:
    """Thin REST client for a Pinecone hybrid (sparse + dense) vector index.

    Controller-level calls (whoami, create/describe/delete index) go to
    ``https://controller.<environment>.pinecone.io``; index-level calls
    (stats, upsert, query) go to the host set by ``connect_index``.
    """

    # seconds to wait on each HTTP call; class-level so it is always defined
    timeout = 30

    def __init__(self, api_key, environment, timeout=30):
        """Store credentials and look up the project id.

        Args:
            api_key: Pinecone API key, sent as the ``Api-Key`` header.
            environment: Pinecone environment name used to build the URLs.
            timeout: Seconds to wait on every HTTP request (default 30).

        Raises:
            requests.HTTPError: if the ``whoami`` call returns an error status.
        """
        # make environment, headers and timeout available to all methods
        self.environment = environment
        self.headers = {'Api-Key': api_key}
        self.timeout = timeout
        # no index is connected until connect_index() is called
        self.host = None
        # get project_id
        res = requests.get(
            f"{self._controller_url()}/actions/whoami",
            headers=self.headers,
            timeout=self.timeout
        )
        # fail with a clear HTTP error (e.g. a rejected API key) instead of
        # an obscure KeyError / JSON error on the next line
        res.raise_for_status()
        self.project_id = res.json()['project_name']

    def _controller_url(self):
        """Return the base URL of the controller API for this environment."""
        return f"https://controller.{self.environment}.pinecone.io"

    def _index_url(self, path):
        """Return the URL of ``path`` on the connected index host.

        Raises:
            RuntimeError: if ``connect_index`` has not been called yet.
        """
        if self.host is None:
            raise RuntimeError(
                "No index connected; call connect_index(index_name) first."
            )
        return f"https://{self.host}/{path}"

    def create_index(self, index_name, dimension, metric, pod_type):
        """Ask the controller to create an index.

        Returns:
            The HTTP response object of the creation request.
        """
        # index specification
        params = {
            'name': index_name,
            'dimension': dimension,
            'metric': metric,
            'pod_type': pod_type
        }
        # send a post request with the headers and parameters to pinecone
        res = requests.post(
            f"{self._controller_url()}/databases",
            headers=self.headers,
            json=params,
            timeout=self.timeout
        )
        # return the creation status
        return res

    def connect_index(self, index_name):
        """Point this client at ``index_name`` and return its statistics.

        The index host is derived from the index name, the project id and
        the environment, then stored in ``self.host``.

        Returns:
            The parsed JSON body of ``describe_index_stats``.
        """
        # set the self.host variable
        self.host = f"{index_name}-{self.project_id}.svc.{self.environment}.pinecone.io"
        # return index statistics as json
        return self.describe_index_stats()

    def describe_index(self, index_name):
        """Return the controller's description of ``index_name`` as parsed JSON."""
        # send a get request to the controller to get the index description
        res = requests.get(
            f"{self._controller_url()}/databases/{index_name}",
            headers=self.headers,
            timeout=self.timeout
        )
        return res.json()

    def describe_index_stats(self):
        """Return statistics of the connected index as parsed JSON.

        Raises:
            RuntimeError: if no index is connected.
        """
        # send a get request to the index host to get its statistics
        res = requests.get(
            self._index_url("describe_index_stats"),
            headers=self.headers,
            timeout=self.timeout
        )
        # return the index statistics as json
        return res.json()

    def upsert(self, vectors):
        """Upload ``vectors`` to the connected index.

        Args:
            vectors: List of vector records, sent as ``{'vectors': vectors}``.

        Returns:
            The HTTP response object; callers should check its status.

        Raises:
            RuntimeError: if no index is connected.
        """
        # send a post request with vectors to the hybrid upsert endpoint
        res = requests.post(
            self._index_url("hybrid/vectors/upsert"),
            headers=self.headers,
            json={'vectors': vectors},
            timeout=self.timeout
        )
        # return the http response status
        return res

    def query(self, query):
        """Search the connected index.

        Args:
            query: Dict of query parameters, sent as the JSON request body.

        Returns:
            The parsed JSON body of the response.

        Raises:
            RuntimeError: if no index is connected.
        """
        # send a post request to the hybrid vector index with the query dict
        res = requests.post(
            self._index_url("hybrid/query"),
            headers=self.headers,
            json=query,
            timeout=self.timeout
        )
        # return the result as json
        return res.json()

    def delete_index(self, index_name):
        """Ask the controller to delete ``index_name``.

        Returns:
            The HTTP response object of the delete request.
        """
        # send a delete request
        res = requests.delete(
            f"{self._controller_url()}/databases/{index_name}",
            headers=self.headers,
            timeout=self.timeout
        )
        # return the http response status
        return res

Let's initiate a connection and create an index. For this, we need a [free API key](https://app.pinecone.io/), and then we initialize the connection like so:

In [23]:
# initialize an instance of HybridPinecone class
# read the API key from the environment so the secret is never stored in the
# notebook; fall back to an interactive prompt when PINECONE_API_KEY is unset
pinecone = HybridPinecone(
    api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),  # app.pinecone.io
    environment = os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp")
)

We specify the metric type as "dotproduct" and dimension as 384 because the retriever we use to generate context embeddings outputs 384-dimension vectors.

In [24]:
# choose a name for your index
# name of the index; later cells reuse this variable
index_name = "hybrid-test"

# index specification: 384-dimensional dense vectors, dot-product metric, "s1h" pod type
index_spec = dict(dimension=384, metric="dotproduct", pod_type="s1h")

# request creation; the HTTP response is the value displayed by this cell
pinecone.create_index(index_name=index_name, **index_spec)

<Response [201]>

Now we have created the hybrid vector index using the `"s1h"` hybrid `pod_type`. Before connecting to the index we must wait until it is ready; we can check its status like so:

In [25]:
# fetch the index description; its 'status' section reports the state and readiness
pinecone.describe_index(index_name)

{'database': {'name': 'hybrid-test',
  'index_type': 'approximated',
  'metric': 'dotproduct',
  'dimension': 384,
  'replicas': 1,
  'shards': 1,
  'pods': 1,
  'pod_type': 's1h',
  'index_config': {'approximated': {'k_bits': 512}}},
 'status': {'waiting': ['database'],
  'crashed': [],
  'host': 'hybrid-test-94a860f.svc.us-west1-gcp.pinecone.io',
  'port': 433,
  'state': 'Initializing',
  'ready': False}}

If the `state` is `'Ready'` we can continue and connect to the index like so:

In [27]:
# point the client at the index host; the returned value is the index statistics
pinecone.connect_index(index_name)

{'namespaces': {}, 'dimension': 384, 'indexFullness': 0, 'totalVectorCount': 0}

## Load Dataset

We will use a JSON file containing a web crawl of pinecone.io.

In [28]:
# load the crawl; the crawled pages live under response -> docs
df = pd.read_json("Pinecone_io_Webcrawl.json")
crawl_docs = df.response.docs

# page text (stringified and capped at 5000 characters) and the matching URLs,
# kept in the same order so position i in both lists refers to the same page
sentences = [str(page.get('content'))[:5000] for page in crawl_docs]
urls = [str(page.get('url')) for page in crawl_docs]

## Sparse Vectors

We will use the BERT tokenizer to create sparse vectors, i.e. we will take the token IDs.

In [29]:
from transformers import BertTokenizerFast

# load the 'bert-base-uncased' tokenizer from huggingface; its token ids are
# used below as the keys of the sparse vectors
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased'
)

Let's create a sparse vector for a context passage.

In [30]:
# tokenize the context passage
inputs = tokenizer(
    str(sentences), padding=True, truncation=True,
    max_length=512
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
# extract the input ids
input_ids = inputs['input_ids']
input_ids

[101,
 1031,
 1005,
 3967,
 1064,
 7222,
 8663,
 2063,
 1032,
 21411,
 23553,
 1032,
 1050,
 3527,
 6169,
 1032,
 17953,
 14644,
 2078,
 1032,
 17953,
 14644,
 5582,
 2415,
 1032,
 13316,
 5358,
 23041,
 3012,
 1032,
 13316,
 25377,
 19092,
 1032,
 6583,
 5092,
 4904,
 1032,
 13316,
 12069,
 2545,
 1032,
 27937,
 8445,
 16912,
 1032,
 23961,
 7946,
 2102,
 1004,
 3036,
 1032,
 13316,
 12162,
 18908,
 1032,
 17953,
 8649,
 1999,
 1032,
 24978,
 23773,
 2039,
 2489,
 1032,
 21411,
 23553,
 1032,
 1050,
 3527,
 6169,
 1032,
 17953,
 14644,
 2078,
 1032,
 13316,
 5358,
 23041,
 3012,
 1032,
 13316,
 25377,
 19092,
 1032,
 13316,
 12069,
 2545,
 1032,
 27937,
 8445,
 16912,
 1032,
 23961,
 7946,
 2102,
 1004,
 3036,
 1032,
 13316,
 12162,
 18908,
 1032,
 17953,
 8649,
 1999,
 1032,
 13316,
 29313,
 4070,
 1032,
 23961,
 8649,
 9354,
 12183,
 1032,
 23961,
 2389,
 2243,
 2000,
 1996,
 9207,
 3945,
 8519,
 1032,
 24978,
 10497,
 2149,
 2115,
 3980,
 2055,
 7222,
 8663,
 2063,
 2030,
 4751,
 2

We need to convert this into a dictionary of key to frequency values.

In [32]:
# convert the input_ids list to a dictionary of key to frequency values
sparse_vec = dict(Counter(input_ids))
sparse_vec

{101: 1,
 1031: 1,
 1005: 3,
 3967: 2,
 1064: 8,
 7222: 10,
 8663: 11,
 2063: 11,
 1032: 67,
 21411: 5,
 23553: 4,
 1050: 12,
 3527: 4,
 6169: 4,
 17953: 9,
 14644: 6,
 2078: 4,
 5582: 2,
 2415: 3,
 13316: 19,
 5358: 4,
 23041: 3,
 3012: 3,
 25377: 5,
 19092: 5,
 6583: 2,
 5092: 2,
 4904: 2,
 12069: 4,
 2545: 4,
 27937: 4,
 8445: 2,
 16912: 2,
 23961: 4,
 7946: 2,
 2102: 2,
 1004: 3,
 3036: 3,
 12162: 4,
 18908: 4,
 8649: 3,
 1999: 2,
 24978: 4,
 23773: 1,
 2039: 1,
 2489: 2,
 29313: 2,
 4070: 2,
 9354: 1,
 12183: 1,
 2389: 1,
 2243: 2,
 2000: 8,
 1996: 3,
 9207: 2,
 3945: 4,
 8519: 1,
 10497: 1,
 2149: 3,
 2115: 3,
 3980: 1,
 2055: 3,
 2030: 4,
 4751: 1,
 3791: 1,
 1012: 15,
 2057: 3,
 1521: 5,
 2222: 1,
 6134: 1,
 1037: 3,
 2051: 2,
 4553: 2,
 1998: 4,
 3745: 1,
 2062: 2,
 2007: 2,
 2017: 7,
 15737: 1,
 3372: 1,
 3046: 2,
 1029: 2,
 2707: 1,
 2182: 1,
 7971: 1,
 2236: 1,
 27050: 1,
 1024: 4,
 18558: 1,
 1030: 2,
 22834: 2,
 6279: 1,
 6442: 1,
 2490: 1,
 3191: 2,
 9986: 2,
 2015: 2,
 

Let's write a function to do this in batches. Notice that we are removing some keys from the dictionary. These are special tokens from the tokenizer which we do not really need when creating sparse vectors.

In [33]:
def build_dict(input_batch, special_token_ids=(101, 102, 103, 0)):
    """Convert a batch of token-id lists into sparse frequency dictionaries.

    Args:
        input_batch: Iterable of token-id sequences, one per passage.
        special_token_ids: Token ids to leave out of the sparse vectors.
            The default drops the tokenizer's special tokens (for
            'bert-base-uncased': [CLS]=101, [SEP]=102, [MASK]=103, [PAD]=0).

    Returns:
        A list with one ``{token_id: frequency}`` dict per input sequence.
    """
    # set membership is O(1) and accepts any iterable of ids from the caller
    excluded = frozenset(special_token_ids)
    # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # count how often each token id occurs in the passage
        counts = Counter(token_ids)
        # drop the excluded ids and keep the remaining frequencies
        sparse_emb.append(
            {token_id: freq for token_id, freq in counts.items() if token_id not in excluded}
        )
    # return sparse_emb list
    return sparse_emb

Let's write another function to help us generate sparse vectors in batches.

In [34]:
def generate_sparse_vectors(context_batch):
    """Return one sparse ``{token_id: frequency}`` dict per passage in the batch.

    The passages are tokenized together (padded, truncated to 512 tokens) and
    the resulting token ids are handed to ``build_dict``.
    """
    # tokenize the whole batch in one call
    encoded = tokenizer(
        context_batch,
        padding=True,
        truncation=True,
        max_length=512,
    )
    # turn each list of token ids into a frequency dictionary
    return build_dict(encoded['input_ids'])

## Dense Vectors

Alongside sparse vectors we must also add dense vectors (as usual). We do this like so:

In [35]:
from sentence_transformers import SentenceTransformer

# load the sentence transformer used to create the dense vectors;
# the last line displays the model's module stack
model = SentenceTransformer(
    'multi-qa-MiniLM-L6-cos-v1'
)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

The model gives us a `384` dimensional dense vector.

## Upsert Documents

Now we can go ahead and generate sparse and dense vectors for the full dataset and upsert them along with the metadata to the new hybrid index. We can do that easily as follows:

In [36]:
batch_size = 32

for i in tqdm(range(0, len(sentences), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(sentences))
    # extract batch
    context_batch = sentences[i:i_end]
    # create unique IDs: the position of each passage in `sentences`
    ids = [str(x) for x in range(i, i_end)]
    # add the page URLs as metadata
    meta = [{'url': url} for url in urls[i:i_end]]
    # create dense vectors
    dense_embeds = model.encode(context_batch).tolist()
    # create sparse vectors
    sparse_embeds = generate_sparse_vectors(context_batch)

    # build one record per passage for uploading to the pinecone index
    vectors = [
        {
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': metadata
        }
        for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta)
    ]

    # upload the documents to the new hybrid index
    res = pinecone.upsert(vectors)
    # stop on a failed batch instead of silently ending up with a partial index
    res.raise_for_status()

# show index description after uploading the documents
pinecone.describe_index_stats()

  0%|          | 0/6 [00:00<?, ?it/s]

{'namespaces': {'': {'vectorCount': 171}},
 'dimension': 384,
 'indexFullness': 0,
 'totalVectorCount': 171}

## Querying

Now we can query the index, providing the sparse and dense vectors of a question, along with a weight (`alpha`) that balances semantic and keyword relevance. `alpha=1` will provide a purely semantic-based search result and `alpha=0` will provide a purely keyword-based result equivalent to BM25. The default value is `0.5`.

Let's write a helper function to execute queries and after that run some queries.

In [37]:
def hybrid_query(question, top_k, alpha):
    """Run a hybrid search for ``question`` and return the parsed JSON result.

    Args:
        question: Query text; encoded into both a sparse and a dense vector.
        top_k: Number of matches to request ("topK").
        alpha: Weight sent with the query ("alpha").
    """
    # sparse representation of the question (one dict per input; we pass one input)
    sparse_batch = generate_sparse_vectors([question])
    # dense representation of the question
    dense_batch = model.encode([question]).tolist()
    # send the query parameters to pinecone and hand back its response
    return pinecone.query({
        "topK": top_k,
        "vector": dense_batch,
        "sparseVector": sparse_batch[0],
        "alpha": alpha,
        "includeMetadata": True
    })

In [38]:
# example question reused by the queries below
question = "are you in us-east-2"

First, we will do a pure semantic search by setting the alpha value as 1.

In [39]:
# alpha=1: request only the single best match
hybrid_query(question, top_k=1, alpha=1)

{'matches': [{'id': '128',
   'score': 0.113913476,
   'values': [],
   'sparseValues': {},
   'metadata': {'url': 'https://www.pinecone.io/docs/api/operation/upsert/'}}],
 'namespace': ''}

The top (and only) result above is the document with id 128, the upsert API reference page, which does not obviously answer the question. Now let's try with an alpha value of 0.3.

In [40]:
# same question and top_k, now with alpha=0.3
hybrid_query(question, top_k=1, alpha=0.3)

{'matches': [{'id': '44',
   'score': 0.302282244,
   'values': [],
   'sparseValues': {},
   'metadata': {'url': 'https://www.pinecone.io/pricing/'}}],
 'namespace': ''}

The most relevant document is now ranked the highest.

## Delete the Index

In [41]:
#pinecone.delete_index("hybrid-test")

In [42]:
# collect the crawl's `id` field of every document, in the same order that was
# used to number the upserted vectors
url_ids = [str(doc.get('id')) for doc in df.response.docs]

# look up the document behind match id '44' returned by the alpha=0.3 query
print(url_ids[44])

https://www.pinecone.io/pricing/
