# BERT search in Pinecone

## **Dependencies**


In [None]:
!pip install --quiet pandas
!pip install --quiet progressbar2

In [None]:
import re
import bz2
import time
import numpy
import pandas as pd
from typing import List
from statistics import mean
from progressbar import progressbar

## **Dataset**

The dataset used in this notebook is the DBpedia long-abstracts dataset, which contains the full abstract of each Wikipedia article (usually the article's first section).


Downloading the dataset

In [None]:
!rm long_abstracts_en.ttl.bz2
!wget http://downloads.dbpedia.org/2016-10/core-i18n/en/long_abstracts_en.ttl.bz2

We will be conducting a test similar to the one described in this blog post: [Speeding up BERT Search in Elasticsearch](https://towardsdatascience.com/speeding-up-bert-search-in-elasticsearch-750f1f34f455#e7c4-d62eca28921b). The code is available at: https://github.com/DmitryKey/bert-solr-search.git.

**Parsing the dataset**


We will be using the same method that was used for parsing the dataset in the blog post. The original source of this method can be found at: https://github.com/DmitryKey/bert-solr-search/blob/master/src/data_utils.py

In [None]:
def parse_dbpedia_data(source_file, max_docs: int):
    """
    Parse a file of abstracts and lazily yield one document per abstract.

    Lines starting with ``#`` and lines with 20 whitespace-separated tokens or
    fewer are skipped. For every remaining line the leading ``<url>`` is
    captured, the second ``<url>`` and the trailing ``@en .`` marker are
    stripped, and what is left becomes the document text.

    :param source_file: iterable of UTF-8 encoded ``bytes`` lines that exposes
        ``close()`` (e.g. ``bz2.BZ2File``); it is closed when the generator
        finishes, is closed early by the consumer, or fails
    :param max_docs: maximum number of documents to yield; -1 for no limit
    :return: yields dicts with the keys ``_text_``, ``url`` and ``id``
        (``id`` starts at 1 and counts yielded documents only)
    """
    global VERBOSE
    count = 0
    max_tokens = 0

    # Small runs switch on the module-level VERBOSE flag. Nothing in this
    # function reads it; the assignment is kept for code that does.
    if -1 < max_docs < 50:
        VERBOSE = True

    # Report progress every 0.1% of the requested documents. Integer division
    # keeps the `count % bulk_size == 0` check below exact; a fractional
    # bulk_size would make that check fire unpredictably.
    bulk_size = max_docs // 1000

    print(f"bulk_size={bulk_size}")

    # No limit (-1) or fewer than 1000 requested documents: fixed interval.
    if bulk_size <= 0:
        bulk_size = 1000

    # Compiled once, outside the per-line loop. Raw strings avoid the invalid
    # escape sequences (`\>`, `\s`, `\.`) of plain string literals.
    comment_pattern = re.compile(r'^#')
    first_url_pattern = re.compile(r'^<([^>]+)>\s*')
    second_url_pattern = re.compile(r'^<[^>]+>\s*')
    language_at_ending_pattern = re.compile(r'@en \.\n$')

    try:
        for line in source_file:
            line = line.decode("utf-8")

            # skip commented out lines
            if comment_pattern.search(line):
                continue

            token_size = len(line.split())
            if token_size > max_tokens:
                max_tokens = token_size

            # skip lines with 20 tokens or less, because they tend to contain noise
            # (this may vary in your dataset)
            if token_size <= 20:
                continue

            first_url = first_url_pattern.search(line)
            if first_url:
                url = first_url.group(1)
                # also remove the url from the string; the pattern is anchored
                # at the start, so slicing past the match is equivalent
                line = line[first_url.end():]
            else:
                url = ''

            # remove the second url from the string: we don't need to capture it,
            # because it is repetitive across all abstracts
            line = second_url_pattern.sub('', line)

            # remove some strange line ending, that occurs in many abstracts
            line = language_at_ending_pattern.sub('', line)

            # form the input object for this abstract
            doc = {
                "_text_": line,
                "url": url,
                "id": count + 1
            }

            yield doc
            count += 1

            if count % bulk_size == 0:
                print(f"Processed {count} documents", end="\r")

            if count == max_docs:
                break
    finally:
        # Runs on normal completion, on an exception, and when the consumer
        # abandons the generator early, so the file handle is never leaked.
        source_file.close()

    print("Maximum tokens observed per abstract: {}".format(max_tokens))

If you run out of RAM, lower the value of MAX_DOCS.

In [None]:
# Cap on how many abstracts are parsed; reduce it if memory runs short.
MAX_DOCS = 1_000_000

# Open the compressed dump for reading and hand it to the parser. The parser
# is a generator, so no line is read until docs_iter is consumed.
source_file = bz2.BZ2File("long_abstracts_en.ttl.bz2", mode="r")
docs_iter = parse_dbpedia_data(source_file, max_docs=MAX_DOCS)

**Creating a pandas dataframe**

In [None]:
# Drain the parser once, keeping only the two fields used downstream.
# Collected under the name `records` (not `id`/`text`) so the `id` builtin is
# not shadowed for the rest of the notebook.
records = [(doc['id'], doc['_text_']) for doc in docs_iter]

data = pd.DataFrame(records, columns=['id', 'text'])

In [None]:
# Preview the first rows of the parsed abstracts.
data.head()

**Generating embeddings using BERT**

Generating embeddings is a time-consuming process. Please use a GPU or lower the value of MAX_DOCS. On Google Colab, expect around 1.5 hours for 1M documents with a GPU.

In [None]:
!pip install --quiet sentence_transformers==1.0.4
!pip install --quiet tqdm==4.41.1

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Expensive: downloads the model and creates the embeddings.
import numpy as np

model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the texts in 4 chunks and save each chunk to its own file
# (emb_0.npy ... emb_3.npy). The chunk number comes from enumerate so that
# every chunk gets a distinct file name instead of overwriting emb_0.npy.
texts = data['text'].apply(str).values
for i, chunk in enumerate(np.array_split(texts, 4)):
    sentence_embeddings = model.encode(chunk, show_progress_bar=True)
    with open('emb_' + str(i) + '.npy', 'wb') as f:
        np.save(f, sentence_embeddings)

In [None]:
import numpy as np

# Read the 4 saved chunks back in order and flatten them into a single list
# holding one embedding (as a plain Python list) per document.
vec_embeds = []
for part in range(4):
    with open('emb_' + str(part) + '.npy', 'rb') as fh:
        vec_embeds.extend(np.load(fh).tolist())
print(len(vec_embeds))

In [None]:
# Attach one embedding per row. pandas aligns the new Series with data's index
# by label, so this assumes data still has its default 0..n-1 index and that
# vec_embeds holds exactly one entry per row — TODO confirm if earlier cells change.
data['embeddings'] = pd.Series(vec_embeds)

## **Pinecone**

In [None]:
!pip install --quiet -U pinecone-client

In [None]:
import pinecone

In [None]:
# load Pinecone API key
# Replace the placeholder below with your own key before running this cell.
api_key = 'YOUR_API_KEY'

pinecone.init(api_key=api_key)

# Name of the index that the cells below create, benchmark and delete.
index_name = 'bert-stats-test'


[Get the Pinecone API key](https://www.pinecone.io/start/) if you don’t have one already.

In [None]:
# Pair every document id with its embedding as (id, vector) tuples.
items_to_upload = list(zip(data['id'].tolist(), data['embeddings'].tolist()))

We define a variable that sets how many vectors are sent in each query batch. This makes our results comparable to the ones published in the blog post: by querying in batches and then dividing the elapsed time by the batch size, we minimize the influence of network latency.

In [None]:
# Number of copies of a query vector sent in a single query call; the elapsed
# time of that call is divided by this value to get a per-vector figure.
BATCH_VEC = 1000

In [None]:
def upload_items(items_to_upload: List, batch_size: int, target_index=None) -> float:
    """
    Upsert items into an index and time the call.

    :param items_to_upload: items passed unchanged to ``upsert`` as ``items``
    :param batch_size: passed unchanged to ``upsert`` as ``batch_size``
    :param target_index: object exposing ``upsert(items=..., batch_size=...)``;
        when omitted, the module-level ``index`` variable is used, so existing
        two-argument calls behave as before
    :return: elapsed time of the upsert call in minutes, measured with
        ``time.perf_counter``
    """
    print(f"\nUpserting {len(items_to_upload)} vectors...")

    # Resolve the destination before starting the clock. The global `index` is
    # only looked up when no explicit index was supplied.
    destination = index if target_index is None else target_index

    start = time.perf_counter()
    destination.upsert(items=items_to_upload, batch_size=batch_size)
    end = time.perf_counter()
    return (end - start) / 60.0  # minutes

def restart_service(index_name: str, shards: int, timeout: int = 300):
    """
    Recreate the named Pinecone index from scratch and return a handle to it.

    An existing index with the same name is deleted first; a new index with
    the cosine metric and the requested shard count is then created.

    :param index_name: name of the index to (re)create
    :param shards: passed to ``pinecone.create_index`` as ``shards``
    :param timeout: not used by this function; kept in the signature
    :return: the ``pinecone.Index`` object for ``index_name``
    """
    already_exists = index_name in pinecone.list_indexes()
    if already_exists:
        pinecone.delete_index(index_name)

    pinecone.create_index(index_name, metric='cosine', shards=shards)
    return pinecone.Index(index_name)

def query(test_vectors: List, index) -> float:
    """
    Measure the average per-vector query time against ``index``.

    Each test vector is repeated ``BATCH_VEC`` times and sent in one
    ``index.query`` call. The elapsed time of that call is divided by
    ``BATCH_VEC`` to estimate the time for a single vector, and the estimates
    of all test vectors are averaged.

    :param test_vectors: items whose element ``[1]`` is the vector to query with
    :param index: object exposing ``query(queries=...)``
    :return: mean per-vector query time over all test vectors, in milliseconds
    :raises statistics.StatisticsError: if ``test_vectors`` is empty
    """
    print("\nQuerying...")
    per_vector_times = []

    # Every test vector is measured. An unconditional `break` used to end the
    # loop after the first one, so the mean covered a single sample.
    for test_vector in test_vectors:
        # Build the batch before starting the clock so that only the query
        # call itself is timed.
        batch = BATCH_VEC * [test_vector[1]]

        start = time.perf_counter()
        index.query(queries=batch)
        end = time.perf_counter()

        # Time for the whole batch divided by its size = time per vector.
        per_vector_times.append((end - start) / BATCH_VEC)

    return mean(per_vector_times) * 1000  # milliseconds

Testing uploading and querying

In [None]:
# Forwarded to index.upsert as batch_size.
BATCH_SIZE = 1000
# Numbers of vectors to upload in each benchmark run.
NUMBER_OF_DOCS = [200000]

# Results keyed by the number of uploaded vectors.
upsert_times = {}
query_times = {}

for doc_size in progressbar(NUMBER_OF_DOCS):
    # Skip runs that ask for more vectors than were prepared.
    if doc_size > len(items_to_upload):
        print(f"There are no {doc_size} vectors to be uploaded.")
        continue

    # The first 10 uploaded items double as the query vectors.
    test_vectors = items_to_upload[:10]
    # Start every run from a freshly created index. The result must be bound
    # to the global name `index` before upload_items runs, because a
    # two-argument upload_items call reads `index` from module scope.
    index = restart_service(index_name, shards=3)
    time_for_upsert = upload_items(items_to_upload[:doc_size], BATCH_SIZE)
    time_for_query = query(test_vectors, index)
    upsert_times[doc_size] = time_for_upsert
    query_times[doc_size] = time_for_query

Stop the service

In [None]:
# Delete the benchmark index if it still exists.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

## **Displaying results**


In [None]:
# One row per benchmark run: number of vectors, upsert time and mean query time.
time_results = pd.DataFrame({
    'number_of_docs': list(upsert_times.keys()),
    'indexing_time(min)': list(upsert_times.values()),
    'avg_search_speed(ms)': list(query_times.values()),
})

# Rough index size: vectors * components per vector * 32 (presumably bits per
# component), divided by 8,000,000 to convert bits to megabytes.
vector_dim = len(items_to_upload[0][1])
total_bits = time_results['number_of_docs'] * vector_dim * 32
time_results['index_size(mb)'] = total_bits / 8000000  # megabytes

In [None]:
# Show the benchmark summary table.
time_results

In [None]:
# Bar chart of the indexing time (minutes) for each number of documents.
time_results.plot(x="number_of_docs", y=["indexing_time(min)"], kind="bar")

In [None]:
# Bar chart of the average search time (milliseconds) for each number of documents.
time_results.plot(x="number_of_docs", y=["avg_search_speed(ms)"], kind="bar")