<a href="https://colab.research.google.com/github/nicks165/VectorDatabases/blob/main/Pinecone_evaluation_cohere_wiki_english_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cohere example

In [None]:
!pip install -U cohere pinecone-client datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pinecone

def create_return_index():
  """Initialise the Pinecone client and return a handle to the benchmark index.

  The API key is read from the ``PINECONE_API_KEY`` environment variable so that
  no credential is stored in the notebook. ``PINECONE_ENVIRONMENT`` may override
  the environment name and defaults to ``us-east-1-aws``.

  The index (768 dimensions, cosine metric) is created only if it is not already
  listed for the account.

  Returns:
    The ``pinecone.Index`` object for ``cohere-wiki-35m-en-cosine``.

  Raises:
    RuntimeError: if ``PINECONE_API_KEY`` is unset or empty.
  """
  import os  # local import: keeps this cell runnable on its own

  api_key = os.environ.get("PINECONE_API_KEY")
  if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before creating the index")

  pinecone.init(api_key=api_key,
                environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east-1-aws"))

  index_name = 'cohere-wiki-35m-en-cosine'
  dimension = 768

  # only create index if it doesn't exist
  if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dimension,
        metric='cosine'
    )

  # now connect to the index
  return pinecone.Index(index_name)

In [None]:
from IPython.utils.text import string
import time

def upsert_one_record(index, co):
  """Embed one fixed sentence, upsert it as a single record and print the latency.

  Args:
    index: object exposing ``upsert(vectors=...)`` (the Pinecone index handle).
    co: object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence (the Cohere client).

  Only the ``index.upsert`` call is timed; computing the embedding is excluded.
  """
  import uuid  # local import: the notebook-level `import uuid` sits in a later cell

  text = "How are you WORLD?"
  metadata = {"text": text, "views": 600}
  embedding = co.embed(texts=[text], model='multilingual-22-12').embeddings[0]

  # One (id, vector, metadata) tuple, passed as a real list rather than a
  # one-shot zip iterator.
  new_records = [(uuid.uuid4().hex, embedding, metadata)]

  start_time = time.time()
  index.upsert(vectors=new_records)

  print("Updated with one Record and Time taken --- %s seconds ---" % (time.time() - start_time))

In [None]:
import numpy

def issue_measure_query_time(index, co):
  """Run three fixed top-5 similarity queries and print their average latency.

  Args:
    index: object exposing ``query(vector=..., top_k=...)`` (the Pinecone index).
    co: object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence (the Cohere client).

  Only the ``index.query`` call is timed; embedding the query text is excluded.
  """
  queries = [
      "What was the cause of the major recession in the early 20th century?",
      "Where is Mount Everest?",
      "something else",
  ]

  # Embed all queries once, up front, instead of re-embedding the whole list
  # on every loop iteration.
  query_vectors = co.embed(texts=queries, model='multilingual-22-12').embeddings

  timeTakenList = []

  # Issue every query (previously range(0, 2) silently skipped the third one)
  # and take the average.
  for query_vector in query_vectors:
    query_start_time = time.time()
    index.query(vector=query_vector, top_k=5)
    timeTakenList.append(time.time() - query_start_time)

  averageTimeTaken = numpy.average(timeTakenList)
  print("Average time taken for search = {0} ".format(averageTimeTaken))

In [None]:
def conditional_search(index, co):
  """Run three fixed metadata-filtered top-5 queries and print their average latency.

  Each query is restricted to records whose ``views`` metadata is greater than
  300 and asks for metadata to be returned.

  Args:
    index: object exposing ``query(vector=..., filter=..., top_k=...,
      include_metadata=...)`` (the Pinecone index).
    co: object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence (the Cohere client).

  Only the ``index.query`` call is timed; embedding the query text is excluded.
  """
  queries = [
      "What was the cause of the major recession in the early 20th century?",
      "Where is Mount Everest?",
      "something else",
  ]
  near_vectors = co.embed(texts=queries, model='multilingual-22-12').embeddings

  views_filter = {"views": {"$gt": 300}}
  timeTakenList = []

  # Issue every query (previously range(0, 2) silently skipped the third one)
  # and take the average.
  for near_vector in near_vectors:
    query_start_time = time.time()

    index.query(
        vector=near_vector,
        filter=views_filter,
        top_k=5,
        include_metadata=True
    )

    timeTakenList.append(time.time() - query_start_time)

  averageTimeTaken = numpy.average(timeTakenList)
  print("Average time taken for conditional search = {0} ".format(averageTimeTaken))

In [None]:
import time
import uuid
from multiprocessing.pool import Pool
import itertools

# We are expecting no more than 35M entries
MAX_ENTRIES = 35000000

def upsert_db_measure(docs, doc_embeddings, index, batch_size, total_inserted, workload_start_time):
  """Upsert one batch and, at every 100,000-record milestone, run the latency probes.

  Args:
    docs: sequence of dicts with ``'text'`` and ``'views'`` keys, one per record.
    doc_embeddings: sequence of vectors, parallel to ``docs``.
    index: object exposing ``upsert(vectors=...)`` (the Pinecone index).
    batch_size: maximum number of records to write from this batch.
    total_inserted: running count of records written before this call.
    workload_start_time: ``time.time()`` value taken when the workload started;
      used to report cumulative insert time.

  Returns:
    The updated running count, advanced by the number of records actually sent.

  Raises:
    RuntimeError: if the ``COHERE_API_KEY`` environment variable is unset or empty.
  """
  import os  # local import: keeps this cell runnable on its own

  # Validate the credential on every call so a missing key fails on the first
  # batch rather than at the first milestone, minutes into the load.
  api_key = os.environ.get("COHERE_API_KEY")
  if not api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running the workload")

  # create a list of unique ids for each item to be inserted
  ids = [uuid.uuid4().hex for _ in range(batch_size)]
  # create list of metadata dictionaries
  meta = [{'text': doc['text'], 'views': doc["views"]} for doc in docs]

  # create list of (id, vector, metadata) tuples to be upserted;
  # zip stops at the shortest input, so a short batch yields fewer tuples
  to_upsert = list(zip(ids, doc_embeddings, meta))
  if not to_upsert:
    # nothing to write: avoid an empty upsert and a spurious milestone at 0
    return total_inserted

  index.upsert(vectors=to_upsert)

  # count what was actually sent, not the nominal batch size
  total_inserted += len(to_upsert)

  if total_inserted in range(0, MAX_ENTRIES, 100000):
    # The Cohere client is only needed for the probes, so build it lazily.
    co = cohere.Client(api_key)
    print("=======================================================================================================")
    print("For {0} entries, time taken for inserts = {1} ".format(total_inserted, time.time() - workload_start_time))
    upsert_one_record(index, co)
    issue_measure_query_time(index, co)
    conditional_search(index, co)

  return total_inserted

In [None]:
def execute_test_query(index=None, co=None):
  """Run one sample top-5 query, print its latency and print each match.

  Args:
    index: object exposing ``query(vector=..., top_k=..., include_metadata=...)``.
      When omitted, the index is obtained from ``create_return_index()``.
    co: object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence. When omitted, a ``cohere.Client`` is built from
      the ``COHERE_API_KEY`` environment variable.

  Raises:
    RuntimeError: if ``co`` is omitted and ``COHERE_API_KEY`` is unset or empty.
  """
  import os  # local import: keeps this cell runnable on its own

  if co is None:
    api_key = os.environ.get("COHERE_API_KEY")
    if not api_key:
      raise RuntimeError(
          "Set the COHERE_API_KEY environment variable or pass a Cohere client")
    co = cohere.Client(api_key)

  if index is None:
    index = create_return_index()

  query = "What was the cause of the major recession in the early 20th century?"

  # embed() returns one embedding per input text; the query needs the single vector
  query_vector = co.embed(texts=[query], model='multilingual-22-12').embeddings[0]

  query_start_time = time.time()

  # query, returning the top 5 most similar results
  res = index.query(vector=query_vector, top_k=5, include_metadata=True)

  print(" For 1 query, time taken for search = {0} ".format(time.time() - query_start_time))

  # the hits are under the 'matches' key of the response
  for item in res['matches']:
    print(item)

In [None]:
from datasets import load_dataset
import torch
import cohere

# Create a generator that yields chunks of the dataset
def chunk_generator(dataset, chunk_size):
  """Yield successive ``dataset[start:start + chunk_size]`` slices, in order.

  The final slice may be shorter than ``chunk_size`` when ``len(dataset)`` is
  not a multiple of it.
  """
  offsets = range(0, len(dataset), chunk_size)
  yield from (dataset[start:start + chunk_size] for start in offsets)

def load_cohere_dataset():
  """Return the ``train`` split of ``Cohere/wikipedia-22-12-en-embeddings``."""
  # load_dataset brings the dataset to disk in Arrow table format
  wiki_embeddings = load_dataset("Cohere/wikipedia-22-12-en-embeddings", split="train")
  return wiki_embeddings

def load_execute_workload(index, dataset, limit=-1, chunk_size=500):
  """Stream ``dataset`` into ``index`` in batches and print the total elapsed time.

  Each chunk is turned into parallel lists of metadata dicts and embeddings and
  handed to ``upsert_db_measure``, which writes the batch and returns the
  updated running count.

  Args:
    index: index handle passed through to ``upsert_db_measure``.
    dataset: sliceable dataset whose slices are dicts of columns containing
      ``'text'``, ``'views'`` and ``'emb'``.
    limit: stop once at least this many records were loaded; ``-1`` (or any
      non-positive value) loads everything.
    chunk_size: number of records per upsert batch and kept in memory at once.
  """
  max_docs_loaded = 0
  start_time = time.time()

  # Iterate over the chunks
  for chunk in chunk_generator(dataset, chunk_size):
    # Build the batch from what the chunk actually holds: the final chunk can
    # be shorter than chunk_size, and indexing range(chunk_size) would overrun it.
    docs = [{"text": text, "views": views}
            for text, views in zip(chunk["text"], chunk['views'])]
    doc_embeddings = list(chunk['emb'])

    max_docs_loaded = upsert_db_measure(docs, doc_embeddings, index, len(docs), max_docs_loaded, start_time)

    # >= rather than ==: a limit that is not a multiple of chunk_size must still stop the loop
    if limit > 0 and max_docs_loaded >= limit:
      break

  print ("Workload Completed! Succesfully executed workload for {0} entries with total time {1}"
    .format(max_docs_loaded, time.time() - start_time))

Loading a big dataset is expensive. Keep this step in a separate cell so that the main functions can be executed and debugged without reloading it.

In [None]:
# Load the dataset once here; the main workload cell reuses the `dataset` variable.
dataset = load_cohere_dataset()



In [None]:
# Debugging code - to be deleted afterwards
#docs = []
#doc_embeddings = []
#for chunk in chunk_generator(dataset, 2):
#  for i in range(0,2):
#    docs.append({"text" : chunk["text"][i], "views" : chunk['views'][i]})
#    doc_embeddings.append(chunk['emb'][i])
#    print (chunk)

#  break
  

{'id': [0, 1], 'title': ['Deaths in 2022', 'YouTube'], 'text': ['The following notable deaths occurred in 2022. Names are reported under the date of death, in alphabetical order. A typical entry reports information in the following sequence:', 'YouTube is a global online video sharing and social media platform headquartered in San Bruno, California. It was launched on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim. It is owned by Google, and is the second most visited website, after Google Search. YouTube has more than 2.5 billion monthly users who collectively watch more than one billion hours of videos each day. , videos were being uploaded at a rate of more than 500 hours of content per minute.'], 'url': ['https://en.wikipedia.org/wiki?curid=69407798', 'https://en.wikipedia.org/wiki?curid=3524766'], 'wiki_id': [69407798, 3524766], 'views': [5674.44921875, 5409.56103515625], 'paragraph_id': [0, 0], 'langs': [38, 184], 'emb': [[0.2865696847438812, -0.03181683272123337,

Main workload function

In [None]:

#from multiprocessing.pool import Pool

#global internal_insert_method
#global internal_payload

#def internal_payload(index, to_upsert):
#  return [(index, to_upsert[::2]), (index, to_upsert[1::2])]

#def internal_insert_method(index, vectors):
#  index.upsert(vectors=vectors)

# initialize Pinecone and create index
index = create_return_index()

# execute the workload against the `dataset` loaded in the earlier cell
load_execute_workload(index, dataset)

For 100000 entries, time taken for inserts = 328.3341681957245 
Updated with one Record and Time taken --- 0.07896256446838379 seconds ---
Average time taken for search = 0.10893785953521729 
Average time taken for conditional search = 0.10816872119903564 
For 200000 entries, time taken for inserts = 649.5869767665863 
Updated with one Record and Time taken --- 0.0813286304473877 seconds ---
Average time taken for search = 0.11520087718963623 
Average time taken for conditional search = 0.11725997924804688 
For 300000 entries, time taken for inserts = 928.8132243156433 
Updated with one Record and Time taken --- 0.08108186721801758 seconds ---
Average time taken for search = 0.12428629398345947 
Average time taken for conditional search = 0.11459982395172119 
For 400000 entries, time taken for inserts = 1254.1247761249542 
Updated with one Record and Time taken --- 0.0834054946899414 seconds ---
Average time taken for search = 0.10823285579681396 
Average time taken for conditional sea



For 16500000 entries, time taken for inserts = 53894.508600473404 
Updated with one Record and Time taken --- 0.35337209701538086 seconds ---
Average time taken for search = 0.15692007541656494 
Average time taken for conditional search = 0.16180527210235596 
For 16600000 entries, time taken for inserts = 54254.22629857063 
Updated with one Record and Time taken --- 0.3440392017364502 seconds ---
Average time taken for search = 0.14885163307189941 
Average time taken for conditional search = 0.13963830471038818 
For 16700000 entries, time taken for inserts = 54624.203129053116 
Updated with one Record and Time taken --- 0.35073089599609375 seconds ---
Average time taken for search = 0.1645108461380005 
Average time taken for conditional search = 0.1448218822479248 
For 16800000 entries, time taken for inserts = 54984.2496547699 
Updated with one Record and Time taken --- 0.3630332946777344 seconds ---
Average time taken for search = 0.15541470050811768 
Average time taken for condition