<a href="https://colab.research.google.com/github/nicks165/VectorDatabases/blob/main/Weaviate_evaluation_cohere_wiki_english_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://weaviate.io/developers/weaviate/quickstart/

In [None]:
!pip install weaviate-client cohere datasets

Initialize Weaviate

In [None]:
def instantiateWeaviateClient():
  """Create an authenticated Weaviate client for the evaluation cluster.

  The cluster URL and API key are taken from the ``WEAVIATE_URL`` and
  ``WEAVIATE_API_KEY`` environment variables when they are set; otherwise
  the values originally embedded in this notebook are used, so existing
  runs keep working unchanged.

  Returns:
    The ``weaviate.Client`` built from the URL and API-key credentials.
  """
  # Local import: the notebook's shared imports live in a later cell.
  import os

  # NOTE(review): a key committed to a notebook should be treated as leaked;
  # rotate it and provide the new one through the environment instead.
  api_key = os.environ.get("WEAVIATE_API_KEY", "ODYjhwMLhyrzL3daMEx7PsKna3AtBDJZU98J")
  url = os.environ.get("WEAVIATE_URL", "https://osv1ykq1soaj5kb6lb1uq.gcp-d.weaviate.cloud")

  auth_config = weaviate.AuthApiKey(api_key=api_key)
  return weaviate.Client(
    url=url,
    auth_client_secret=auth_config,
  )

Initialize index/collection (class in weaviate)


In [None]:
import time
import string
import random
import json
import weaviate
import cohere
import numpy

def initialize_class(client):
  """Make sure the ``wikipedia_articles`` class exists and return its definition.

  Args:
    client: Object exposing ``schema.exists(name)`` and
      ``schema.create_class(definition)``.

  Returns:
    The class definition dict (class name, vectorizer setting and the
    vector index distance metric).
  """
  class_name = "wikipedia_articles"

  # Class definition object. Weaviate's autoschema feature will infer
  # properties when importing.
  class_obj = {
      "class": class_name,
      "vectorizer": "none",
      "vectorIndexConfig": {"distance": "cosine"},
  }

  # IMPORTANT: the class only has to be created once per cloud account,
  # so creation is skipped when the schema already contains it.
  if not client.schema.exists(class_name):
    client.schema.create_class(class_obj)
    return class_obj

  print("Class exists")
  return class_obj

Confirm shape and save the dataset to disk

Load embeddings and metadata into Weaviate -
 1. Load in batch
 2. After hitting a threshold, insert 1 object and measure time
 3. Measure query performance for the same threshold

In [None]:
def upsert_one_record(client, co, class_obj):
  """Insert one synthetic record and print how long the insert call took.

  The record text ends in a random 7-character suffix (uppercase letters
  and digits). Its vector is obtained from ``co.embed`` using the
  ``multilingual-22-12`` model.

  Args:
    client: Object exposing ``data_object.create(properties, class_name, vector=...)``.
    co: Object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence.
    class_obj: Class definition dict; only its ``"class"`` entry is read.
  """
  suffix_length = 7
  alphabet = string.ascii_uppercase + string.digits
  suffix = ''.join(random.choices(alphabet, k=suffix_length))
  record_text = "This is a new record " + suffix

  embed_response = co.embed(texts=[record_text], model='multilingual-22-12')
  record_vector = embed_response.embeddings[0]

  # Metadata stored alongside the vector.
  properties = {"text": "New Data added " + record_text}

  # Only the single-object create call is timed; the embedding request
  # above happens before the clock starts.
  started_at = time.time()
  data_uuid = client.data_object.create(
    properties,
    class_obj["class"],
    vector=record_vector,
  )
  elapsed = time.time() - started_at

  print("Updated with one Record (uuid: {0}) and Time taken --- {1} seconds ---".format(data_uuid, elapsed))

Querying the dataset

In [None]:
def issue_measure_query_time(client, co, class_obj):
  """Time a near-vector search for each benchmark query and print the mean.

  All benchmark queries are embedded with a single ``co.embed`` call
  before any timing starts, then one search per query is issued and the
  wall-clock time of each search is averaged.

  Args:
    client: Object exposing the ``query.get(...).with_near_vector(...)
      .with_limit(...).with_additional(...).do()`` call chain.
    co: Object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence, one entry per input text.
    class_obj: Class definition dict; only its ``"class"`` entry is read.
  """
  queries = [
      "What was the cause of the major recession in the early 20th century?",
      "Where is Mount Everest?",
      "something else",
  ]

  # Embed once for all queries: the previous loop re-embedded the whole
  # list on every iteration just to pick out a single vector.
  query_embeddings = co.embed(texts=queries, model='multilingual-22-12').embeddings

  timeTakenList = []

  # Issue one search per query (all three, not just the first two) and
  # take the average.
  for embedding in query_embeddings:
    nearVector = {"vector": embedding}

    query_start_time = time.time()

    # The search result itself is not inspected; only its latency matters.
    (client.query
     .get(class_obj["class"], ["text"])
     .with_near_vector(nearVector)
     .with_limit(2)
     .with_additional(['certainty'])
     .do())

    timeTakenList.append(time.time() - query_start_time)

  averageTimeTaken = numpy.average(timeTakenList)
  print("Average time taken for search = {0} ".format(averageTimeTaken))

Conditional filtering on the search query

In [None]:
def conditional_search(client, co, class_obj):
  """Time a filtered near-vector search for each benchmark query and print the mean.

  Each search combines the query vector with a ``where`` filter on the
  ``text`` property (operator ``Like``, value ``recession``). All queries
  are embedded with a single ``co.embed`` call before timing starts.

  Args:
    client: Object exposing the ``query.get(...).with_near_vector(...)
      .with_where(...).with_limit(...).with_additional(...).do()`` call chain.
    co: Object exposing ``embed(texts=..., model=...)`` whose result has an
      ``embeddings`` sequence, one entry per input text.
    class_obj: Class definition dict; only its ``"class"`` entry is read.
  """
  queries = [
      "What was the cause of the major recession in the early 20th century?",
      "Where is Mount Everest?",
      "something else",
  ]

  # Embed once for all queries: the previous loop re-embedded the whole
  # list on every iteration just to pick out a single vector.
  query_embeddings = co.embed(texts=queries, model='multilingual-22-12').embeddings

  timeTakenList = []

  # Issue one filtered search per query (all three, not just the first
  # two) and take the average.
  for embedding in query_embeddings:
    nearVector = {"vector": embedding}

    query_start_time = time.time()

    # The search result itself is not inspected; only its latency matters.
    (client.query
     .get(class_obj["class"], ["text"])
     .with_near_vector(nearVector)
     .with_where({
       "path": ["text"],
       "operator": "Like",
       "valueText": "recession"
       })
     .with_limit(2)
     .with_additional(['certainty'])
     .do())

    timeTakenList.append(time.time() - query_start_time)

  averageTimeTaken = numpy.average(timeTakenList)
  print("Average time taken for conditional search = {0} ".format(averageTimeTaken))

In [None]:
import time
import uuid
# Upper bound (exclusive) on the running total for which checkpoint
# measurements are still taken.
MAX_ENTRIES = 35000000

def upsert_db_measure(docs, doc_embeddings, client, batch_size, total_inserted, workload_start_time, class_obj, previous_run_time):
  """Batch-insert one chunk of documents and run measurements at checkpoints.

  Every document is queued together with its embedding through the
  client's batch context manager. The running total is then advanced by
  the number of documents actually queued. Whenever the new total is a
  multiple of 100000 below ``MAX_ENTRIES``, the elapsed insert time is
  printed and the single-insert, search and conditional-search
  measurements are run.

  Args:
    docs: Sequence of dicts, each with a ``"text"`` entry.
    doc_embeddings: Sequence indexable in parallel with ``docs``.
    client: Object exposing ``batch`` as a context manager with a
      ``batch_size`` attribute and ``add_data_object(...)``.
    batch_size: Value assigned to the batch's ``batch_size``.
    total_inserted: Number of documents inserted before this call.
    workload_start_time: ``time.time()`` value captured when the current
      run started.
    class_obj: Class definition dict; only its ``"class"`` entry is read.
    previous_run_time: Seconds added to the elapsed time of the current
      run when reporting the total.

  Returns:
    ``total_inserted`` plus the number of documents in ``docs``.

  Raises:
    IndexError: If ``doc_embeddings`` has fewer entries than ``docs``.
  """
  checkpoint_interval = 100000

  # Configure a batch process, calls flush at the end of with block implicitly
  with client.batch as batch:
    batch.batch_size = batch_size

    for i, doc in enumerate(docs):
      # create metadata dictionary
      properties = {
          "text": doc["text"],
      }

      # Indexing (rather than zip) keeps a too-short embeddings list an error.
      client.batch.add_data_object(properties, class_obj["class"], vector=doc_embeddings[i])

  # Count what was really queued; adding batch_size over-counts when the
  # final chunk of a dataset is shorter than a full batch.
  total_inserted += len(docs)

  if total_inserted in range(0, MAX_ENTRIES, checkpoint_interval):
    # The Cohere client is only needed for the checkpoint measurements, so
    # it is created here instead of once per batch.
    import os

    # NOTE(review): a key committed to a notebook should be treated as
    # leaked; rotate it and set COHERE_API_KEY in the environment instead.
    co = cohere.Client(os.environ.get("COHERE_API_KEY", "o7lTEJeC1QHjU5I4Ee6U2I0m6l5wCOUPWqwoGM7H"))

    print("=======================================================================================================")
    total_time = (time.time() - workload_start_time) + previous_run_time
    print("For {0} entries, time taken for inserts = {1} ".format(total_inserted, total_time))
    upsert_one_record(client, co, class_obj)
    issue_measure_query_time(client, co, class_obj)
    conditional_search(client, co, class_obj)

  return total_inserted

Load the dataset and get the embeddings from the Cohere dataset on Hugging Face

In [None]:
from datasets import load_dataset
import torch
import cohere

# Create a generator that yields chunks of the dataset
def chunk_generator(dataset, chunk_size, starting_index):
  """Lazily yield successive slices of ``dataset``.

  Slices start at ``starting_index`` and advance by ``chunk_size``; the
  last slice may be shorter than ``chunk_size``.

  Args:
    dataset: Any object supporting ``len()`` and slice indexing.
    chunk_size: Step between slice starts and maximum slice length.
    starting_index: Index of the first element to include.
  """
  chunk_starts = range(starting_index, len(dataset), chunk_size)
  yield from (dataset[start:start + chunk_size] for start in chunk_starts)

def load_cohere_dataset():
  """Load the ``train`` split of ``Cohere/wikipedia-22-12-en-embeddings``.

  Returns:
    Whatever ``load_dataset`` returns for that dataset name and split.
  """
  # bring dataset to disk in Arrow table format
  return load_dataset("Cohere/wikipedia-22-12-en-embeddings", split="train")

def load_execute_workload(client, class_obj, dataset):
  """Stream the dataset into the database chunk by chunk and report total time.

  The dataset is read in chunks of 1000 rows starting at the hard-coded
  resume offset below. Each chunk's ``text`` and ``emb`` columns are handed
  to ``upsert_db_measure``, which returns the updated running total.

  Args:
    client: Database client passed through to ``upsert_db_measure``.
    class_obj: Class definition dict passed through to ``upsert_db_measure``.
    dataset: Sliceable dataset whose slices expose ``"text"`` and ``"emb"``
      columns (presumably the result of ``load_cohere_dataset`` — confirm).
  """
  limit = -1 # keep -1 for all, else update to a positive number to limit

  chunk_size = 1000 # size of batch upserts and items kept in memory

  # If a run fails we want to restart and keep subsequent time measurements
  # valid: paste the runtime and document count of the interrupted run here,
  # or set both to 0 when starting from scratch.
  previous_run_time = 124516.61101400756 # set to 0 when starting from scratch
  previous_docs_loaded = 30500000 # set to 0 when starting from scratch
  max_docs_loaded = previous_docs_loaded

  start_time = time.time()

  # Iterate over the chunks
  for chunk in chunk_generator(dataset, chunk_size, previous_docs_loaded):
    # Build the lists from the rows the chunk really contains. Indexing
    # range(chunk_size) raised IndexError on a final chunk shorter than
    # chunk_size.
    docs = [{"text": text} for text in chunk["text"]]
    doc_embeddings = list(chunk['emb'])

    # Pass the real chunk length as the batch size so the running total
    # stays exact even for a short final chunk.
    max_docs_loaded = upsert_db_measure(docs, doc_embeddings, client, len(docs), max_docs_loaded, start_time, class_obj, previous_run_time)

    if (limit > 0 and max_docs_loaded >= limit):
      break

  total_time = (time.time() - start_time) + previous_run_time
  print ("succesfully executed workload for {0} entries with total time {1}"
    .format(max_docs_loaded, total_time))

Loading a big dataset is expensive. Separate this step so that executing and debugging the main functions stays simple.

In [None]:
# Load the dataset once in its own cell so later cells can reuse `dataset`
# without loading it again.
dataset = load_cohere_dataset()

Separating from main execution.

In [None]:
#client = instantiateWeaviateClient()
#class_obj = initialize_class(client)

In [None]:
# Connect to Weaviate and make sure the target class exists.
client = instantiateWeaviateClient()
class_obj = initialize_class(client)
# Execute the insert/measure workload on the `dataset` loaded in an earlier cell.
load_execute_workload(client, class_obj, dataset)