In [1]:
import datetime
import pickle
import uuid
import datetime
import numpy as np
import time

# Elasticsearch

In [None]:
import elasticsearch

In [None]:
# Show which version of the elasticsearch client package is installed in this kernel.
elasticsearch.__version__

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP).
# NOTE(review): verify_certs=False switches off TLS certificate checks — confirm
# it is still wanted if this URL is ever changed to https.
es = Elasticsearch(
    hosts="http://localhost:9200",
    verify_certs=False,
)

In [None]:
# Name of the index that the cells below write to, refresh and search.
index_name = "test-index"

In [None]:
# Sample document to index; the timestamp is taken at the moment this cell runs.
doc = dict(
    author="kimchy",
    text="Elasticsearch: cool. bonsai cool.",
    timestamp=datetime.datetime.now(),
)

In [None]:
# Index `doc` under id 1 and keep the response; the next cell prints its 'result'.
resp = es.index(
    index=index_name,
    id=1,
    document=doc,
)


In [None]:
# Outcome reported by the indexing call made in the previous cell.
print(resp['result'])

# Read the same document back by its id.
resp = es.get(index=index_name, id=1)
print(resp['_source'])

# Refresh the index before searching it (presumably so the search below
# already sees the document indexed above).
es.indices.refresh(index=index_name)

# Match-all search: report the total, then print one line per returned hit.
resp = es.search(index=index_name, query={"match_all": {}})
hits_section = resp['hits']
print("Got %d Hits:" % hits_section['total']['value'])
for hit in hits_section['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

# Milvus

Adapted from the official pymilvus example: https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb

In [None]:
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)


In [None]:
# Number of rows to generate and the dimensionality of each embedding vector.
num_entities = 3000
dim = 8


In [None]:
# Name of the Milvus collection that the following cells drop, create and query.
collection_name = "hello_milvus"

In [None]:
!ls

In [None]:
# Open the connection registered under the "default" alias to Milvus on localhost:19530.
connections.connect(
    "default",
    host="localhost",
    port="19530",
)


In [None]:
# Start clean: drop the collection if an earlier run left one behind.
collection_already_exists = utility.has_collection(collection_name)
if collection_already_exists:
    utility.drop_collection(collection_name)

In [None]:
# Display the collections that currently exist on the connected server.
utility.list_collections()

## Create collection

In [None]:
# Schema: a caller-supplied string primary key, a scalar field and the vector field.
pk_field = FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100)
random_field = FieldSchema(name="random", dtype=DataType.DOUBLE)
embeddings_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
fields = [pk_field, random_field, embeddings_field]

schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")

# Create the collection under `collection_name` with the schema above.
hello_milvus = Collection(collection_name, schema, consistency_level="Strong")

In [None]:
# Seeded generator so every run inserts the same random data.
rng = np.random.default_rng(seed=19530)

# Column-oriented payload: one sequence per schema field (pk, random, embeddings).
entities = [
    # provide the pk field because `auto_id` is set to False
    [str(i) for i in range(num_entities)],
    rng.random(num_entities).tolist(),  # field random, only supports list
    rng.random((num_entities, dim)),    # field embeddings, supports numpy.ndarray and list
]

insert_result = hello_milvus.insert(entities)

# Flush before reading the count: rows that have only been inserted may not be
# included in `num_entities` yet, so the number printed below could under-report.
hello_milvus.flush()

print(f"Number of entities in Milvus: {hello_milvus.num_entities}")  # check the num_entities

In [None]:
# Index definition for the vector field: IVF_FLAT, L2 metric, nlist=128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

# Build the index on the "embeddings" field.
hello_milvus.create_index("embeddings", index)

In [None]:
# Load the collection before the search in the next cell.
hello_milvus.load()


In [None]:
# Query with the last two rows of the embeddings that were inserted above.
vectors_to_search = entities[-1][-2:]
search_params = dict(metric_type="L2", params=dict(nprobe=10))

# Time only the search call itself.
start_time = time.time()
result = hello_milvus.search(
    vectors_to_search,
    "embeddings",
    search_params,
    limit=3,
    output_fields=["random"],
)
end_time = time.time()

# Outer loop walks the per-query results, inner loop the hits of one query.
for hits in result:
    for hit in hits:
        print(f"hit: {hit}, random field: {hit.entity.get('random')}")

# Wall-clock duration of the search, in seconds.
elapsed_seconds = end_time - start_time
print(elapsed_seconds)

# Weaviate

In [None]:
#!pip install weaviate-client==3.8.0

In [None]:
import weaviate


In [None]:
def generate_uuid(class_name: str, identifier: str,
                  test: str = 'teststrong') -> str:
    """ Generate a deterministic (version 5) uuid based on an identifier

    The hashed name is ``class_name + identifier`` in the DNS namespace, so the
    same pair of arguments always produces the same uuid.

    :param class_name: classname of the object to create a uuid for
    :type class_name: str, required
    :param identifier: characters used to generate the uuid
    :type identifier: str, required
    :param test: unused; accepted only so existing calls that pass it keep working
    :type test: str, optional
    :return: the uuid in its canonical string form
    :rtype: str
    """
    # `test` never influenced the result; the dead reassignment of it was dropped.
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, class_name + identifier))

def log(i: str) -> None:
    """ A simple logger: prints the message prefixed with the current UTC time

    :param i: the log message; it is passed through ``str()`` before printing
    :type i: str
    :return: nothing, the line is written to standard output
    :rtype: None
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC time
    # and drop the tzinfo so the printed timestamp keeps its previous format.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    print(now, "| " + str(i))

In [None]:
# Client for the Weaviate instance at localhost:8081.
weaviate_url = "http://localhost:8081"
client = weaviate.Client(weaviate_url)
print("Client created")

In [None]:
from sentence_transformers import SentenceTransformer

# Build the SBERT model by name. Load it this way initially; afterwards a pickled
# copy can be used instead to save time.
sbert_model = SentenceTransformer("bert-base-nli-mean-tokens")

In [None]:
# Disabled alternative: reload a previously pickled model instead of building it again.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads — only
# enable this for a file you created yourself.
# from sentence_transformers import SentenceTransformer
# # sbert_model = SentenceTransformer('bert-base-nli-mean-tokens'), Initially load using this, then start using pickle to save time.
# with open("sbert",'rb') as f:
#     sbert_model = pickle.load(f)

print("sbert loaded")

# Texts to embed and import into Weaviate, one list entry per stored object.
# Sentences of a large text could also be added individually to get more precise results when querying.
documents = [
    '''Taj mahal is an immense mausoleum of white marble, built in Agra between 1631 and 1648 by order of the Mughal emperor Shah Jahan in memory of his favourite wife, the Taj Mahal is the jewel of Muslim art in India and one of the universally admired masterpieces of the world's heritage.''',
    '''The Statue of Liberty is a 305-foot (93-metre) statue located on Liberty Island in Upper New York Bay, off the coast of New York City. The statue is a personification of liberty in the form of a woman. She holds a torch in her raised right hand and clutches a tablet in her left.''',
    '''The Statue of Liberty was sculpted between 1875 and 1884 under the direction of French sculptor Frédéric-Auguste Bartholdi, who began drafting designs in 1870. Bartholdi and his team hammered roughly 31 tons of copper sheets onto a steel frame. Before being mounted on its current pedestal, the statue stood over 151 feet (46 metres) tall and weighed 225 tons.''',
    '''Badminton is a racquet sport played using racquets to hit a shuttlecock across a net. Although it may be played with larger teams, the most common forms of the game are "singles" (with one player per side) and "doubles" (with two players per side). Badminton is often played as a casual outdoor activity in a yard or on a beach; formal games are played on a rectangular indoor court. Points are scored by striking the shuttlecock with the racquet and landing it within the opposing side's half of the court.''',
    '''James Bond is a fictional character created by novelist Ian Fleming in 1953.''',
    '''A British secret agent working for MI6 under the codename 007, he has been portrayed on film by actors Sean Connery, David Niven, George Lazenby, Roger Moore, Timothy Dalton, Pierce Brosnan and Daniel Craig in twenty-seven productions.'''
]

# Maps each document text to its feature vector (the vector generated by SBERT);
# it starts empty and is filled once the vectors have been computed.
doc_and_vec = {}

def giveVector(texts):
    """Encode ``texts`` with the module-level SBERT model and return what it produces."""
    embeddings = sbert_model.encode(texts)
    return embeddings

# Embed every document in a single call.
vectors = giveVector(documents)

# Pair each text with its vector. update() is used rather than a
# `for doc, vec in ...` loop so that no loop variable named `doc` leaks out of
# this cell and overwrites the Elasticsearch `doc` dict defined earlier in the notebook.
doc_and_vec.update(zip(documents, vectors))

print("vectors formed")

# NOTE: delete_all() is called on the whole schema, not only on the "Post" class.
client.schema.delete_all()

# The single property every "Post" object carries.
content_property = {
    "name": "content",
    "dataType": ["text"],
}

class_obj = {
    "class": "Post",
    # the vectors are supplied by us through the SBERT model, so no vectorizer is configured
    "vectorizer": "none",
    "properties": [content_property],
}

client.schema.create_class(class_obj)
print("Schema class created")

# Import every document together with its precomputed vector.
# The loop variables are deliberately not called `doc`/`vec`, so this loop does
# not overwrite the Elasticsearch `doc` dict defined earlier in the notebook.
for post_text, post_vector in doc_and_vec.items():
    data_obj = {"content": post_text}
    client.data_object.create(
        data_obj,
        "Post",
        # uuid derived from the class name and the text itself
        generate_uuid('Post', post_text),
        vector=post_vector,
    )
print("Finished importing data")

def process_query(vec):
    """Run a near-vector search over the "Post" class and print the best matches.

    Prints the raw response first, then the ``content`` of up to the first two
    posts in the response, each followed by a separator line. When the response
    holds no posts (for example an error response), a short notice is printed
    instead of raising.

    :param vec: query vector, sent as the ``vector`` of the near-vector filter
    :return: None
    """
    separator = "------------------------------------------------------------------------------------------------"
    nearVector = {"vector": vec}
    res = client.query.get("Post", ["content", "_additional {certainty}"]).with_near_vector(nearVector).do()
    print(res)
    print(separator)
    print("-----------------------------------Most similar text -------------------------------------------")

    # Walk the response defensively: 'data', 'Get' or 'Post' may be missing or
    # None, and fewer than two posts may come back.
    posts = ((res.get('data') or {}).get('Get') or {}).get('Post') or []
    if not posts:
        print("No matching posts returned")
        print(separator)
        return

    # Same output as before when at least two posts are returned.
    for post in posts[:2]:
        print(post['content'])
        print(separator)

    


In [None]:
# Embed a free-text query with the same SBERT model and print its closest posts.
query = "american tourist destination"
query_vec = sbert_model.encode(query)
process_query(query_vec)
