# Data from Mongo to Vector DB

In [11]:
from pymongo import MongoClient
import pandas as pd
import json
import os

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from qdrant_client.http import models
from qdrant_client.models import Distance,VectorParams

In [13]:
mongo_url = os.environ.get("MONGO_URL")
qdrant_url = os.environ.get("QDRANT_CLUSTER_URL")
qdrant_key = os.environ.get("QDRANT_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

m_client = MongoClient(mongo_url)

qdrant_client = QdrantClient(
    url=qdrant_url, 
    api_key=qdrant_key,
)

Encoder for the vector_db

In [14]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')

Obtendo dados do Mongo

In [15]:
db = m_client['api']
collection_m = db['google_v0']

In [16]:
mon_summ = {}
mon_rev = {}

for doc in collection_m.find():
    #vector dictionary for summary
    summary = doc.get('summary','')
    
    #encoder
    try:
        vector_s = encoder.encode(summary).tolist()
        mon_summ[doc.get('name','')] = vector_s
    except:
        vector_s = [0] * 384
        mon_summ[doc.get('name','')] = vector_s

    #dictionary for reviews
    reviews_text = (
    " ".join(review.get('text', '') for review in doc.get('reviews', []) if isinstance(doc.get('reviews', []), list))
    )
    try:
        vector_r = encoder.encode(reviews_text).tolist()
        mon_rev[doc.get('name','')] = vector_r
    except:
        vector_r = [0] * 384
        mon_rev[doc.get('name','')] = vector_r


summ_df = pd.DataFrame(list(mon_summ.items()), columns=['name', 'vector_summary'])
rev_df = pd.DataFrame(list(mon_rev.items()), columns=['name', 'vector_reviews'])

vect_df = summ_df.merge(rev_df, on='name', how='left')
vect_df.head()

Unnamed: 0,name,vector_summary,vector_reviews
0,Let's Beer,"[0.07094738632440567, -0.02893933653831482, -0...","[0.015329340472817421, 0.06257352977991104, -0..."
1,Barbirô,"[0.020421776920557022, -0.03191355988383293, -...","[0.0406663678586483, 0.055937591940164566, -0...."
2,"Esquina do Meninão - Cerveja, Drinks e Petiscos","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.019606823101639748, -0.0064268470741808414,..."
3,Paróquia Bar o Santto Chopp,"[0.05131859332323074, -0.02564861997961998, -0...","[0.02331162430346012, 0.01523871161043644, 0.0..."
4,Bar da Vila,"[0.0338745042681694, 0.02122640796005726, -0.0...","[0.09387751668691635, 0.0036726687103509903, -..."


Inserindo dados do mongo no Qdrant
- Aqui serão criadas duas colections distintas para cada tipo de requisição (reviews e summary)

In [17]:
qdrant_client.create_collection(
    collection_name = 'summary_db'
    ,vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE
    )
)


True

In [18]:
# collections = ['summary_db','reviews_db']

# for c in collections:
#     collect = c
#     qdrant_client.create_collection(
#         collection_name = collect
#         ,vectors_config=models.VectorParams(
#             size=encoder.get_sentence_embedding_dimension(),
#             distance=models.Distance.COSINE
#         )
#     )

In [19]:
# lista de pontos a serem inseridos no vector db

points_summ = [
    models.PointStruct(
        id=idx,
        vector=row['vector_summary'],
        payload={'name': row['name']}
    )
    for idx, row in vect_df.iterrows()
]

# # Create points for the reviews collection
# points_review = [
#     models.PointStruct(
#         id=idx,
#         vector=row['vector_reviews'],
#         payload={'name': row['name']}
#     )
#     for idx, row in vect_df.iterrows()
# ]

In [20]:
# inserção de pontos no vector db

qdrant_client.upload_points(
    collection_name="summary_db",
    points=points_summ
)

# q_client.upload_points(
#     collection_name="reviews_db",
#     points=points_review
# )

In [21]:
user_query = "Busco por um bar de música ao vivo"
query_vector = encoder.encode(user_query).tolist()

hits = qdrant_client.search(
    collection_name="summary_db",
    query_vector=query_vector,
    limit=10,
)

print(hits)

[ScoredPoint(id=159, version=2, score=0.6450157, payload={'name': 'Bar do Peixe'}, vector=None, shard_key=None), ScoredPoint(id=60, version=0, score=0.6412624, payload={'name': 'Bar Providência'}, vector=None, shard_key=None), ScoredPoint(id=201, version=3, score=0.6412548, payload={'name': 'Praça de Minas Bar e Restaurante II'}, vector=None, shard_key=None), ScoredPoint(id=3, version=0, score=0.630394, payload={'name': 'Paróquia Bar o Santto Chopp'}, vector=None, shard_key=None), ScoredPoint(id=226, version=3, score=0.6283034, payload={'name': 'Botequim Saúde'}, vector=None, shard_key=None), ScoredPoint(id=111, version=1, score=0.5905249, payload={'name': 'Quintal da Casemiro'}, vector=None, shard_key=None), ScoredPoint(id=53, version=0, score=0.5901459, payload={'name': 'Mascarino Pizza-Bar'}, vector=None, shard_key=None), ScoredPoint(id=43, version=0, score=0.5739639, payload={'name': 'Karaoke Bar 5th Avenue'}, vector=None, shard_key=None), ScoredPoint(id=4, version=0, score=0.54753