In [1]:
import pandas as pd
import polars as pl
import googlemaps
import pprint
from pymongo import MongoClient
import json
import os

In [2]:
# Change the working directory to the project root so the credentials can be
# read from the `configs` directory.

# Resolve the project root (two levels above the starting working directory)
# only once per kernel session. os.chdir() below changes what os.getcwd()
# returns, so recomputing these paths when the cell is re-run would climb two
# more levels each time and the config file would no longer be found.
if 'granny_dir' not in globals():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    granny_dir = os.path.dirname(parent_dir)

os.chdir(granny_dir)

configs_dir = os.path.join(granny_dir, 'configs')
mongo_config_file = os.path.join(configs_dir, 'mongo.json')

# The JSON config must provide a 'url' key holding the MongoDB connection string.
# An explicit encoding avoids depending on the platform's default text encoding.
with open(mongo_config_file, 'r', encoding='utf-8') as f:
    mongo_config = json.load(f)

client = MongoClient(mongo_config['url'])

# Database and collection used by the rest of the notebook.
db = client['api']
collection = db['google_v0']

In [3]:
# Show the client's repr as the cell output.
# NOTE(review): the rendered repr includes the server host and port; clear this
# output before sharing or committing the notebook.
client

MongoClient(host=['147.79.83.71:27017'], document_class=dict, tz_aware=False, connect=True)

In [5]:
# Load the sentence-embedding model; `model.encode` is used further down to turn
# text into the vectors stored in, and compared against, the Mongo documents.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [34]:

def _build_embedding_text(doc):
    """Build the text that gets embedded for one document.

    The result is "<review texts> <summary> <types>", joined by single spaces,
    the same layout for well-formed documents as before this helper existed.

    Parameters:
        doc (dict): A document with optional 'reviews', 'summary' and 'types' keys.

    Returns:
        str: The concatenated text. Missing, None or wrongly-typed fields
        contribute an empty string instead of raising or being rendered as "None".
    """
    reviews = doc.get('reviews')
    if not isinstance(reviews, list):
        # The key may be absent or hold None / a non-list value.
        reviews = []
    reviews_text = " ".join(
        str(review.get('text') or '')
        for review in reviews
        if isinstance(review, dict)
    )

    # `or ''` keeps an explicit None summary from being embedded as the word "None".
    summary_text = doc.get('summary') or ''

    types = doc.get('types')
    types_text = " ".join(str(item) for item in types) if isinstance(types, list) else ""

    return f"{reviews_text} {summary_text} {types_text}"


# Embed every document and store the embedding in its 'vector' field.
# Only the fields read below are fetched ('_id' is returned by default).
for doc in collection.find({}, {'name': 1, 'reviews': 1, 'summary': 1, 'types': 1}):
    text = _build_embedding_text(doc)

    # .tolist() turns the encoder output into a plain list that can be stored in MongoDB.
    vector = model.encode(text).tolist()

    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {'vector': vector}}
    )

    # .get() so a document without a name cannot abort the run after its update was written.
    print(f"Vetor adicionado no bar {doc.get('name')} com ID {doc['_id']}")

print("Todos os vetores foram adicionados.")

Vetor adicionado no bar Let's Beer com ID 672857778eaa66902e38b8cc
Vetor adicionado no bar Barbirô com ID 672857778eaa66902e38b8cd
Vetor adicionado no bar Esquina do Meninão - Cerveja, Drinks e Petiscos com ID 672857778eaa66902e38b8ce
Vetor adicionado no bar Paróquia Bar o Santto Chopp com ID 672857778eaa66902e38b8cf
Vetor adicionado no bar Bar da Vila com ID 672857778eaa66902e38b8d0
Vetor adicionado no bar Pirajá Vila Mariana com ID 672857778eaa66902e38b8d1
Vetor adicionado no bar Bar Genuíno com ID 672857778eaa66902e38b8d2
Vetor adicionado no bar Barxaréu com ID 672857778eaa66902e38b8d3
Vetor adicionado no bar Fortunato Bar com ID 672857778eaa66902e38b8d4
Vetor adicionado no bar Joca Vila Mariana com ID 672857778eaa66902e38b8d5
Vetor adicionado no bar VKS Beer House com ID 672857778eaa66902e38b8d6
Vetor adicionado no bar Taquarica Bar com ID 672857778eaa66902e38b8d7
Vetor adicionado no bar Bar Veloso com ID 672857778eaa66902e38b8d8
Vetor adicionado no bar Matriz Bar & Choperia com ID

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def vector_search(query_text, top_n=10):
    """
    Perform vector search on MongoDB documents based on a query text.

    The query is encoded with the notebook-level ``model`` and compared, by
    cosine similarity, against the ``vector`` field of every document in the
    notebook-level ``collection`` that has one.

    Parameters:
        query_text (str): The text query for which to find similar documents.
        top_n (int): The number of top results to return. Values <= 0 yield
            an empty list.

    Returns:
        list: Up to ``top_n`` document dictionaries, ordered from most to least
        similar. Each one carries all stored fields except 'vector', plus a
        'score' key holding the cosine similarity rounded to 2 decimals.
    """
    if top_n <= 0:
        return []

    # Transform the query text into a (1, dim) row vector.
    query_vector = model.encode(query_text).reshape(1, -1)

    # Fetch only documents that have a vector. Every other field is returned
    # as stored, so the result does not depend on which keys one arbitrary
    # document happens to have.
    candidates = []
    vectors = []
    for doc in collection.find({'vector': {'$exists': True}}):
        stored_vector = doc.pop('vector', None)
        if stored_vector is None or len(stored_vector) == 0:
            continue  # nothing to compare against
        candidates.append(doc)
        vectors.append(stored_vector)

    if not candidates:
        return []

    # One similarity computation for all documents instead of one call per document.
    scores = cosine_similarity(query_vector, np.asarray(vectors, dtype=float))[0]

    # Rank on the unrounded scores; rounding first would create artificial ties
    # and could put the wrong documents in the top N. The stable sort keeps
    # cursor order for genuinely equal scores.
    ranking = np.argsort(-scores, kind='stable')[:top_n]

    results = []
    for index in ranking:
        match = candidates[index]
        match['score'] = float(round(scores[index], 2))  # rounded for display only
        results.append(match)
    return results


In [41]:
# Example usage: ask for the five best matches for a query
# ("Bars to take my kids to") and pretty-print each returned document.
results = vector_search(
    query_text="Bares para levar meus filhos",
    top_n=5,
)
for result in results:
    pprint.pprint(result)

{'_id': ObjectId('672857778eaa66902e38b92b'),
 'business_status': 'OPERATIONAL',
 'curbside_pickup': False,
 'formatted_address': 'R. Abílio Soares, 998 - Paraíso, São Paulo - SP, '
                      '04005-003, Brazil',
 'geometry': "{'location': {'lat': -23.5770771, 'lng': -46.6506476}, "
             "'viewport': {'northeast': {'lat': -23.57576452010728, 'lng': "
             "-46.64926607010728}, 'southwest': {'lat': -23.57846417989272, "
             "'lng': -46.65196572989272}}}",
 'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/restaurant-71.png',
 'icon_background_color': '#FF9E67',
 'icon_mask_base_uri': 'https://maps.gstatic.com/mapfiles/place_api/icons/v2/restaurant_pinlet',
 'name': 'Bar & Lanchonete Bandeira Um',
 'opening_hours': {'open_now': False,
                   'periods': [{'close': {'day': 0, 'time': '21:00'},
                                'open': {'day': 0, 'time': '08:00'}},
                               {'close': {'day': 1, 'time': '