In [8]:
import os
import time

import numpy as np
from pymongo import MongoClient
from tqdm import tqdm

In [9]:
# Open a client for the local MongoDB instance and select the database and
# collection used by the rest of the notebook.
client = MongoClient("mongodb://localhost:27017")
db = client["benchmark"]
collection = db["glove_vectors"]

# MongoClient connects lazily: constructing it succeeds even when no server is
# listening. Issue a cheap round-trip so the success message below is printed
# only after the server has actually answered; a pymongo connection error is
# raised here otherwise.
client.admin.command("ping")

print("✅ Connected to MongoDB.")

✅ Connected to MongoDB.


In [10]:
def load_queries(path, num=100):
    """Read up to ``num`` (word, vector) pairs from a whitespace-separated file.

    Every non-blank line is split on whitespace: the first field is taken as
    the word and the remaining fields are parsed as floats.

    Args:
        path: Path of the text file to read.
        num: Maximum number of entries to return.

    Returns:
        A list of ``(word, vector)`` tuples in file order, where ``vector`` is
        a 1-D numpy array built from the parsed floats.

    Raises:
        ValueError: If a field after the word cannot be parsed as a float.
    """
    queries = []
    # Decode as UTF-8 explicitly so non-ASCII tokens are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(queries) >= num:
                break
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing empty line) carry no entry.
                continue
            word, vec = parts[0], list(map(float, parts[1:]))
            queries.append((word, np.array(vec)))
    return queries

# Location of the embeddings file. Set the GLOVE_PATH environment variable to
# point at a local copy; the fallback is the original machine-specific path,
# kept so existing runs behave exactly as before.
glove_path = os.environ.get(
    "GLOVE_PATH",
    "/Users/palakarora/Downloads/Palak_Thesis/datasets/glove/glove.6B.100d.txt",
)
queries = load_queries(glove_path, num=100)
print(f"✅ Loaded {len(queries)} queries.")

✅ Loaded 100 queries.


In [11]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector; must be compatible with ``vec1`` for ``np.dot``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when the
        product of the norms is zero, where the cosine is undefined.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # A zero vector would give 0/0 -> nan plus a RuntimeWarning; report
        # "no similarity" instead so downstream argmax/comparisons stay sane.
        return 0.0
    return np.dot(vec1, vec2) / denom


In [None]:
import numpy as np
import pymongo
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import time

# Connect to the local MongoDB instance and select the collection to benchmark.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["benchmark"]
collection = db["glove_vectors"]

# Fetch every document, projecting only the two fields read below.
cursor = collection.find({}, {"_id": 0, "word": 1, "vector": 1})
docs = list(cursor)
if not docs:
    # Fail here with a clear message instead of later, inside the similarity
    # computation, on an empty array.
    raise RuntimeError(
        "No documents found in benchmark.glove_vectors; "
        "load the vectors into MongoDB before running the benchmark."
    )

# Parallel structures: words[i] labels entry i of `vectors`.
# NOTE(review): presumably every "vector" has the same length, giving a 2-D
# (n_docs, dim) array — confirm against the ingestion code.
words = [doc["word"] for doc in docs]
vectors = np.array([doc["vector"] for doc in docs])

def load_queries(path, num=100):
    """Load at most ``num`` (word, vector) entries from a text file.

    Each non-blank line is split on whitespace; the first token is the word
    and every following token is converted to ``float``.

    NOTE(review): this redefines the ``load_queries`` declared in an earlier
    cell of this notebook; the later definition wins once this cell runs.

    Args:
        path: Path of the text file to read.
        num: Upper bound on the number of entries returned.

    Returns:
        A list of ``(word, vector)`` tuples in file order, with ``vector`` a
        1-D numpy array of the parsed floats.

    Raises:
        ValueError: If a token after the word is not a valid float.
    """
    queries = []
    # Explicit UTF-8 keeps decoding independent of the platform's default
    # encoding, so non-ASCII words load identically everywhere.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if len(queries) >= num:
                break
            parts = line.split()
            if not parts:
                # Skip blank lines rather than failing on parts[0].
                continue
            word = parts[0]
            vector = np.array([float(x) for x in parts[1:]])
            queries.append((word, vector))
    return queries

queries = load_queries("datasets/glove/glove.6B.100d.txt")

timings = []
top1_hits = 0

# Brute-force nearest neighbour: score each query against every stored vector
# and keep the best match. Only this in-memory search is timed; the documents
# were fetched from MongoDB once, before the loop.
# NOTE(review): if the collection was populated from this same file, each
# query's own vector is among the candidates, so Recall@1 mostly measures
# self-retrieval — confirm that is the intended metric.
for word, query_vec in tqdm(queries, desc="🔍 Running queries"):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump and is coarse on some platforms.
    start = time.perf_counter()
    # `cosine_similarity` is the sklearn pairwise function imported in this
    # cell, which takes 2-D inputs — hence the [query_vec] wrapper and [0].
    sims = cosine_similarity([query_vec], vectors)[0]
    top_idx = np.argmax(sims)
    top_word = words[top_idx]
    timings.append(time.perf_counter() - start)
    if top_word == word:
        top1_hits += 1

# Guard the aggregates so an empty query set reports zeros instead of raising
# ZeroDivisionError or producing nan.
total_time = sum(timings)
average_time = float(np.mean(timings)) if timings else 0.0
if total_time > 0:
    throughput = len(timings) / total_time
else:
    throughput = float("inf") if timings else 0.0
recall_at_1 = top1_hits / len(queries) if queries else 0.0

print("\n📊 MongoDB Brute-Force Results:")
print(f"Average Query Time: {average_time:.4f} seconds")
print(f"Throughput: {throughput:.2f} queries/sec")
print(f"Recall@1: {recall_at_1:.2%}")
