# Perform RAG
Perform the retrieval step of RAG by finding the k stored vectors most similar to the embedding of an input query.

In [5]:
import pandas as pd
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch
import pymongo
from pymongo import MongoClient
import numpy as np

## Embed input query

In [2]:
# Load the pretrained 'bert-base-uncased' tokenizer and model; both are passed
# to embed_query below to turn a text query into an embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def embed_query(query, tokenizer, model):
    """
    Turn an input query into a single embedding vector using BERT

    Input: query (str), tokenizer (BertTokenizer), model (BertModel)
    Output: mean_embedding (list) - mean of the model's last hidden states
            taken over dim=1, for the first entry of the batch
    """
    encoded = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    # inference only, so no gradients need to be tracked for the forward pass
    with torch.no_grad():
        model_output = model(**encoded)

    # pool the last hidden states by averaging over dim=1
    pooled = model_output.last_hidden_state.mean(dim=1)

    # convert to nested Python lists and keep the first (only) batch entry
    return pooled.tolist()[0]

In [27]:
# Embed an example query with the tokenizer/model loaded above and print the
# resulting vector (a plain Python list of floats).
query = "Daniel, Santiago, and Kika wearing Michigan gear"
embedded_query = embed_query(query, tokenizer, model)
print(embedded_query)

[-0.06434137374162674, -0.17787635326385498, -0.08905681222677231, 0.2846422493457794, 0.13387943804264069, -0.06510957330465317, 0.2032398134469986, 0.27382925152778625, -0.0873512402176857, -0.06959721446037292, 0.050441328436136246, -0.4247698485851288, -0.1580558568239212, 0.30573713779449463, 0.06551408022642136, 0.5254742503166199, -0.5825499296188354, 0.4060409963130951, 0.059788744896650314, 0.4864809215068817, -0.08516538143157959, 0.033373940736055374, -0.1900520920753479, 0.5553510189056396, 0.40654048323631287, -0.15224511921405792, 0.055364787578582764, 0.15506185591220856, -0.13610690832138062, 0.04696604236960411, 0.2418111115694046, -0.23305706679821014, -0.09244593232870102, -0.2982737421989441, 0.004730590153485537, -0.46035826206207275, 0.06008392572402954, -0.1153469905257225, 0.07094923406839371, 0.22253461182117462, -0.20752912759780884, -0.3711542785167694, 0.3359468877315521, 0.2804170250892639, -0.002964053303003311, -0.40367916226387024, 0.32839158177375793, 0

## Find K most similar vectors

In [28]:
def cosine_similarity(embedding1, embedding2):
    """
    Compute the cosine similarity between two embeddings

    The dot product is accumulated pairwise with zip, so if the inputs differ
    in length only the overlapping prefix contributes to it, while each
    magnitude still uses its full vector. Callers should pass embeddings of
    equal length.

    Input: embedding1 (sequence of numbers), embedding2 (sequence of numbers)
    Output: similarity (float); 0.0 when either embedding has zero magnitude
            (all zeros or empty), because the cosine is undefined there
    """
    # generator expressions avoid building throwaway lists just to sum them
    dot_product = sum(a * b for a, b in zip(embedding1, embedding2))
    magnitude1 = sum(a ** 2 for a in embedding1) ** 0.5
    magnitude2 = sum(a ** 2 for a in embedding2) ** 0.5

    # Guard the division: a zero-magnitude vector would otherwise raise
    # ZeroDivisionError for plain floats (or yield nan for numpy scalars).
    denominator = magnitude1 * magnitude2
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [29]:
def find_similar_documents(query_embedding, top_k=5, *, host='localhost', port=27017,
                           db_name='photo-rag-db', collection_name='photo-embeddings'):
    """
    Find top_k similar documents in mongoDB database

    Every document in the collection is scored against query_embedding with
    cosine_similarity, and the best-scoring ones are returned.

    Input: query_embedding (list) - embedding of the query
           top_k (int) - number of results to return; values <= 0 give []
           host (str), port (int) - MongoDB server to connect to
           db_name (str), collection_name (str) - where the embeddings are
               stored; each document must have 'filename' and 'embedding'
               fields (a missing field raises KeyError)
    Output: list of (filename, similarity) tuples, highest similarity first
    """
    client = MongoClient(host, port)
    try:
        collection = client[db_name][collection_name]

        # only fetch the two fields that are actually used below
        projection = {'filename': 1, 'embedding': 1, '_id': 0}

        similarities = []
        for document in collection.find({}, projection):
            db_vectors = np.array(document['embedding'])
            similarity = cosine_similarity(query_embedding, db_vectors)
            similarities.append((document['filename'], similarity))
    finally:
        # always release the connection, even if scoring a document fails
        client.close()

    # sort the results by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    # return the top K similar documents; clamp so that a negative top_k
    # yields an empty list instead of slicing from the end
    return similarities[:max(top_k, 0)]

In [30]:
# Score the stored embeddings against the embedded query; top_k is left at its default of 5.
similar_documents = find_similar_documents(embedded_query)

In [31]:
# Each entry is a (filename, similarity) tuple, ordered from most to least similar.
print(similar_documents)

[('3884d63c-2c39-46f3-9bad-8cfaa1b1f9a3.jpeg', 0.6949096975861797), ('IMG_6930.jpeg', 0.6946970950327269), ('IMG_6929.jpeg', 0.6939686960341374), ('IMG_6931.jpeg', 0.6925287959646121), ('63d54fec-6212-4093-b7e0-9c519739294a.jpeg', 0.6799742458834294)]
