In [1]:
# Load beir dataset
from beir.datasets.data_loader import GenericDataLoader
data_path = "datasets/hotpotqa"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  from tqdm.autonotebook import tqdm
100%|██████████| 5233329/5233329 [00:38<00:00, 134956.85it/s]


In [2]:
from azure.cosmos import CosmosClient, PartitionKey, exceptions
import logging
import os

from dotenv import load_dotenv

# Load the environment variables from .env
load_dotenv()

# Initialize the Cosmos client
connection_string = os.getenv('COSMOSDB_CONN_STR') 
client = CosmosClient.from_connection_string(connection_string)

# Define the database and container
database_name = os.getenv('COSMOSDB_DB_NAME') 
container_name = os.getenv('COSMOSDB_CONTAINER_NAME') 
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

In [3]:
import os
from openai import AzureOpenAI

# 1. Set up Azure OpenAI and Cosmos DB credentials
embeddings_endpoint = os.environ.get('AOAI_ENDPOINT')
embeddings_api_version = "2024-12-01-preview"
embeddings_api_key = os.environ.get('AOAI_API_KEY')
model_name = "text-embedding-3-small"

openai_client = AzureOpenAI(
    api_version=embeddings_api_version,
    azure_endpoint=embeddings_endpoint,
    api_key=embeddings_api_key
)

# Function to get embeddings from Azure OpenAI
def get_embeddings(text):
    response = openai_client.embeddings.create(
        model=model_name,
        input=text,
        dimensions=1536
    )
    return response.data[0].embedding
    
print(get_embeddings("Were Scott Derrickson and Ed Wood of the same nationality?"))

[-0.046079982072114944, -0.0005055809742771089, -0.016030503436923027, 0.006410967092961073, -0.01726456731557846, -0.009662727825343609, 0.0605185441672802, 0.042994819581508636, -0.04832598194479942, -0.04432760924100876, 0.013698120601475239, -0.0347265861928463, 0.0003783180145546794, 0.0038163457065820694, -0.00441795215010643, 0.04630211368203163, -0.011847022920846939, 0.008200361393392086, -0.02034972980618477, -0.019658653065562248, 0.024780023843050003, 0.013488329015672207, 0.0051306248642504215, 0.027741778641939163, -0.01284661516547203, 0.03169078752398491, -0.018856512382626534, 0.018177775666117668, 0.019399499520659447, 0.035022761672735214, 0.024397462606430054, -0.029815007001161575, -0.019276093691587448, 0.026927296072244644, -0.023755749687552452, 0.010464870370924473, -0.040205832570791245, -0.014253449626266956, 0.015203679911792278, -0.03364060819149017, -0.0485234335064888, -0.05148518830537796, -0.007638861890882254, 0.026655802503228188, -0.01058827713131904

In [24]:
#Query ID: 5a8b57f25542995d1e6f1371, Question: Were Scott Derrickson and Ed Wood of the same nationality?
#Query ID: 5a8c7595554299585d9e36b6, Question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
#Query ID: 5a85ea095542994775f606a8, Question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
#Query ID: 5adbf0a255429947ff17385a, Question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
#Query ID: 5a8e3ea95542995a26add48d, Question: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
# Suppress logging for Azure Cosmos SDK
logging.getLogger("azure.cosmos").setLevel(logging.CRITICAL)
logging.getLogger("openai").setLevel(logging.CRITICAL)
logging.getLogger("httpcore").setLevel(logging.CRITICAL)
# Query to get the count of items
#embedded_query = get_embeddings('Were Scott Derrickson and Ed Wood of the same nationality?')
embedded_query = get_embeddings('The director of the romantic comedy "Big Stone Gap" is based in what New York city?')
query = f"""SELECT TOP 1000 c.title, 
            c.text, 
            VectorDistance(c.vectorized_text, {embedded_query}) AS SimilarityScore   
            FROM c  
            ORDER BY VectorDistance(c.vectorized_text,{embedded_query})"""

# Execute the query
result = list(container.query_items(query=query, enable_cross_partition_query=True))

# The result will be a list with a single value (the count)
for r in result:
    print(r)

{'title': 'Big Stone Gap (film)', 'text': "Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.", 'SimilarityScore': 0.8332125319673994}
{'title': 'Little Manhattan', 'text': 'Little Manhattan is a 2005 American romantic comedy film directed and written by husband and wife Mark Levin and Jennifer Flackett. Though Levin is credited as the director and Flackett as the writer, in the film\'s DVD commentary the two reveal that they collaborated on both tasks. "Little Manhattan" depicts the story of ten-year-old Gabe\'s realization that girls can be pretty and nice to be with. The story takes place, and was filmed on location, in Manhatta

In [13]:
#Query ID: 5a8b57f25542995d1e6f1371, Question: Were Scott Derrickson and Ed Wood of the same nationality?
#Query ID: 5a8c7595554299585d9e36b6, Question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
#Query ID: 5a85ea095542994775f606a8, Question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
#Query ID: 5adbf0a255429947ff17385a, Question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
#Query ID: 5a8e3ea95542995a26add48d, Question: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
q_id = "5a8e3ea95542995a26add48d"
print(f"The question: {queries[q_id]}")
corpus_id_list = list(qrels[q_id].keys())
print(f"The corpus IDs: {corpus_id_list}")

for corpus_id in corpus_id_list:
    print(f"Most relevant corpus: title: {corpus[corpus_id]['title']}, text: {corpus[corpus_id]['text']}")

The question: The director of the romantic comedy "Big Stone Gap" is based in what New York city?
The corpus IDs: ['41146297', '5382358']
Most relevant corpus: title: Big Stone Gap (film), text: Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.
Most relevant corpus: title: Adriana Trigiani, text: Adriana Trigiani is an Italian American best-selling author of sixteen books, television writer, film director, and entrepreneur based in Greenwich Village, New York City. Trigiani has published a novel a year since 2000.


In [14]:
# Keys to select
keys_to_select = ['41146297', '5382358']

# Create a new dictionary with only the selected keys
selected_corpus = {key: corpus[key] for key in keys_to_select}

print(selected_corpus)
    
json_array = []
for key, value in selected_corpus.items():
    text = f"Title: {value['title']}, Text:{value['text']}"
    vector_text = get_embeddings(text)
    if vector_text is not None:
        json_array.append({'id': key, 'text': value['text'], 'title':value['title'] , 'vectorized_text': vector_text})

{'41146297': {'text': "Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.", 'title': 'Big Stone Gap (film)'}, '5382358': {'text': 'Adriana Trigiani is an Italian American best-selling author of sixteen books, television writer, film director, and entrepreneur based in Greenwich Village, New York City. Trigiani has published a novel a year since 2000.', 'title': 'Adriana Trigiani'}}


In [15]:
for e in json_array:
    print(e)
    container.upsert_item(e)

{'id': '41146297', 'text': "Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.", 'title': 'Big Stone Gap (film)', 'vectorized_text': [-0.015521317720413208, 0.019699301570653915, -0.0029927785508334637, -0.013551387935876846, -0.030371559783816338, -0.04788445681333542, 0.03273114562034607, -0.018595274537801743, -0.024764835834503174, -0.08438228070735931, 0.018995754420757294, -0.011061915196478367, -0.09014054387807846, 0.019287997856736183, -0.026410052552819252, 0.04331681504845619, -0.007863485254347324, -0.009687294252216816, 0.020186372101306915, 0.017058296129107475, 0.03766679763793945, 0.04115205630660057, 0.007279000245

In [20]:
logging.getLogger("azure.cosmos").setLevel(logging.CRITICAL)
logging.getLogger("openai").setLevel(logging.CRITICAL)
logging.getLogger("httpcore").setLevel(logging.CRITICAL)
# Query to get the count of items
embedded_query = get_embeddings('The director of the romantic comedy "Big Stone Gap" is based in what New York city?')
query = f"""SELECT c.title, 
            c.text, 
            VectorDistance(c.vectorized_text, {embedded_query}) AS SimilarityScore   
            FROM c  
            WHERE c.id = '5382358'"""

# Execute the query
result = list(container.query_items(query=query, enable_cross_partition_query=True))

# The result will be a list with a single value (the count)
for r in result:
    print(r)

{'title': 'Adriana Trigiani', 'text': 'Adriana Trigiani is an Italian American best-selling author of sixteen books, television writer, film director, and entrepreneur based in Greenwich Village, New York City. Trigiani has published a novel a year since 2000.', 'SimilarityScore': 1.1514988741930259}
