In [1]:
import textwrap
import numpy as np
import pandas as pd
import json
import google.generativeai as genai
from local_settings import API_KEY,uri,token


In [2]:
from local_settings import API_KEY

genai.configure(api_key=API_KEY)

In [3]:
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

models/embedding-001
models/text-embedding-004


In [4]:
documents = []
with open('insuranceQA-v2/train_part4.jsonl', 'r') as file:
    for line in file:
        documents.append(json.loads(line))

# Extract input and output from the JSON data
documents = [(entry['input'], entry['output']) for entry in documents]
print(len(documents))
documents[0][1]

5332


'The term basic is relative to the person asking the question . Your rate for basic insurance which I would consider to be liability only coverage and at your state minimum levels at this time can be as low as $ 30/mo . Your rates depend on your driving record , geographical area , and your credit history , among other factors . Please contact a local agent to discuss your needs . '

In [5]:
title = "The next generation of AI for developers and Google Workspace"
sample_text = ("Title: The next generation of AI for developers and Google Workspace"
    "\n"
    "Full article:\n"
    "\n"
    "Gemini API & Google AI Studio: An approachable way to explore and prototype with generative AI applications")

model = 'models/text-embedding-004'
embedding = genai.embed_content(model=model,
                                content=sample_text,
                                task_type="retrieval_document",
                                title=title)

embedding_dim=(len(embedding['embedding']))
print(embedding_dim)
print(embedding)

768
{'embedding': [-0.0021609126, -0.003164448, -0.060120765, -0.0071218405, 0.00087754615, 0.04058192, 0.04457149, 0.035524692, -0.047465388, 0.008888606, -0.027958257, 0.011335692, -0.0024438684, 0.0030851841, -0.018796144, -0.055550933, 0.031426456, 0.00065491674, -0.11370059, 0.06370807, -0.021750022, -0.021367034, -0.09982074, -0.008604742, -0.033300586, -0.012815639, 0.07153146, 0.03706478, 0.02297012, 0.043331206, 0.01067061, 0.040685344, 0.03636141, -0.036222056, -0.017799364, -0.014820968, 0.0053205043, -0.017382711, 0.07044941, 0.0020212498, -0.018208733, 0.017558081, 0.006493213, 0.12724239, -0.023805205, 0.010057812, -0.0006948954, 0.07085626, -0.056457285, 0.01831114, 0.09046226, 0.021575559, -0.06656088, 0.026865069, -0.0034812505, -0.0011228691, -0.06535635, -0.0018169151, 0.08672994, 0.02874761, -0.024817277, 0.004653874, -0.058998518, 0.03206169, -0.022604037, -0.015454266, -0.013758667, 0.021129975, -0.047893398, 0.02573244, 0.013028228, -0.018002002, -0.039879415, 0.

In [6]:
def embed_fn(title, text):
  return genai.embed_content(model=model,
                             content=text,
                             task_type="retrieval_document",
                             title=title)["embedding"]

In [7]:
from pymilvus import MilvusClient,utility

milvus_client = MilvusClient(uri=uri,token=token)

collection_name = "my_rag_collection2"

In [8]:
'''if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)'''

'if milvus_client.has_collection(collection_name):\n    milvus_client.drop_collection(collection_name)\nmilvus_client.create_collection(\n    collection_name=collection_name,\n    dimension=embedding_dim,\n    metric_type="IP",  # Inner product distance\n    consistency_level="Strong",  # Strong consistency level\n)'

In [9]:
from tqdm import tqdm

data = []

for i, line in enumerate(tqdm(documents, desc="Creating embeddings")):
    data.append({"primary_key": i, "vector": embed_fn(line[0],line[1]), "text": line})
    

milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings:   0%|          | 21/5332 [00:09<39:47,  2.22it/s] 


KeyboardInterrupt: 

In [None]:
question = "What Are Tax Sheltered Annuities?"

search_res = milvus_client.search(
    collection_name=collection_name,
    data=[
        embed_fn(question,question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "COSINE",  "params": {"level": 2}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

In [None]:
retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        [
            "Should I Shop For Homeowners Insurance? ",
            "I always suggest contacting 5 insurance companies . Saving all your quotes so that comparison is the same with all of the companies . You will find a wide range of prices . I personally suggest going with a company that does not high ball you on your coverage when not needed . They have tendency to raise your premiums yearly based on a national formula as comparing with your actual home value and location "
        ],
        0.8984155654907227
    ],
    [
        [
            "Where To Purchase Homeowners Insurance? ",
            "You can purchase your homeowners insurance in a few different ways . I would recommend finding a local agent that you are comfortable dealing with as they will your first and last point of contact in many instances with your insurance carrier . Your local agent is there to provide you the information that you need to make an informed decision about your coverage needs .