In [152]:
from pymilvus import MilvusClient, DataType
import pandas as pd
import ast

# 1. Create the Milvus client used by every later cell, pointed at the local server URI
client = MilvusClient(uri="http://localhost:19530")


In [153]:
# 2. Quick-setup collection: only a name and the vector dimension are supplied.
# 1536 is the length of each embeddings vector that gets inserted later.
client.create_collection(
    dimension=1536,
    collection_name="open_api_embeddings",
)

In [154]:
# 3. Build a second collection from an explicit schema and index definition.
# NOTE(review): the later cells visible here only use "open_api_embeddings",
# not this "customized_setup" collection.

# 3.1. Schema with auto_id turned off and dynamic fields turned on
schema = MilvusClient.create_schema(enable_dynamic_field=True, auto_id=False)

# 3.2. Fields: an INT64 primary key and a 1536-dimensional float vector
schema.add_field(is_primary=True, field_name="id", datatype=DataType.INT64)
schema.add_field(dim=1536, field_name="my_vector", datatype=DataType.FLOAT_VECTOR)

# 3.3. Empty container that collects the index definitions
index_params = client.prepare_index_params()

# 3.4. One index entry per field: "id" with no explicit index type,
# "my_vector" with AUTOINDEX and the cosine metric
index_params.add_index(field_name="id")
index_params.add_index(metric_type="COSINE", index_type="AUTOINDEX", field_name="my_vector")

# 3.5. Create the collection from the schema and index parameters above
client.create_collection(
    index_params=index_params,
    schema=schema,
    collection_name="customized_setup",
)

In [155]:
# Load the text/embeddings table. The local copy is tried first; when that file
# cannot be opened, the same CSV is fetched from GitHub instead.
LOCAL_DATA_PATH = "/Users/nardoarevalo/Desktop/pandas_learning/notebooks/openai_embeddings/data.csv"
REMOTE_DATA_URL = (
    "https://raw.githubusercontent.com/nardoguy14/jupyter_notebooks/main/notebooks/openai_embeddings/data.csv")

try:
    df = pd.read_csv(LOCAL_DATA_PATH)
except OSError:
    # Only a missing or unreadable local file triggers the fallback. A malformed
    # CSV now surfaces as an error instead of being silently replaced by the web copy.
    print("loading from web!")
    df = pd.read_csv(REMOTE_DATA_URL)

# Each embeddings cell is read from the CSV as text; literal_eval turns it back
# into the Python object it spells out (a list of floats in the output below).
df['embeddings'] = df['embeddings'].apply(ast.literal_eval)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        41 non-null     object
 1   embeddings  41 non-null     object
dtypes: object(2)
memory usage: 784.0+ bytes


In [156]:
# Give every row an integer id equal to its position (0, 1, 2, ...).
# The count comes from the frame itself, so the cell keeps working when the
# dataset has a different number of rows than the 41 loaded here.
df['id'] = range(len(df))
df
    

Unnamed: 0,text,embeddings,id
0,Aetna Insurance\nAetna is a health insurance p...,"[0.011871634051203728, 0.009219045750796795, 0...",0
1,Insurance Company\nAnthem Blue Cross Health In...,"[0.002933195559307933, -0.003226845059543848, ...",1
2,Insurance Company\nBlue Shield Health Insuranc...,"[0.005859837867319584, -0.015740295872092247, ...",2
3,Bronze 60 Plan: Peace of Mind for Unexpected H...,"[0.01280609704554081, 0.015537003986537457, 0....",3
4,Chinese Community Health Insurance Coverage\nF...,"[0.007296297233551741, 0.012715067714452744, 0...",4
5,Covered California Deadline\nGeneral Applicati...,"[-0.013264380395412445, 0.003993576858192682, ...",5
6,Family Health Insurance in California\nMany fa...,"[0.01131445448845625, 0.007411356084048748, 0....",6
7,Covered California Gold 80 Plan: Gold is Golde...,"[0.024426080286502838, 0.01196149829775095, 0....",7
8,The Subsidy or Tax Credit of Healthcare Reform...,"[0.004846894647926092, -0.0026914558839052916,...",8
9,Health Net Health Insurance Coverage\nAbout He...,"[-0.008957654237747192, -0.0032053696922957897...",9


In [157]:
# Move the embeddings column to a column named "vector" (appended as the last
# column, so the order stays text, id, vector).
# presumably "vector" is the field name the quick-setup collection expects — TODO confirm
# The guard makes the cell safe to re-run: once the column has been moved there
# is no 'embeddings' column left, and the cell simply does nothing.
if 'embeddings' in df.columns:
    df['vector'] = df.pop('embeddings')

df


Unnamed: 0,text,id,vector
0,Aetna Insurance\nAetna is a health insurance p...,0,"[0.011871634051203728, 0.009219045750796795, 0..."
1,Insurance Company\nAnthem Blue Cross Health In...,1,"[0.002933195559307933, -0.003226845059543848, ..."
2,Insurance Company\nBlue Shield Health Insuranc...,2,"[0.005859837867319584, -0.015740295872092247, ..."
3,Bronze 60 Plan: Peace of Mind for Unexpected H...,3,"[0.01280609704554081, 0.015537003986537457, 0...."
4,Chinese Community Health Insurance Coverage\nF...,4,"[0.007296297233551741, 0.012715067714452744, 0..."
5,Covered California Deadline\nGeneral Applicati...,5,"[-0.013264380395412445, 0.003993576858192682, ..."
6,Family Health Insurance in California\nMany fa...,6,"[0.01131445448845625, 0.007411356084048748, 0...."
7,Covered California Gold 80 Plan: Gold is Golde...,7,"[0.024426080286502838, 0.01196149829775095, 0...."
8,The Subsidy or Tax Credit of Healthcare Reform...,8,"[0.004846894647926092, -0.0026914558839052916,..."
9,Health Net Health Insurance Coverage\nAbout He...,9,"[-0.008957654237747192, -0.0032053696922957897..."


In [158]:
# Tag every row with the same embeddings_type label
df['embeddings_type'] = "covered_california_insurance"

# Column order for the records: id, vector, then the two remaining columns
new_order = ['id', 'vector', 'text', 'embeddings_type']
df_reordered = df.loc[:, new_order]

# One dict per row, which is the payload handed to client.insert in the next cell
data = df_reordered.to_dict(orient='records')
df_reordered

Unnamed: 0,id,vector,text,embeddings_type
0,0,"[0.011871634051203728, 0.009219045750796795, 0...",Aetna Insurance\nAetna is a health insurance p...,covered_california_insurance
1,1,"[0.002933195559307933, -0.003226845059543848, ...",Insurance Company\nAnthem Blue Cross Health In...,covered_california_insurance
2,2,"[0.005859837867319584, -0.015740295872092247, ...",Insurance Company\nBlue Shield Health Insuranc...,covered_california_insurance
3,3,"[0.01280609704554081, 0.015537003986537457, 0....",Bronze 60 Plan: Peace of Mind for Unexpected H...,covered_california_insurance
4,4,"[0.007296297233551741, 0.012715067714452744, 0...",Chinese Community Health Insurance Coverage\nF...,covered_california_insurance
5,5,"[-0.013264380395412445, 0.003993576858192682, ...",Covered California Deadline\nGeneral Applicati...,covered_california_insurance
6,6,"[0.01131445448845625, 0.007411356084048748, 0....",Family Health Insurance in California\nMany fa...,covered_california_insurance
7,7,"[0.024426080286502838, 0.01196149829775095, 0....",Covered California Gold 80 Plan: Gold is Golde...,covered_california_insurance
8,8,"[0.004846894647926092, -0.0026914558839052916,...",The Subsidy or Tax Credit of Healthcare Reform...,covered_california_insurance
9,9,"[-0.008957654237747192, -0.0032053696922957897...",Health Net Health Insurance Coverage\nAbout He...,covered_california_insurance


In [159]:

# Insert the row dicts into the quick-setup collection and show the server's reply
res = client.insert(data=data, collection_name="open_api_embeddings")

print(res)

# Disabled cleanup helper, kept for manual use: removes ids 0..40 from the collection.
# client.delete(collection_name="open_api_embeddings", ids=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40])



{'insert_count': 41, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]}


In [149]:
# 6. Search with a single vector
# 6.1. Prepare query vectors
# query_vectors = [
# ]
# 
# # 6.2. Start search
# res = client.search(
#     collection_name="quick_setup",     # target collection
#     data=query_vectors,                # query vectors
#     limit=3,                           # number of returned entities
# )
# 
# print(res)

In [160]:
# Query with a filter expression: embeddings_type is read through $meta (it is
# not one of the fields declared when the collection was created above).
# Only the "text" field is returned, and at most one record.
res = client.query(
    limit=1,
    output_fields=["text"],
    filter='$meta["embeddings_type"] == "covered_california_insurance"',
    collection_name="open_api_embeddings",
)
print(res)

[{'text': 'Aetna Insurance\nAetna is a health insurance provider owned by CVS Health. It’s one of the oldest and largest health insurance companies in the country, with a history dating back to the 1800s. Aetna got its start selling life insurance. Today, it also offers various medical and dental plans nationwide.\n\nAetna wasn’t always available in California. In 2018, the California Department of Health Care Services (DHCS) approved Aetna joining Medi-Cal with its Better Health of California plan, offered in San Diego and Sacramento counties. Then, in 2023, Aetna CVS Health joined Covered California to offer individual and family health insurance.\n\nWhat Is Aetna Known For?\nPeople choose Aetna for its vast network of providers, competitive rates and long history in the insurance industry. It also offers benefits like access to 24/7 virtual care at no or low costs, depending on your plan, and mental health coverage. Aetna is regarded as a good health insurance provider overall.\n\nH

In [161]:
# 15. Drop collection
# client.drop_collection(
#     collection_name="open_api_embeddings"
# )