In [None]:
!python3 -m pip install pymilvus

# You'd better restart the runtime after installing pymilvus.
# To do so, choose "Runtime > Restart Runtime" from the above main menu.

Collecting pymilvus
  Downloading pymilvus-2.2.14-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<=1.56.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

## Connect to cluster

When creating a dedicated cluster, you need to configure a cluster credential consisting of a pair of username and password. Be sure to take note of these details, as you’ll need them to connect to the cluster.

If you prefer to connect over a private link, replace the URI with your private link endpoint. Before doing so, make sure you have access to that private link. For details, see [Set up Private Link](https://docs.zilliz.com/docs/set-up-a-private-link).

In [None]:
import json
import os

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Connection settings. Prefer supplying the real values through environment
# variables so that credentials are never saved inside the notebook. The
# placeholder strings are used only when a variable is unset and must then be
# replaced by hand before running this cell.
CLUSTER_ENDPOINT = os.environ.get("ZILLIZ_CLUSTER_ENDPOINT", "replace-this-with-your-cluster-endpoint")
TOKEN = os.environ.get("ZILLIZ_TOKEN", "replace-this-with-your-token")

# Connect to cluster
connections.connect(
    alias='default',
    # Public endpoint obtained from Zilliz Cloud
    uri=CLUSTER_ENDPOINT,
    secure=True,
    # Credential (username and password) specified when you created this cluster
    token=TOKEN,
    # Or continue using the legacy `user` and `password` arguments in place of `token`:
    # user='',
    # password=''
)

## Define JSON field

To define a JSON field, simply follow the same procedure as defining fields of other types.

In the following code, `article_meta` is a JSON field because its `dtype` is set to `DataType.JSON`.

In [None]:
import json
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

COLLECTION_NAME = "medium_articles_with_json"

# 1. define fields
fields = [
    # Auto-generated INT64 primary key. No `max_length` here: that setting
    # applies to VARCHAR fields only.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=768),
    # JSON field holding the remaining per-article metadata.
    FieldSchema(name="article_meta", dtype=DataType.JSON),
]

# 2. create schema
schema = CollectionSchema(fields)

# 3. reference the schema in a collection
# Start from a clean collection so that re-running the notebook does not insert
# the dataset a second time into a collection left over from an earlier run
# (which shows up as duplicate hits in the search results).
# `utility` is imported in the connection cell.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)
collection = Collection(COLLECTION_NAME, schema)

# 4. index the vector field
index_params = {
    "index_type": "AUTOINDEX",
    "metric_type": "L2",
    "params": {},
}

collection.create_index(
    field_name="title_vector",
    index_params=index_params,
)

# 5. load the collection
collection.load()



## Insert field values

After creating a collection from the `CollectionSchema` object, dictionaries whose keys match the schema fields, such as the ones built in the data-preparation cell below, can be inserted into it.

In [None]:
# Download the dataset

!curl https://assets.zilliz.com/medium_articles_2020_dpr_a13e0377ae.json \
    --output medium_articles_2020_dpr.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 60.4M  100 60.4M    0     0   122M      0 --:--:-- --:--:-- --:--:--  122M


In [None]:
# 6. prepare data
# Read the file as UTF-8 explicitly; the default text encoding of open() is
# platform-dependent. The file is closed as soon as it has been parsed.
with open("medium_articles_2020_dpr.json", encoding="utf-8") as f:
    data = json.load(f)

list_of_rows = data['rows']

# Keep the title and its vector as regular fields and pack the remaining
# article attributes into the `article_meta` JSON field.
data_rows = [
    {
        "title": row["title"],
        "title_vector": row["title_vector"],
        "article_meta": {
            "link": row["link"],
            "reading_time": row["reading_time"],
            "publication": row["publication"],
            "claps": row["claps"],
            "responses": row["responses"],
        },
    }
    for row in list_of_rows
]

# Preview the first row. Only the first few vector components are shown so the
# cell output is not flooded with the whole vector.
preview = dict(data_rows[0])
preview["title_vector"] = preview["title_vector"][:3] + ["..."]
print(preview)

# Output:
# {
#       'title': 'The Reported Mortality Rate of Coronavirus Is Not Important',
#       'title_vector': [0.041732933, 0.013779674, -0.027564144, '...'],
#       'article_meta': {
#         'link': 'https://medium.com/swlh/the-reported-mortality-rate-of-coronavirus-is-not-important-369989c8d912',
#         'reading_time': 13,
#         'publication': 'The Startup',
#         'claps': 1100,
#         'responses': 18
#       }
# }

# 7. insert data
collection.insert(data_rows)
collection.flush()

print("Entity counts: ", collection.num_entities)

# Output:
# Entity counts:  5979

{'title': 'The Reported Mortality Rate of Coronavirus Is Not Important', 'title_vector': [0.041732933, 0.013779674, -0.027564144, -0.013061441, 0.009748648, 0.00082446384, -0.00071647146, 0.048612226, -0.04836573, -0.04567751, 0.018008126, 0.0063936645, -0.011913628, 0.030776596, -0.018274948, 0.019929802, 0.020547243, 0.032735646, -0.031652678, -0.033816382, -0.051087562, -0.033748355, 0.0039493158, 0.009246126, -0.060236514, -0.017136049, 0.028754413, -0.008433934, 0.011168004, -0.012391256, -0.011225835, 0.031775184, 0.002929508, -0.007448661, -0.005337719, -0.010999258, -0.01515909, -0.005130484, 0.0060212007, 0.0034560722, -0.022935811, -0.04970116, -0.0155887455, 0.06627353, -0.006052789, -0.051570725, -0.109865054, 0.033205193, 0.00041118253, 0.0029823708, 0.036160238, -0.011256539, 0.00023560718, 0.058322437, 0.022275906, 0.015206677, -0.02884609, 0.0016338055, 0.0049200393, 0.014388571, -0.0049061654, -0.04664761, -0.027454877, 0.017526226, -0.005100602, 0.018090058, 0.0270099

## Search within JSON field

Once all of your data has been added, you can conduct searches using the keys in the JSON field in the same manner as you would with a standard scalar field. Simply follow these steps:

In [None]:
# 8. search data
# The title vector of the first prepared row serves as the query vector.
query_vector = data_rows[0]['title_vector']
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
# Boolean filter evaluated against keys inside the `article_meta` JSON field.
json_filter = 'article_meta["claps"] > 30 and article_meta["reading_time"] < 10'

result = collection.search(
    data=[query_vector],
    anns_field="title_vector",
    param=search_params,
    limit=3,
    expr=json_filter,
    output_fields=["title", "article_meta"],
)

# One `hits` object is returned per query vector.
for hits in result:
    print("Matched IDs: ", hits.ids)
    print("Distance to the query vector: ", hits.distances)
    print("Matched articles: ")
    for hit in hits:
        entity = hit.entity
        title = entity.get("title")
        meta = entity.get("article_meta")
        print("Title: ", title, ", Reading time: ", meta['reading_time'], ", Claps", meta['claps'])

# Output:
# Matched IDs:  [442206870370198289, 442206870370198323, 442206870370196123]
# Distance to the query vector:  [0.36103835701942444, 0.37674015760421753, 0.4162980318069458]
# Matched articles:
# Title:  The Hidden Side Effect of the Coronavirus , Reading time:  8 , Claps 83
# Title:  Why The Coronavirus Mortality Rate is Misleading , Reading time:  9 , Claps 2900
# Title:  Coronavirus shows what ethical Amazon could look like , Reading time:  4 , Claps 51

Matched IDs:  [443301488834178231, 443301488834184214, 443301488834178265]
Distance to the query vector:  [0.36103835701942444, 0.36103835701942444, 0.37674015760421753]
Matched articles: 
Title:  The Hidden Side Effect of the Coronavirus , Reading time:  8 , Claps 83
Title:  The Hidden Side Effect of the Coronavirus , Reading time:  8 , Claps 83
Title:  Why The Coronavirus Mortality Rate is Misleading , Reading time:  9 , Claps 2900


## Drop collection

You can drop the collection as follows:

In [None]:
res = utility.drop_collection("medium_articles_with_json")

# drop_collection() returns None, so printing `res` says nothing about the
# outcome. Confirm explicitly that the collection is gone (expected: False).
print("Collection still exists: ", utility.has_collection("medium_articles_with_json"))

None
