In [13]:
# Install or upgrade the third-party packages into the kernel's environment.
# NOTE(review): versions are not pinned, so re-runs may pull different releases.
%pip install --upgrade openai plotly scikit-learn pymilvus numpy
# Load the dotenv IPython extension and run its %dotenv magic.
# Presumably this reads a local .env file that supplies OPENAI_API_KEY, which
# is read from os.environ in the next cell — TODO confirm.
%load_ext dotenv
%dotenv

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Downloading numpy-1.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: numpy
Successfully installed numpy-1.24.1
Note: you may need to restart the kernel to use updated packages.
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [4]:
import json
import pandas as pd
import re
import os
from os import walk
from openai.embeddings_utils import get_embedding, get_embeddings
import openai
# Take the OpenAI API key from the environment instead of hard-coding it;
# this raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']


# Names of the files directly inside ./txt-simple/. Only the first triple
# yielded by walk() is consumed, so sub-directories are never descended into;
# if walk() yields nothing the list is simply empty.
dirpath, dirnames, filenames = next(walk("./txt-simple/"), (None, [], []))
fhir_txt_filenames = list(filenames)

CHUNK_MAX = 4000

# A piece of text opens a new section when it starts with a dotted number
# such as "8.1" or with the literal text "Raw JSON".
_SECTION_START = re.compile(r"^(\d+\.\d+)|Raw JSON")


def chunk_file(filename, contents, chunk_max=CHUNK_MAX):
    """Split the lines of one file into chunks prefixed with the file name.

    Args:
        filename: Name written, followed by a newline, at the top of every
            chunk.
        contents: Iterable of strings (e.g. the result of ``readlines()``).
        chunk_max: Positive size limit in characters. Lines longer than this
            are cut into consecutive pieces of at most ``chunk_max``
            characters, and a new chunk is started whenever appending the
            next piece would push the current chunk (header included) past
            this limit.

    Returns:
        A list of strings, each ``f"{filename}\\n"`` followed by text. A new
        chunk also starts at every piece that opens a section. Everything
        before the first section-opening piece is skipped.

        NOTE(review): when no piece opens a section, the text accumulated in
        the first chunk is discarded together with the placeholder, so a
        short file without section headings yields an empty list. This is
        the original behaviour and is kept as-is — confirm it is intended.
    """
    # Cut over-long lines into consecutive slices. `start` is already a
    # character offset, so it is used directly as the slice start.
    pieces = []
    for line in contents:
        for start in range(0, len(line), chunk_max):
            pieces.append(line[start:start + chunk_max])

    # Index of the first piece that opens a section (0 when none does).
    first_section = 0
    for position, piece in enumerate(pieces):
        if _SECTION_START.match(piece):
            first_section = position
            break

    # chunks[0] is a header-only placeholder that is dropped on return.
    chunks = [f'{filename}\n']
    for piece in pieces[first_section:]:
        if len(chunks[-1]) + len(piece) > chunk_max or _SECTION_START.match(piece):
            chunks.append(f'{filename}\n{piece}')
        else:
            chunks[-1] += piece
    return chunks[1:]

# Map each collected file name to the list of chunks produced from its lines.
filename_to_chunks = {}
for fname in fhir_txt_filenames:
    with open(f'./txt-simple/{fname}') as fhandle:
        filename_to_chunks[fname] = chunk_file(fname, fhandle.readlines())

# Write the whole mapping to ./chunks.json as indented JSON.
with open("./chunks.json", "w") as chunks_file:
    chunks_file.write(json.dumps(filename_to_chunks, indent=2))

# Embed every file's chunks with the OpenAI API and write the vectors to
# ./simpler-embedded.json, one JSON array per line:
#     [filename, chunk_index, embedding_vector]
with open("./simpler-embedded.json", "w") as embedded_json:
    for filename in fhir_txt_filenames:
        chunks = filename_to_chunks[filename]
        if not chunks:
            # Nothing to embed for this file.
            continue
        print("Call", filename)
        # One request per file: all of its chunks are sent as a single batch.
        embed_data = openai.Embedding.create(input=chunks, model="text-embedding-ada-002")['data']
        # Each returned item reports the position of its input in `index`.
        # Order by that field and record it, so chunk i is always stored with
        # embedding i instead of relying on the order of the response list.
        for item in sorted(embed_data, key=lambda item: item['index']):
            json.dump([filename, item['index'], item['embedding']], embedded_json)
            embedded_json.write("\n")
        print("Embedded", filename)


Call valueset-all-distance-units.txt
Embedded valueset-all-distance-units.txt
Call sc-valueset-medicationknowledge-status.json.txt
Embedded sc-valueset-medicationknowledge-status.json.txt
Call valueset-encounter-type.json.txt
Embedded valueset-encounter-type.json.txt
Call valueset-condition-stage.txt
Embedded valueset-condition-stage.txt
Call valueset-verificationresult-status.json.txt
Embedded valueset-verificationresult-status.json.txt
Call catalogentry-definitions.txt
Embedded catalogentry-definitions.txt
Call healthcareservice-mappings.txt
Embedded healthcareservice-mappings.txt
Call hlaresult.txt
Embedded hlaresult.txt
Call medicationadministration-mappings.txt
Embedded medicationadministration-mappings.txt
Call valueset-procedure-not-performed-reason.txt
Embedded valueset-procedure-not-performed-reason.txt
Call valueset-legal-status-of-supply.txt
Embedded valueset-legal-status-of-supply.txt
Call valueset-measure-population.json.txt
Embedded valueset-measure-population.json.txt
Ca

In [5]:
import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
# Register the "default" connection to the Milvus server at localhost:19530.
connections.connect("default", host="localhost", port="19530")

# The four boolean flag columns share one definition; they are filled from
# substrings of the file name when rows are inserted.
flag_fields = [
    FieldSchema(name=flag_name, dtype=DataType.BOOL)
    for flag_name in ("core", "definition", "example", "valueset")
]

# Column order matters: inserts supply one column of values per field, in
# exactly this order.
fields = [
    # Caller-supplied integer primary key (auto_id is off).
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=128),
    *flag_fields,
    # Index of the chunk within its file.
    FieldSchema(name="chunk", dtype=DataType.INT16),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=1536),
]
schema = CollectionSchema(fields, "FHIR Spec embedded by openai")
fhir_milvus = Collection("fhir_spec_txt", schema)

# Number of rows sent to Milvus per insert request. Sending one request per
# row makes thousands of round trips; batching keeps the request count low.
INSERT_BATCH_SIZE = 500


def _insert_batch(rows):
    """Insert a non-empty list of row tuples into ``fhir_milvus`` in one request.

    Each row is ``(pk, filename, core, definition, example, valueset,
    chunk_index, embedding)``. Rows are transposed into one column per schema
    field, in schema order; the chunk column is converted to int16 exactly as
    the per-row insert did.
    """
    pks, filenames, cores, definitions, examples, valuesets, chunk_ids, vectors = zip(*rows)
    return fhir_milvus.insert([
        list(pks),
        list(filenames),
        list(cores),
        list(definitions),
        list(examples),
        list(valuesets),
        np.array(chunk_ids).astype(np.int16),
        list(vectors),
    ])


pk = 0
pending_rows = []
insert_result = None  # result of the most recent insert request, if any
with open("./simpler-embedded.json") as embedded_json:
    # Each line is a JSON array: [filename, chunk_index, embedding_vector].
    for line in embedded_json:
        pk += 1  # primary keys are 1-based line numbers of the file
        filename, chunk_index, embedding = json.loads(line)
        pending_rows.append((
            pk,
            filename,
            "-" not in filename,        # core: file name has no hyphen
            "definition" in filename,   # definition
            "example" in filename,      # example
            "valueset-" in filename,    # valueset
            chunk_index,
            embedding,
        ))
        if len(pending_rows) >= INSERT_BATCH_SIZE:
            insert_result = _insert_batch(pending_rows)
            pending_rows = []
# Send whatever is left over after the last full batch.
if pending_rows:
    insert_result = _insert_batch(pending_rows)
# After final entity is inserted, it is best to call flush to have no growing segments left in memory
fhir_milvus.flush()

# Create an IVF_FLAT index with the L2 metric and nlist=128 on the
# "embeddings" field.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
fhir_milvus.create_index(field_name="embeddings", index_params=index)



Status(code=0, message=)

In [14]:
# Swap the existing index for a FLAT one: release the collection, drop the
# current index, then create the replacement on the "embeddings" field.
fhir_milvus.release()
fhir_milvus.drop_index()

index = dict(index_type="FLAT", metric_type="L2", params=dict())
fhir_milvus.create_index(field_name="embeddings", index_params=index)
# The collection is not loaded here; the search cell calls fhir_milvus.load().


Status(code=0, message=)

In [69]:
# 

In [32]:
# Load the collection before searching it.
fhir_milvus.load()
search_params = dict(metric_type="L2", params={}, offset=0)

# Embed the query text with the same model that embedded the chunks.
query_response = openai.Embedding.create(
    input=["fhir patient identifier"],
    model="text-embedding-ada-002",
)
query_embedding = query_response['data'][0]['embedding']

# Five nearest chunks among rows whose `core` flag is true. A single query
# vector is sent, so [0] picks its result set.
results = fhir_milvus.search(
    data=[query_embedding],
    anns_field="embeddings",
    param=search_params,
    output_fields=['filename', 'chunk'],
    limit=5,
    expr="core==true",
)[0]

# Print each hit, then the chunk text looked up in the in-memory mapping
# built by the chunking cell.
for hit in results:
    print(hit.entity)
    hit_filename = hit.entity.get('filename')
    hit_chunk_index = hit.entity.get('chunk')
    chunk = filename_to_chunks[hit_filename][hit_chunk_index]
    print(chunk)

id: 5876, distance: 0.32363826036453247, entity: {'filename': 'patient-mappings.txt', 'chunk': 5}
patient-mappings.txt
8.1.16.5 RIM Mapping (http://hl7.org/v3 []) []

 
  PatientPatient[classCode=PAT]
      identifierid
      activestatusCode
      namename
      telecomtelecom
      genderplayer[classCode=PSN|ANM and determinerCode=INSTANCE]/administrativeGender
      birthDateplayer[classCode=PSN|ANM and determinerCode=INSTANCE]/birthTime
      deceased[x]player[classCode=PSN|ANM and determinerCode=INSTANCE]/deceasedInd, player[classCode=PSN|ANM and determinerCode=INSTANCE]/deceasedTime
      addressaddr
      maritalStatusplayer[classCode=PSN]/maritalStatusCode
      multipleBirth[x] player[classCode=PSN|ANM and determinerCode=INSTANCE]/multipleBirthInd, player[classCode=PSN|ANM and determinerCode=INSTANCE]/multipleBirthOrderNumber
      photoplayer[classCode=PSN|ANM and determinerCode=INSTANCE]/desc
      contactplayer[classCode=PSN|ANM and determinerCode=INSTANCE]/scopedRole[class