<b> Make sure the IRIS database is running before executing this notebook. A quick way to start it with Docker: </b> \
docker run -d --name iris-comm -p 1972:1972 -p 52773:52773 -e IRIS_PASSWORD=demo -e IRIS_USERNAME=demo intersystemsdc/iris-community:latest

# Loading the data to a dataframe
The data used here is the D_ICD_DIAGNOSES table from the Medical Information Mart for Intensive Care (MIMIC)-IV Demo database. \
<i>" Medical Information Mart for Intensive Care (MIMIC)-IV database is comprised of deidentified electronic health records for patients admitted to the Beth Israel Deaconess Medical Center " </i> \
Johnson, A., Bulgarelli, L., Pollard, T., Horng, S., Celi, L. A., & Mark, R. (2023). MIMIC-IV Clinical Database Demo (version 2.2). PhysioNet. https://doi.org/10.13026/dp1f-ex47.

In [2]:
import pandas as pd

In [3]:
# Read the ICD diagnosis lookup table; the path is relative, so the CSV has to
# be in the directory the notebook is run from.
csv_path = "D_ICD_DIAGNOSES.csv"
diagnoses = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to check what was loaded.
diagnoses.head(n=5)

Unnamed: 0,row_id,icd9_code,short_title,long_title
0,1,1716,Erythem nod tb-oth test,Erythema nodosum with hypersensitivity reactio...
1,2,1720,TB periph lymph-unspec,"Tuberculosis of peripheral lymph nodes, unspec..."
2,3,1721,TB periph lymph-no exam,"Tuberculosis of peripheral lymph nodes, bacter..."
3,4,1722,TB periph lymph-exam unk,"Tuberculosis of peripheral lymph nodes, bacter..."
4,5,1723,TB periph lymph-micro dx,"Tuberculosis of peripheral lymph nodes, tuberc..."


# Generating embeddings

In [None]:
# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer

In [None]:
# Embedding model used for both the stored diagnosis titles and the search queries.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [7]:
# Encode every long_title in one batched call instead of row by row.
# Normalized embeddings are requested; the searches later in this notebook
# rank rows with VECTOR_DOT_PRODUCT.
long_titles = diagnoses['long_title'].tolist()
embeddings = model.encode(long_titles, normalize_embeddings=True)
# Keep each embedding next to its row as a plain Python list.
diagnoses['long_title_vector'] = embeddings.tolist()

In [8]:
# Preview the first five rows again, now including the long_title_vector column.
diagnoses.head(n=5)

Unnamed: 0,row_id,icd9_code,short_title,long_title,long_title_vector
0,1,1716,Erythem nod tb-oth test,Erythema nodosum with hypersensitivity reactio...,"[0.050381243228912354, -0.01662275940179825, -..."
1,2,1720,TB periph lymph-unspec,"Tuberculosis of peripheral lymph nodes, unspec...","[0.03993239626288414, 0.006474997382611036, -0..."
2,3,1721,TB periph lymph-no exam,"Tuberculosis of peripheral lymph nodes, bacter...","[0.09322471916675568, 0.0053528607822954655, -..."
3,4,1722,TB periph lymph-exam unk,"Tuberculosis of peripheral lymph nodes, bacter...","[0.07309558987617493, -0.0028698795940726995, ..."
4,5,1723,TB periph lymph-micro dx,"Tuberculosis of peripheral lymph nodes, tuberc...","[0.03763028606772423, -0.02204110100865364, -0..."


# IRIS database operations

In [None]:
!pip install intersystems_irispython-3.2.0-py3-none-any.whl
import iris
import time

## Database connection settings

In [12]:
# Where the IRIS instance listens. These values match the docker command at the
# top of the notebook (superserver port 1972, demo/demo credentials).
hostname = "localhost"
port = 1972
namespace = "USER"

# Demo credentials only -- do not reuse outside a throwaway container.
username = "demo"
password = "demo"

# Connection string in the "host:port/namespace" form passed to iris.connect.
connection_string = f"{hostname}:{port}/{namespace}"

In [23]:
# Open the database connection and the cursor used by every SQL cell below.
# NOTE: outside a demo, conn and cursor should be managed with a context manager
# or a try/except/finally block so they are released even when a cell fails.
conn = iris.connect(connection_string, username, password)
cursor = conn.cursor()

In [14]:
# Schema-qualified name of the table that holds the diagnoses and their embeddings.
tableName = "Demo.VectorDiagnoses"

# One entry per column, joined into the parenthesised list that follows CREATE TABLE.
_column_specs = [
    "row_id INTEGER",
    "icd9_code VARCHAR(255)",
    "short_title VARCHAR(255)",
    # NOTE(review): VARCHAR('') is presumably IRIS syntax for a string without a
    # length limit -- confirm against the IRIS SQL data type reference.
    "long_title VARCHAR('')",
    # The vector length (384) has to match the size of the embeddings stored in this column.
    "long_title_vector VECTOR(DOUBLE,384)",
]
tableDefinition = "(" + ", ".join(_column_specs) + ")"

In [15]:
# Start from a clean slate: drop the table if a previous run left one behind,
# then create it again with the current definition.
try:
    cursor.execute(f"DROP TABLE {tableName}")
except Exception:
    # Best effort: the drop is expected to fail when the table does not exist
    # yet (e.g. on the first run). Catching Exception instead of using a bare
    # except lets KeyboardInterrupt and SystemExit propagate; any real problem
    # will resurface in the CREATE TABLE below.
    pass
cursor.execute(f"CREATE TABLE {tableName} {tableDefinition}")

0

In [16]:
# Copy every DataFrame row into the IRIS table, one parameterized INSERT per row.
# The target table comes from tableName (as in the DROP/CREATE statements), so
# renaming the table in one place cannot send the inserts somewhere else.
sql = f"Insert into {tableName} (row_id, icd9_code,short_title,long_title,long_title_vector) values (?, ?, ?, ?, ?)"
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments.
start_time = time.perf_counter()
for _, row in diagnoses.iterrows():
    # The embedding (a Python list) is passed as its string form, "[v1, v2, ...]".
    cursor.execute(sql, [row.row_id, row.icd9_code, row.short_title, row.long_title, str(row.long_title_vector)])
end_time = time.perf_counter()
print(f"time taken to add {len(diagnoses)} entries: {end_time-start_time} seconds")

time taken to add 14567 entries: 40.83518600463867 seconds


## difficulty walking

In [17]:
# Free-text query, embedded with the same model and normalization as the stored titles.
issueDescription = "difficulty walking"
query_embedding = model.encode(issueDescription, normalize_embeddings=True)
# Convert to a plain list so it can be sent to the database as a string.
issueDescription_vector = query_embedding.tolist()

In [18]:
# Print the numberOfResults rows whose stored embedding has the largest dot
# product with the query embedding (best match first).
numberOfResults = 10
# The table name comes from tableName, as in the DROP/CREATE statements, so the
# search always targets the table that was actually created.
sql = (
    f"select Top ? row_id, icd9_code, short_title, long_title from {tableName} "
    "ORDER BY VECTOR_DOT_PRODUCT(long_title_vector, TO_VECTOR(?)) DESC"
)
# The query embedding is passed as its string form and converted by TO_VECTOR in SQL.
cursor.execute(sql, [numberOfResults, str(issueDescription_vector)])
fetched_data = cursor.fetchall()
for row in fetched_data:
    print(row)

[14143, '7197', 'Difficulty in walking', 'Difficulty in walking']
[13460, 'E0010', 'Walking,marching,hiking', 'Activities involving walking, marching and hiking']
[12108, '7812', 'Abnormality of gait', 'Abnormality of gait']
[9681, 'E0190', 'Walking an animal', 'Activities involving walking an animal']
[14034, 'V690', 'Lack of physical exercse', 'Lack of physical exercise']
[9838, 'E8859', 'Fall from slipping NEC', 'Fall from other slipping, tripping, or stumbling']
[3596, '32751', 'Periodic limb movement', 'Periodic limb movement disorder']
[11690, 'E9272', 'Excess physical exert', 'Excessive physical exertion']
[10205, 'V499', 'Probl influ health NOS', 'Unspecified problems with limbs and other problems']
[10176, 'V491', 'Mechanical prob w limbs', 'Mechanical problems with limbs']


## respiratory issues

In [24]:
# Second free-text query, embedded with the same model and normalization as the stored titles.
issueDescription = "respiratory issues"
query_embedding = model.encode(issueDescription, normalize_embeddings=True)
# Convert to a plain list so it can be sent to the database as a string.
issueDescription_vector = query_embedding.tolist()

In [25]:
# Print the numberOfResults rows whose stored embedding has the largest dot
# product with the query embedding (best match first).
numberOfResults = 9
# The table name comes from tableName, as in the DROP/CREATE statements, so the
# search always targets the table that was actually created.
sql = (
    f"select Top ? row_id, icd9_code, short_title, long_title from {tableName} "
    "ORDER BY VECTOR_DOT_PRODUCT(long_title_vector, TO_VECTOR(?)) DESC"
)
# The query embedding is passed as its string form and converted by TO_VECTOR in SQL.
cursor.execute(sql, [numberOfResults, str(issueDescription_vector)])
fetched_data = cursor.fetchall()
for row in fetched_data:
    print(row)

[12369, '99739', 'Respiratory comp NEC', 'Other respiratory complications']
[13003, '78609', 'Respiratory abnorm NEC', 'Other respiratory abnormalities']
[12995, '78600', 'Respiratory abnorm NOS', 'Respiratory abnormality, unspecified']
[5281, '51883', 'Chronic respiratory fail', 'Chronic respiratory failure']
[5282, '51884', 'Acute & chronc resp fail', 'Acute and chronic respiratory failure']
[2933, '3061', 'Psychogenic respir dis', 'Respiratory malfunction arising from mental factors']
[14505, 'V814', 'Screen-respir cond NEC', 'Screening for other and unspecified respiratory conditions']
[9040, '77089', 'Resp prob after brth NEC', 'Other respiratory problems after birth']
[7376, '7488', 'Respiratory anomaly NEC', 'Other specified anomalies of respiratory system']


In [26]:
# Commit the pending work, then release the cursor and the connection.
# The nested finally blocks make sure the connection is closed even if the
# commit or the cursor close raises.
try:
    conn.commit()
finally:
    try:
        cursor.close()
    finally:
        conn.close()