## Extract Most important MIMIC notes

In [2]:
import pandas as pd
from utils import *

config = load_config()
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

In [3]:
NOTE_PATH = config.note_path
HOSP_PATH = config.hosp_path


discharge = pd.read_csv(Path(NOTE_PATH).joinpath("discharge.csv.gz"), compression="gzip")
# discharge_detail = pd.read_csv(Path(NOTE_PATH).joinpath("discharge_detail.csv.gz"), compression="gzip", nrows=100)
# radiology = pd.read_csv(Path(NOTE_PATH).joinpath("radiology.csv.gz"), compression="gzip", nrows=100)
# radiology_detail = pd.read_csv(Path(NOTE_PATH).joinpath("radiology_detail.csv.gz"), compression="gzip", nrows=100)

def load_mimic_hosp() :
    df = pd.read_csv(Path(HOSP_PATH).joinpath("diagnoses_icd.csv.gz"), compression="gzip")
    return df

mimic_hosp = load_mimic_hosp()

In [24]:
# filter those notes that contains discharge notes
# use the "subject_id" & "hadm_id" as primary key

mimic_hosp['primary_key'] = mimic_hosp.apply(lambda x : str(x['subject_id']) + str(x["hadm_id"]), axis=1)
discharge['primary_key'] = discharge.apply(lambda x :str(x['subject_id']) + str(x["hadm_id"]), axis=1)

In [28]:
intersection = list(set(mimic_hosp.primary_key) & set(discharge.primary_key))

In [31]:
# filter
mimic_hosp_filtered = mimic_hosp[mimic_hosp.primary_key.isin(intersection)].reset_index(drop=True)
discharge_filtered = discharge[discharge.primary_key.isin(intersection)].reset_index(drop=True)

In [38]:
# icd codes to taking only three digits
mimic_hosp_filtered["icd_code_modified"] = mimic_hosp_filtered["icd_code"].apply(lambda x : x[:3])

In [55]:
# Re-import necessary libraries after kernel reset
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer

# Re-load the dataset
icd_data = pd.read_csv(DATA_PATH.joinpath("icd_grouped.csv"))

# Tokenize the icd_code_modified column
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
X = vectorizer.fit_transform(icd_data['icd_code_modified'])

# Reduce dimensionality with TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X)

# Perform hierarchical clustering using AgglomerativeClustering
agglomerative = AgglomerativeClustering(n_clusters=None, distance_threshold=10, linkage='ward')
clusters = agglomerative.fit_predict(X_reduced)

# Add the new clusters to the dataframe
icd_data['cluster'] = clusters

# Save the updated CSV file with cluster information
output_file_path = DATA_PATH.joinpath('icd_grouped_with_clusters.csv')
icd_data.to_csv(output_file_path, index=False)



: 

In [39]:
mimic_hosp_filtered.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,primary_key,icd_code_modified
0,10000032,22595853,1,5723,9,1000003222595853,572
1,10000032,22595853,2,78959,9,1000003222595853,789
2,10000032,22595853,3,5715,9,1000003222595853,571
3,10000032,22595853,4,7070,9,1000003222595853,70
4,10000032,22595853,5,496,9,1000003222595853,496


In [52]:
# form custom codes per patient 

icd_grouped = mimic_hosp_filtered[["icd_code_modified", "primary_key"]].groupby("primary_key", as_index=False).agg({"icd_code_modified": ", ".join})

In [53]:
icd_grouped.to_csv(DATA_PATH.joinpath("icd_grouped.csv"), index=False)

In [32]:
# save discharge notes that are selected

discharge_filtered.to_pickle(DATA_PATH.joinpath("discharged_filtered.pkl"))

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,primary_key
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...,1000003222595853
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...,1000003222841357
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...,1000003229079034
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...,1000003225742920
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...,1000008423052089
...,...,...,...,...,...,...,...,...,...
331641,19999828-DS-6,19999828,29734428,DS,6,2147-08-04 00:00:00,2147-08-12 15:36:00,\nName: ___ Unit No: ___...,1999982829734428
331642,19999828-DS-7,19999828,25744818,DS,7,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,1999982825744818
331643,19999840-DS-20,19999840,26071774,DS,20,2164-07-28 00:00:00,2164-07-29 14:52:00,\nName: ___ Unit No: ___\...,1999984026071774
331644,19999840-DS-21,19999840,21033226,DS,21,2164-09-17 00:00:00,2164-09-18 01:36:00,\nName: ___ Unit No: ___\...,1999984021033226


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,primary_key
0,10000032,22595853,1,5723,9,1000003222595853
1,10000032,22595853,2,78959,9,1000003222595853
2,10000032,22595853,3,5715,9,1000003222595853
3,10000032,22595853,4,7070,9,1000003222595853
4,10000032,22595853,5,496,9,1000003222595853


In [12]:
# Creating own criteria of ICD Codes

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,07070,9
4,10000032,22595853,5,496,9
...,...,...,...,...,...
95,10000764,27897940,2,41071,9
96,10000764,27897940,3,5849,9
97,10000764,27897940,4,2875,9
98,10000764,27897940,5,7802,9


In [22]:
print(discharge.text[0])                                    

 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Worsening ABD distension and pain 
 
Major Surgical or Invasive Procedure:
Paracentesis

 
History of Present Illness:
___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, 
bioplar, PTSD, presented from OSH ED with worsening abd 
distension over past week.  
Pt reports self-discontinuing lasix and spirnolactone ___ weeks 
ago, because she feels like "they don't do anything" and that 
she "doesn't want to put more chemicals in her." She does not 
follow Na-restricted diets. In the past week, she notes that she 
has been having worsening abd distension and discomfort. She 
denies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, 
dysuria. She had food poisoning a week ago from eating stale 
cake (n/v 20 min after fo