# Join DB

In this notebook we create the dataset that is used in the rest of the project. In particular, we join several of the .csv files available in MIMIC-III so that all the information about the hospitalizations and the clinical notes is available in a single dataset.

In [None]:
# Standard library
import importlib
import pickle
import sys
from os import listdir
from os.path import isfile, join

# Third-party
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

# Make the parent directory importable so the local `lib` package resolves.
# Guarded so that re-running this cell does not keep growing sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from lib.utils import Utils
import lib.NotesCleaning
# Reload BEFORE binding the class name: reloading afterwards would leave
# `NotesCleaning` pointing at the stale, pre-reload class object.
importlib.reload(lib.NotesCleaning)
from lib.NotesCleaning import NotesCleaning

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Notes dataset

First of all we load the csv with all the notes and then we clean it using the clean_note method.

In [3]:
# Load the raw MIMIC-III clinical notes table.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on large columns.
notes_csv = pd.read_csv("../../Dati/mimic-iii-clinical-database-1.4/NOTEEVENTS.csv", low_memory=False)

In [None]:
# Sanity check: (rows, columns) of the raw notes table.
notes_csv.shape

In [None]:
def _clean_single_note(raw_text):
    """Clean one raw note using a freshly created NotesCleaning instance."""
    return NotesCleaning().clean_note(raw_text)


# Store the cleaned version of every note next to the raw TEXT column.
notes_csv['Cleaned_Text'] = notes_csv['TEXT'].apply(_clean_single_note)

In [None]:
# Tokenizer that keeps runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')


def _tokenize_single_note(cleaned_text):
    """Tokenize one cleaned note with the shared regexp tokenizer."""
    return NotesCleaning().tokenize(cleaned_text, tokenizer)


# One token container per note, computed from the cleaned text.
notes_csv['Token'] = notes_csv['Cleaned_Text'].apply(_tokenize_single_note)

## Admission and diagnoses dataset

Now we load the admission dataset and the diagnoses dataset.
These two were cleaned using the pre_processing function of Doctor XAI.

In [None]:
# Read the ADMISSIONS and DIAGNOSES_ICD tables that were cleaned beforehand.
admission_df = pd.read_csv(
    "../../Dati/mimic-iii-clinical-database-1.4/CSV/clean_ADMISSIONS.csv"
)
diagnoses_csv = pd.read_csv(
    "../../Dati/mimic-iii-clinical-database-1.4/CSV/clean_DIAGNOSES_ICD.csv"
)

# Collapse the diagnoses to one row per (SUBJECT_ID, HADM_ID) pair, gathering
# the ICD9_CODE values of that pair into a Python list.
diagnoses_df = (
    diagnoses_csv
    .groupby(by=['SUBJECT_ID', 'HADM_ID'], as_index=False)
    .agg({'ICD9_CODE': list})
)

## Join the datasets

In [None]:
# Inner join: keep only admissions that also have grouped diagnoses.
tmp_join = pd.merge(
    admission_df,
    diagnoses_df,
    how='inner',
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [None]:
# Inner join: attach admission and diagnosis data to every note whose
# (SUBJECT_ID, HADM_ID) pair appears in the joined admissions table.
df = pd.merge(
    notes_csv,
    tmp_join,
    how='inner',
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [None]:
# Show all columns (same option as set in the imports cell, repeated so this
# cell displays the full width on its own) and preview the first rows.
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Sanity check: (rows, columns) of the merged notes/admissions table.
df.shape

In [None]:
# Start from an empty frame; the cleaned-text column is attached in the next cell.
cleanedText = pd.DataFrame()

In [None]:
# Copy only the cleaned note text (keeps df's row index).
# Must run before `df` is reduced to its final columns, which drops 'Cleaned_Text'.
cleanedText['Cleaned_Text'] = df['Cleaned_Text']

In [None]:
# Persist the cleaned text; the row index is written as the first (unnamed) column.
cleanedText.to_csv('../data/cleanedText.csv')

In [None]:
# Keep only the columns needed downstream.
# 'INSURANCE' is listed exactly once: selecting it twice creates duplicate
# column labels, which makes df['INSURANCE'] return a DataFrame instead of a
# Series, doubles the duplicates each time this cell is re-run, and is
# renamed to 'INSURANCE.1' when the CSV is read back.
df = df[['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'INSURANCE', 'ADMISSION_TYPE', 'TEXT', 'Token', 'ICD9_CODE', 'DIAGNOSIS']]

In [None]:
# Persist the merged dataset; the row index is written as the first (unnamed) column.
df.to_csv("../data/merged_dataset.csv")

## Reduced dataset with only "Discharge summary" notes

In [None]:
# Size of the merged dataset before filtering by note category.
df.shape

In [None]:
# Keep only the notes whose CATEGORY is exactly "Discharge summary".
df_discharge = df.loc[df['CATEGORY'] == "Discharge summary"]

In [None]:
# Size of the dataset after keeping only discharge summaries.
df_discharge.shape

In [None]:
# Preview the first discharge-summary rows.
df_discharge.head()

In [None]:
# Persist the discharge-summary subset; the row index is written as the first (unnamed) column.
df_discharge.to_csv("../data/df_discharge.csv")

## Reduced dataset with only diabetic patients

Now we reduce the size of the dataset by keeping only the patients that are classified with a diabetes-related code (250.00).

In [None]:
# Delegate the filtering to the project helper Utils.extract_diabete_only.
# NOTE(review): presumably it keeps rows whose ICD9_CODE list contains a
# diabetes code, as the text above says — confirm against lib/utils.
reduced_df = Utils().extract_diabete_only(df_discharge)

In [None]:
# Size of the dataset returned by the diabetes filter.
reduced_df.shape

In [None]:
# Persist the diabetes-only subset; the row index is written as the first (unnamed) column.
reduced_df.to_csv("../data/only_diabete.csv")

# Create a list with all the admissions and the diagnoses

In [None]:
# Load the arrays produced by the Doctor XAI preprocessing step.
# NOTE: allow_pickle=True lets NumPy unpickle arbitrary Python objects, which
# can execute code — only load these files from a trusted source.
# Presumably both arrays hold one sequence per patient, aligned by position
# (diagnosis codes and admission ids respectively) — TODO confirm.
dataset_sequences = np.load('../doctorXAI/preprocessing_doctorai/mimic_sequences.npy',allow_pickle=True)
admission_mimic_sequences = np.load('../doctorXAI/preprocessing_doctorai/admission_mimic_sequences.npy',allow_pickle=True)

In [None]:
# Flatten the two parallel nested sequences into two parallel flat lists:
# admissions[i] is an entry of admission_mimic_sequences and icd_9_list[i] is
# the entry of dataset_sequences found at the same (outer, inner) position.
#
# The loop variables deliberately use their own names: reusing
# `admission_mimic_sequences` as the loop target would rebind the global to
# the last inner sequence and make this cell give wrong results when re-run.
admissions = []
icd_9_list = []
for patient_codes, patient_admissions in zip(dataset_sequences, admission_mimic_sequences):
    # zip stops at the shorter of the two inner sequences.
    for visit_codes, admission_id in zip(patient_codes, patient_admissions):
        admissions.append(admission_id)
        icd_9_list.append(visit_codes)

In [None]:
# Lookup table: admission id -> its entry in icd_9_list.
# If an admission id occurs more than once, the last occurrence wins.
admissions_and_icd_9 = dict(zip(admissions, icd_9_list))

In [None]:
# Re-read the merged dataset written earlier in this notebook (rebinds `df`).
# Because that file was saved with its index, the index comes back as an extra
# unnamed first column, and list-valued columns come back as plain strings.
df = pd.read_csv("../data/merged_dataset.csv")

In [None]:
# Preview the re-loaded dataset.
df.head()

In [None]:
# Select the discharge-summary rows once and reuse them for both id columns.
discharge_rows = df[df['CATEGORY'] == "Discharge summary"]
admissions_id = discharge_rows['HADM_ID'].values
subject_id = discharge_rows['SUBJECT_ID'].values

# Group admission ids by patient: {SUBJECT_ID: [HADM_ID, ...]}, in row order.
df_dict = {}
for adm, sub in zip(admissions_id, subject_id):
    df_dict.setdefault(sub, []).append(adm)


In [None]:
# Build {subject id: ([admission ids as int], [looked-up code entries])}.
# A patient is included only when at least one of their admissions is found
# in admissions_and_icd_9.
#
# NOTE(review): the first list keeps every admission id of the patient while
# the second holds entries only for the ids that were found, so the two lists
# can differ in length — confirm that consumers expect this.
df_dictionary = {}
for k, patient_admission_ids in df_dict.items():
    # Use a dedicated name so the global `icd_9_list`, which the
    # admissions_and_icd_9 cell depends on, is not overwritten here.
    patient_codes = []
    for admission_id in patient_admission_ids:
        try:
            patient_codes.append(admissions_and_icd_9[int(admission_id)])
        except (KeyError, ValueError, TypeError):
            # KeyError: admission not present in the lookup table.
            # ValueError/TypeError: id cannot be converted to int (e.g. NaN/None).
            # Only these expected failures are skipped; anything else surfaces.
            continue
    if patient_codes:
        df_dictionary[k] = ([int(x) for x in patient_admission_ids], patient_codes)

In [None]:
# Persist the patient -> (admissions, codes) dictionary.
# The context manager ensures the file is flushed and closed even if
# pickling fails; a bare open() here would leave the handle open.
with open("../data/patient_admission_dictionary.pkl", "wb") as pkl_file:
    pickle.dump(df_dictionary, pkl_file)