<a href="https://colab.research.google.com/github/okechukwuchude/Automating-Medical-Coding/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
import itertools
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
#ICD codes and meaningg
D_ICD_DIAG = pd.read_csv('/content/drive/MyDrive/#medical coding/mimic-iii-clinical-database-carevue-subset-1.4/NOTEEVENTS.csv.gz',compression='gzip')
D_ICD_PROC = pd.read_csv('/content/drive/MyDrive/#medical coding/mimic-iii-clinical-database-carevue-subset-1.4/D_ICD_PROCEDURES.csv.gz',compression='gzip')

In [5]:
#original ICD files
diagnoses_icd = pd.read_csv('/content/drive/MyDrive/#medical coding/mimic-iii-clinical-database-carevue-subset-1.4/DIAGNOSES_ICD.csv.gz',compression='gzip')
procedures_icd = pd.read_csv('/content/drive/MyDrive/#medical coding/mimic-iii-clinical-database-carevue-subset-1.4/PROCEDURES_ICD.csv.gz',compression='gzip')

In [6]:
diagnoses_icd.head(5)

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code
0,1,2,163353,1.0,V3001
1,2,2,163353,2.0,V053
2,3,2,163353,3.0,V290
3,4,3,145834,1.0,0389
4,5,3,145834,2.0,78559


In [7]:
diagnoses_icd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225345 entries, 0 to 225344
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   row_id      225345 non-null  int64  
 1   subject_id  225345 non-null  int64  
 2   hadm_id     225345 non-null  int64  
 3   seq_num     225339 non-null  float64
 4   icd9_code   225339 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 8.6+ MB


In [9]:
#Read notes source file
notes = pd.read_csv('/content/drive/MyDrive/#medical coding/mimic-iii-clinical-database-carevue-subset-1.4/NOTEEVENTS.csv.gz', compression='gzip')
keep = notes[['hadm_id','category','text']]

In [10]:
keep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880107 entries, 0 to 880106
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hadm_id   861038 non-null  float64
 1   category  880107 non-null  object 
 2   text      880107 non-null  object 
dtypes: float64(1), object(2)
memory usage: 20.1+ MB


In [11]:
len(diagnoses_icd['icd9_code'].unique())


5054

In [12]:
keep

Unnamed: 0,hadm_id,category,text
0,,Radiology,[**2119-1-4**] 12:59 PM\n ABDOMEN U.S. (COMPLE...
1,,Radiology,[**2119-1-9**] 1:05 PM\n MR LIVER WITH CONTRAS...
2,,Radiology,[**2119-1-16**] 9:24 PM\n CHEST (PORTABLE AP) ...
3,,Radiology,[**2119-1-18**] 1:24 PM\n CT ABD W&W/O C; CT P...
4,,Radiology,[**2119-1-18**] 3:45 PM\n PARACENTESIS DIAG. O...
...,...,...,...
880102,104049.0,Nursing/other,1. FEN\nTF= min140cc/k/d of BM/E24. min47cc q4...
880103,104049.0,Nursing/other,I have examined pt. & agree w/ [**First Name8 ...
880104,104049.0,Nursing/other,NPN nights\n\n\nFluids/Nutrition: Weight 2025...
880105,104049.0,Nursing/other,Attending Note\nDay of life 12 PMA 35 [**2-7**...


In [13]:
len(diagnoses_icd['hadm_id'].unique())

26836

In [14]:
#checking for null values
keep.isnull().sum()

hadm_id     19069
category        0
text            0
dtype: int64

In [15]:
# Dictionary to store diagnoses data with admission IDs as keys and lists of diagnosis ICD codes as values
diagnoses_dict = {}

# Iterate through each row in the diagnoses_icd DataFrame
for i in range(len(diagnoses_icd)):
    # Get the current row
    entry = diagnoses_icd.iloc[i]
    # Extract the admission ID and ICD code from the current row
    hadm = entry['hadm_id']
    icd = entry['icd9_code']
    # Check if the admission ID already exists in the diagnoses_dict
    if hadm not in diagnoses_dict:
        # If not, create a new entry with the admission ID as key and a list containing the ICD code as value
        diagnoses_dict[hadm] = [icd]
    else:
        # If the admission ID already exists, append the ICD code to the existing list of codes
        diagnoses_dict[hadm].append(icd)


In [16]:
# Dictionary to store procedures data with admission IDs as keys and lists of procedure ICD codes as values
procedures_dict = {}

# Iterate through each row in the procedures_icd DataFrame
for i in range(len(procedures_icd)):
    # Get the current row
    entry = procedures_icd.iloc[i]
    # Extract the admission ID and ICD code from the current row
    hadm = entry['hadm_id']
    icd = entry['icd9_code']
    # Check if the admission ID already exists in the procedures_dict
    if hadm not in procedures_dict:
        # If not, create a new entry with the admission ID as key and a list containing the ICD code as value
        procedures_dict[hadm] = [icd]
    else:
        # If the admission ID already exists, append the ICD code to the existing list of codes
        procedures_dict[hadm].append(icd)

In [17]:
diagnoses_df = pd.DataFrame.from_dict(diagnoses_dict,orient='index')
procedures_df = pd.DataFrame.from_dict(procedures_dict,orient='index')

In [18]:
diagnoses_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
163353,V3001,V053,V290,,,,,,,,...,,,,,,,,,,
145834,0389,78559,5849,4275,41071,4280,6826,4254,2639,,...,,,,,,,,,,
178980,V3000,V053,V290,,,,,,,,...,,,,,,,,,,
118037,V3001,V053,V290,,,,,,,,...,,,,,,,,,,
159514,V3001,7706,7746,V290,V502,V053,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128876,V3000,V290,V053,,,,,,,,...,,,,,,,,,,
105824,V3001,76502,7705,7702,769,7742,7793,7756,7470,77082,...,,,,,,,,,,
104049,V3000,7731,76517,76527,V290,V053,V502,,,,...,,,,,,,,,,
140728,41401,42822,78820,44021,496,25000,4414,4280,4019,3051,...,,,,,,,,,,


In [19]:
# Rename the columns of the diagnoses DataFrame to include a prefix 'DIAG_CODE' followed by a number for each column
diagnoses_df.columns = ['DIAG_CODE'+str(i) for i in range(1, len(diagnoses_df.columns) + 1)]

# Set the name of the index of the diagnoses DataFrame to 'HADM_ID'
diagnoses_df.index.name = 'hadm_id'

# Rename the columns of the procedures DataFrame to include a prefix 'PRCD_CODE' followed by a number for each column
procedures_df.columns = ['PRCD_CODE'+str(i) for i in range(1, len(procedures_df.columns) + 1)]

# Set the name of the index of the procedures DataFrame to 'HADM_ID'
procedures_df.index.name = 'hadm_id'

# Merge the diagnoses and procedures DataFrames using an outer join based on the 'HADM_ID' column
# The resulting DataFrame will contain all unique 'HADM_ID' values from both DataFrames,
# with diagnosis and procedure codes aligned accordingly. Missing values will be filled with NaN.
codes_df = pd.merge(diagnoses_df, procedures_df, how='outer', on='hadm_id')


In [20]:
# For each row in the diagnoses DataFrame, join all non-null entries (diagnosis codes) into a single string, separated by commas
diagnoses_df['DIAG_CODES'] = diagnoses_df[diagnoses_df.columns[:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

# For each row in the procedures DataFrame, join all non-null entries (procedure codes) into a single string, separated by commas
procedures_df['PROC_CODES'] = procedures_df[procedures_df.columns[:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)


In [21]:
# Extract the 'DIAG_CODES' column from the diagnoses DataFrame
diagnoses = diagnoses_df[['DIAG_CODES']]

# Extract the 'PROC_CODES' column from the procedures DataFrame
procedures = procedures_df[['PROC_CODES']]

# Merge the 'DIAG_CODES' and 'PROC_CODES' DataFrames using an outer join based on the 'HADM_ID' column
codes = pd.merge(diagnoses, procedures, how='outer', on='hadm_id')

# Drop any rows with missing values (NaN) from the merged DataFrame
codes = codes.dropna()


In [22]:
codes.to_csv('CODES.csv')

In [23]:
print(keep.columns)

Index(['hadm_id', 'category', 'text'], dtype='object')


In [24]:
print(codes.columns)

Index(['DIAG_CODES', 'PROC_CODES'], dtype='object')


In [25]:
# Merge the 'KEEP' DataFrame with the 'codes' DataFrame using a left join based on the 'HADM_ID' column
merged_df = pd.merge(keep, codes, how='left', on='hadm_id')

# Drop any rows with missing values (NaN) from the merged DataFrame
merged_df = merged_df.dropna()
merged_df= merged_df.set_index('hadm_id')


In [26]:
merged_df

Unnamed: 0_level_0,category,text,DIAG_CODES,PROC_CODES
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
102314.0,Radiology,"[**2119-2-3**] 4:10 PM\n US ABD LIMIT, SINGLE ...",57220705457155724518550709985945385849,"5059.0,311.0,3324.0,9672.0,5011.0,3893.0,3995...."
102314.0,Radiology,"[**2119-2-4**] 10:38 AM\n US ABD LIMIT, SINGLE...",57220705457155724518550709985945385849,"5059.0,311.0,3324.0,9672.0,5011.0,3893.0,3995...."
185945.0,Radiology,[**2167-2-14**] 10:41 AM\n CHEST (PA & LAT) ...,4240428051881425442611997142731423999811,"3524.0,3403.0,3409.0,3961.0,3723.0,8856.0,8853..."
185945.0,Radiology,[**2167-2-16**] 7:50 PM\n CHEST (PORTABLE AP) ...,4240428051881425442611997142731423999811,"3524.0,3403.0,3409.0,3961.0,3723.0,8856.0,8853..."
185945.0,Radiology,[**2167-2-17**] 2:24 AM\n CHEST (PORTABLE AP) ...,4240428051881425442611997142731423999811,"3524.0,3403.0,3409.0,3961.0,3723.0,8856.0,8853..."
...,...,...,...,...
104049.0,Nursing/other,1. FEN\nTF= min140cc/k/d of BM/E24. min47cc q4...,"V3000,7731,76517,76527,V290,V053,V502","640.0,9983.0,9955.0"
104049.0,Nursing/other,I have examined pt. & agree w/ [**First Name8 ...,"V3000,7731,76517,76527,V290,V053,V502","640.0,9983.0,9955.0"
104049.0,Nursing/other,NPN nights\n\n\nFluids/Nutrition: Weight 2025...,"V3000,7731,76517,76527,V290,V053,V502","640.0,9983.0,9955.0"
104049.0,Nursing/other,Attending Note\nDay of life 12 PMA 35 [**2-7**...,"V3000,7731,76517,76527,V290,V053,V502","640.0,9983.0,9955.0"


In [27]:
sample = merged_df.sample(n=20000)


In [28]:
sample.to_csv('sample_20k.csv')


In [29]:
sample.columns

Index(['category', 'text', 'DIAG_CODES', 'PROC_CODES'], dtype='object')

In [30]:
sample

Unnamed: 0_level_0,category,text,DIAG_CODES,PROC_CODES
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
154527.0,Nursing/other,NPN (1500-2300)\n\n\n2. F/N: Weight up45 gm t...,"V3001,7742,7793,76518,76527,7706,V290,V502,V053","9390.0,9983.0,640.0,9955.0"
104207.0,Nursing/other,0700-[**2084**] NPN\n\n\nRESP: In RA. RR 30'...,"V3001,769,7742,76514,76525,V290,V053,77082,7766","9955.0,9390.0,9983.0,8871.0,9915.0"
157915.0,Nursing/other,NURSING UPDATE\nCV:\n CVL PLACEMENT CONFIRM...,"V596,4019",503.0
104337.0,Radiology,[**2181-11-12**] 1:36 PM\n CHEST (PORTABLE AP)...,"44022,43491,78039,5180,2375,2449,2724,V1251,51...","3929.0,331.0,8872.0,8842.0,8848.0"
152912.0,Nursing/other,Respiratory Care\nPt. treated with Atrovent vi...,"4168,4280,5715,42731,2867,2800,99662,7907,5849...","3721.0,4513.0,8872.0,3893.0,3491.0,9904.0,9907.0"
...,...,...,...,...
143937.0,Radiology,[**2188-5-16**] 1:43 AM\n CHEST (PORTABLE AP);...,"41401,9971,42731,45829,4019,2724,1748,2449,790...","3615.0,3722.0,3614.0,3761.0,8960.0,3961.0,8856..."
107071.0,Nursing/other,Respiratory Care\nBaby remains on cpap 6 21%.B...,"V3101,769,7742,7793,7757,76524,76514,7470,7455","9604.0,9671.0,9390.0,3891.0,3893.0,9915.0,966...."
181879.0,Radiology,[**2115-5-8**] 7:02 PM\n CHEST (PORTABLE AP) ...,"56731,56738,7907,51881,42830,4280,4241,42731,2...","5491.0,8872.0,9390.0,3893.0"
120462.0,Nursing/other,Neonatology Attending Progress note:\nx-30 [**...,"V3001,7742,77211,76515,76525,7706,7424,V290","9390.0,9983.0,966.0"


**Codes To Dictionary**

In [31]:
sample_ids = sample.index


In [32]:
flt_diag = diagnoses_icd[diagnoses_icd['hadm_id'].isin(sample_ids)]
flt_proc = procedures_icd[procedures_icd['hadm_id'].isin(sample_ids)]

In [34]:
diag_keep = flt_diag['icd9_code'].value_counts()[:300]
proc_keep = flt_proc['icd9_code'].value_counts()[:100]


In [35]:
diag2idx, idx2diag = {},{}
for d in diag_keep.index:
    if d not in diag2idx:
        idx2diag[len(idx2diag)] = d
        diag2idx[d] = len(diag2idx)

proc2idx, idx2proc = {},{}
for p in proc_keep.index:
    if p not in proc2idx:
        idx2proc[len(idx2proc)] = p
        proc2idx[p] = len(proc2idx)

In [36]:
with open('diag2idx.pickle','wb') as f:
    pickle.dump(diag2idx,f,pickle.HIGHEST_PROTOCOL)
with open('idx2diag.pickle','wb') as f:
    pickle.dump(idx2diag,f,pickle.HIGHEST_PROTOCOL)
with open('proc2idx.pickle','wb') as f:
    pickle.dump(proc2idx,f,pickle.HIGHEST_PROTOCOL)
with open('idx2proc.pickle','wb') as f:
    pickle.dump(idx2proc,f,pickle.HIGHEST_PROTOCOL)

Convert CODE TO LIST

```
# This is formatted as code
```



In [37]:

def diag_code2idx(org_lst):
    coded_lst = []
    for c in org_lst.split(','):
        if c in diag2idx:
            coded_lst.append(diag2idx[c])
    return coded_lst

In [38]:
def proc_code2idx(org_lst):
    coded_lst = []
    for c in org_lst.split(','):
        c_ = int(str(c).split('.')[0])
        if c_ in proc2idx:
            coded_lst.append(proc2idx[c_])

    return coded_lst

In [39]:
sample['CODED_DIAG'] = sample['DIAG_CODES'].apply(diag_code2idx)
sample['CODED_PROC'] = sample['PROC_CODES'].apply(proc_code2idx)

In [40]:
sample

Unnamed: 0_level_0,category,text,DIAG_CODES,PROC_CODES,CODED_DIAG,CODED_PROC
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
154527.0,Nursing/other,NPN (1500-2300)\n\n\n2. F/N: Weight up45 gm t...,"V3001,7742,7793,76518,76527,7706,V290,V502,V053","9390.0,9983.0,640.0,9955.0","[10, 4, 14, 32, 23, 49, 2, 24, 5]","[9, 8, 17, 7]"
104207.0,Nursing/other,0700-[**2084**] NPN\n\n\nRESP: In RA. RR 30'...,"V3001,769,7742,76514,76525,V290,V053,77082,7766","9955.0,9390.0,9983.0,8871.0,9915.0","[10, 9, 4, 85, 66, 2, 5, 144, 39]","[7, 9, 8, 5]"
157915.0,Nursing/other,NURSING UPDATE\nCV:\n CVL PLACEMENT CONFIRM...,"V596,4019",503.0,[0],[]
104337.0,Radiology,[**2181-11-12**] 1:36 PM\n CHEST (PORTABLE AP)...,"44022,43491,78039,5180,2375,2449,2724,V1251,51...","3929.0,331.0,8872.0,8842.0,8848.0","[124, 44, 48, 34, 58, 245, 195]","[16, 14, 67, 66]"
152912.0,Nursing/other,Respiratory Care\nPt. treated with Atrovent vi...,"4168,4280,5715,42731,2867,2800,99662,7907,5849...","3721.0,4513.0,8872.0,3893.0,3491.0,9904.0,9907.0","[108, 1, 121, 3, 138, 104, 50, 51, 8, 72, 172,...","[72, 25, 14, 1, 22, 4, 18]"
...,...,...,...,...,...,...
143937.0,Radiology,[**2188-5-16**] 1:43 AM\n CHEST (PORTABLE AP);...,"41401,9971,42731,45829,4019,2724,1748,2449,790...","3615.0,3722.0,3614.0,3761.0,8960.0,3961.0,8856...","[7, 42, 3, 110, 0, 58, 34, 261]","[21, 28, 39, 11, 15, 1, 4, 30]"
107071.0,Nursing/other,Respiratory Care\nBaby remains on cpap 6 21%.B...,"V3101,769,7742,7793,7757,76524,76514,7470,7455","9604.0,9671.0,9390.0,3891.0,3893.0,9915.0,966....","[19, 9, 4, 14, 198, 99, 85, 45, 84]","[2, 6, 9, 10, 1, 5, 0, 8, 7]"
181879.0,Radiology,[**2115-5-8**] 7:02 PM\n CHEST (PORTABLE AP) ...,"56731,56738,7907,51881,42830,4280,4241,42731,2...","5491.0,8872.0,9390.0,3893.0","[51, 6, 111, 1, 69, 3, 52, 126, 12, 0]","[29, 14, 9, 1]"
120462.0,Nursing/other,Neonatology Attending Progress note:\nx-30 [**...,"V3001,7742,77211,76515,76525,7706,7424,V290","9390.0,9983.0,966.0","[10, 4, 170, 64, 66, 49, 2]","[9, 8, 0]"


ROUTINE PROCESSING

In [41]:
def remove_stopwords(text):
        stop_words = set(stopwords.words("english"))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]
        return filtered_text

def preprocess(note):
    note = note.replace('\n',' ')
    note = note.replace('w/', 'with')
    note = note.lower() #lower case
    note = re.sub(r'\d+', '', note) #remove numbers
    note = note.translate(str.maketrans('', '', string.punctuation)) #remove punctuation
    note = " ".join(note.split())
    note = remove_stopwords(note)
    return note

In [42]:
sample_1k_removed['NOTE'] = sample_1k_removed['TEXT'].apply(preprocess)
sample_10k_removed['NOTE'] = sample_10k_removed['TEXT'].apply(preprocess)
merged_df_removed['NOTE'] = merged_df_removed['TEXT'].apply(preprocess)

NameError: name 'sample_1k_removed' is not defined