In [89]:
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from torch.utils.data.dataset import Dataset
import os
import torch
import torch.nn as nn
import sklearn.metrics as skm
import math
from torch.utils.data.dataset import Dataset
import random
import numpy as np
import torch
import time
import transformers
import json
import pytorch_pretrained_bert as Bert

ModuleNotFoundError: No module named 'pytorch_pretrained_bert'

In [25]:
#Data stuff
diagnoses_file_path = r'mimic_data/diagnoses_icd.csv.gz'
map_file_path = r'TransformerEHR/data/physionet.org/files/mimiciii-demo/1.4/D_ICD_DIAGNOSES.csv'


diagnoses_df = pd.read_csv(diagnoses_file_path)
print(diagnoses_df.columns)
map_df = pd.read_csv(map_file_path)

#list of patient id's that have been diagnosed with something
#make everything sequential and not patient_id key based
patient_ids = diagnoses_df['subject_id'].unique().tolist()

#2d array where each nested list is the hadm_id for each visit
visits = diagnoses_df.groupby('subject_id')['hadm_id'].apply(lambda x: list(set(x))).tolist()

#3d array contains a list of visits with respective ICD9 code per visit
patient_visits = (
    diagnoses_df.groupby(['subject_id', 'hadm_id'])['icd_code'].apply(list).groupby(level=0).apply(list).tolist()
)

#dict of {icd9_code : short_title}
#not all icd9_codes which are present in DIAGNOSES_ICD.csv are present in D_ICD_DIAGNOSES.csv, so not all codes will have a title
icd9_to_title = pd.Series(map_df['short_title'].values, index=map_df['icd9_code']).to_dict()

print("Patient ID:", patient_ids[53])
print("num of visits for patient: " , len(visits[53]))
for visit in range(len(visits[53])):
    print(f"\t{visit}-th visit id:", visits[53][visit])
    print(f"\t{visit}-th visit diagnosis codes:", patient_visits[53][visit])
    print(f"\t{visit}-th visit diagnosis short titles:",
[icd9_to_title.get(label, label) for label in patient_visits[53][visit]])


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'], dtype='object')
Patient ID: 10002769
num of visits for patient:  2
	0-th visit id: 28314592
	0-th visit diagnosis codes: ['45342', '70713', '45981', '4019', '2724', 'V1251']
	0-th visit diagnosis short titles: ['Ac DVT/emb distl low ext', 'Ulcer of ankle', 'Venous insufficiency NOS', 'Hypertension NOS', 'Hyperlipidemia NEC/NOS', 'Hx-ven thrombosis/embols']
	1-th visit id: 25681387
	1-th visit diagnosis codes: ['45981', '70713', '4019', '2720', 'V1251', '4928', 'V1582', 'V113']
	1-th visit diagnosis short titles: ['Venous insufficiency NOS', 'Ulcer of ankle', 'Hypertension NOS', 'Pure hypercholesterolem', 'Hx-ven thrombosis/embols', 'Emphysema NEC', 'History of tobacco use', 'Hx of alcoholism']


In [27]:
#Descriptive Statistics

#Total rows with icd 9/10
count_icd_version_10 = (diagnoses_df['icd_version'] == 10).sum()
count_icd_version_9 = (diagnoses_df['icd_version'] == 9).sum()

print("Number of rows with icd_version = 10:", count_icd_version_10)
print("Number of rows with icd_version = 9:", count_icd_version_9)

#Num of unique ICD 9/10 codes
unique_icd9_codes = diagnoses_df[diagnoses_df['icd_version'] == 9]['icd_code'].nunique()
unique_icd10_codes = diagnoses_df[diagnoses_df['icd_version'] == 10]['icd_code'].nunique()

print("Number of unique ICD-9 codes:", unique_icd9_codes)
print("Number of unique ICD-10 codes:", unique_icd10_codes)

#num patients with atleast 1 ICD 9 code
icd9_df = diagnoses_df[diagnoses_df['icd_version'] == 9]
unique_patients_with_icd9 = icd9_df['subject_id'].unique()
num_patients_with_icd9 = len(unique_patients_with_icd9)
print("Number of patients with at least one ICD-9 code:", num_patients_with_icd9)

#num patients with both ICD 9 / 10 codes
grouped = diagnoses_df.groupby('subject_id')['icd_version'].agg(set)
patients_with_both = grouped[grouped.apply(lambda x: {9, 10}.issubset(x))]
print("Number of patients with both ICD-9 and ICD-10 codes:", len(patients_with_both))

# num pateients with ONLY ICD 9 codes
patient_versions = diagnoses_df.groupby('subject_id')['icd_version'].unique()
patients_with_only_icd9 = patient_versions[patient_versions.apply(lambda x: set(x) == {9})]
num_patients_only_icd9 = len(patients_with_only_icd9)
print("Number of patients with only ICD-9 codes:", num_patients_only_icd9)

Number of rows with icd_version = 10: 1989449
Number of rows with icd_version = 9: 2766877
Number of unique ICD-9 codes: 9072
Number of unique ICD-10 codes: 16757
Number of patients with at least one ICD-9 code: 124550
Number of patients with both ICD-9 and ICD-10 codes: 24123
Number of patients with only ICD-9 codes: 100427


In [28]:
# Total num of usable patients
patient_versions = diagnoses_df.groupby('subject_id')['icd_version'].unique()

patients_with_only_icd9 = patient_versions[patient_versions.apply(lambda x: set(x) == {9})].index

icd9_patients_df = diagnoses_df[diagnoses_df['subject_id'].isin(patients_with_only_icd9)]
visit_counts = icd9_patients_df.groupby('subject_id')['hadm_id'].nunique()
patients_more_than_three_visits = visit_counts[visit_counts > 3].index
num_patients = len(patients_more_than_three_visits)
print("Number of patients with only ICD-9 codes and more than 3 visits:", num_patients)

Number of patients with only ICD-9 codes and more than 3 visits: 11073


In [137]:
def count_visits_and_codes(patient_visits):
    visit_counts = {}
    codes_per_visit = {}

    for subject_id, visits in patient_visits.items():
        # Count the number of visits for each patient
        visit_counts[subject_id] = len(visits)
        
        # List to store the number of codes per visit for this patient
        codes_per_visit[subject_id] = []

        for visit in visits:
            # Count the number of codes in each visit
            codes_per_visit[subject_id].append(len(visit))

    return visit_counts, codes_per_visit

visit_counts, codes_per_visit = count_visits_and_codes(patient_visits)

# Printing the results
print("Visit Counts per Patient:" , sorted(visit_counts.values()))
#set total max visits = 40
#leave max codes = 39

Visit Counts per Patient: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4

In [133]:
#preparing the label and feature splits 

patients_with_icd10 = diagnoses_df[diagnoses_df['icd_version'] == 10]['subject_id'].unique()

icd9_only_df = diagnoses_df[~diagnoses_df['subject_id'].isin(patients_with_icd10)]

#Finds patients with only icd9 codes
icd9_only_df = icd9_only_df[icd9_only_df['icd_version'] == 9]
visit_counts = icd9_only_df.groupby('subject_id')['hadm_id'].nunique()

patients_more_than_three_visits = visit_counts[visit_counts > 3].index

#final DataFrame of patients with only ICD-9 codes and more than three visits
final_df = icd9_only_df[icd9_only_df['subject_id'].isin(patients_more_than_three_visits)]

patient_visits = final_df.groupby(['subject_id', 'hadm_id'])['icd_code'].apply(list).reset_index()
patient_visits = patient_visits.groupby('subject_id')['icd_code'].apply(list)

#extract all ICD9 codes but only take first 3 digits
map_df['truncated_icd9'] = map_df['icd9_code'].apply(lambda x: x[:3])

unique_truncated_codes = sorted(map_df['truncated_icd9'].unique())
code_to_index = {code: idx for idx, code in enumerate(unique_truncated_codes)}

def encode_labels(codes, code_to_index):
    label_vector = [0] * len(code_to_index)
    for code in codes:
        truncated_code = code[:3]
        if truncated_code in code_to_index:
            label_vector[code_to_index[truncated_code]] = 1
    return label_vector



features = []
labels = []

for subject_id, visits in patient_visits.items():
    if len(visits) > 3:
        split_index = len(visits) // 2

        
        feature_visits = []
        feature_visits.append('CLS')

        
        for sublist in visits[:split_index]:
            visit_codes = [code for code in sublist] 
            visit_codes.append('SEP')
            feature_visits.extend(visit_codes) 

        features.append(feature_visits)

       
        label_codes = [code for sublist in visits[split_index:] for code in sublist]
        labels.append(encode_labels(label_codes, code_to_index))

#output for one set of features and labels
if features and labels:
    print("Example Features: ", features[0])
    print("length Labels: " , len(labels[0]))

Example Features:  ['CLS', '5723', '78959', '5715', '07070', '496', '29680', '30981', 'V1582', 'SEP', '07071', '78959', '2875', '2761', '496', '5715', 'V08', '3051', 'SEP']
length Labels:  1042


In [140]:
#Creates Token Vocabulary

import json
# TODO i need full vocabulary     
truncated_codes = {str(code)[:3] for code in map_df['icd9_code']}
sorted_truncated_codes = sorted(truncated_codes)  # Sort codes

# Define special tokens with a specific order
special_tokens = ['[PAD]', '[CLS]', '[SEP]', '[UNK]', '[MASK]']

# Create dictionary mapping each code to a unique index, starting with special tokens
token2idx = {token: idx for idx, token in enumerate(special_tokens + sorted_truncated_codes)}

# Print the number of unique codes to verify
print("Number of unique truncated codes:", len(token2idx) - len(special_tokens))

# Print token to index mapping
print("Token to Index Mapping:", token2idx)

# Save the token2idx dictionary to a JSON file for later use
with open('token2idx.json', 'w') as f:
    json.dump(token2idx, f)

Number of unique truncated codes: 1042
Token to Index Mapping: {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3, '[MASK]': 4, '001': 5, '002': 6, '003': 7, '004': 8, '005': 9, '006': 10, '007': 11, '008': 12, '009': 13, '010': 14, '011': 15, '012': 16, '013': 17, '014': 18, '015': 19, '016': 20, '017': 21, '018': 22, '020': 23, '021': 24, '022': 25, '023': 26, '024': 27, '025': 28, '026': 29, '027': 30, '030': 31, '031': 32, '032': 33, '033': 34, '034': 35, '035': 36, '036': 37, '037': 38, '038': 39, '039': 40, '040': 41, '041': 42, '042': 43, '045': 44, '046': 45, '047': 46, '048': 47, '049': 48, '050': 49, '051': 50, '052': 51, '053': 52, '054': 53, '055': 54, '056': 55, '057': 56, '058': 57, '059': 58, '060': 59, '061': 60, '062': 61, '063': 62, '064': 63, '065': 64, '066': 65, '070': 66, '071': 67, '072': 68, '073': 69, '074': 70, '075': 71, '076': 72, '077': 73, '078': 74, '079': 75, '080': 76, '081': 77, '082': 78, '083': 79, '084': 80, '085': 81, '086': 82, '087': 83, '088': 84, 

In [161]:
class NextVisit(Dataset):
    def __init__(self, token2idx, labels, patient_visits, max_len):
        self.token2idx = token2idx
        self.labels = labels
        self.patient_visits = patient_visits
        self.max_len = max_len

    def __len__(self):
        return len(self.patient_visits)

    def __getitem__(self, index):
        # Retrieve patient data by index
        patient_id = list(self.patient_visits.keys())[index]
        codes = self.patient_visits[patient_id]
        
        # Initialize sequence with [CLS] token
        sequence = [self.token2idx['[CLS]']]
        
        # Add each code to the sequence and append [SEP] after each visit
        for visit in codes:
            sequence.extend([self.token2idx.get(code, self.token2idx['[UNK]']) for code in visit])
            sequence.append(self.token2idx['[SEP]'])
        
        # Cut or pad the sequence to the maximum length
        if len(sequence) > self.max_len:
            sequence = sequence[:self.max_len]
        else:
            sequence.extend([self.token2idx['[PAD]']] * (self.max_len - len(sequence)))

        # Create position indices (positional embeddings)
        position_indices = list(range(len(sequence)))

        # Create a mask for the sequence
        mask = [1 if token != self.token2idx['[PAD]'] else 0 for token in sequence]

        # Prepare the labels
        label = torch.tensor(self.labels[patient_id], dtype=torch.float)

        return torch.tensor(sequence, dtype=torch.long), torch.tensor(position_indices, dtype=torch.long), torch.tensor(mask, dtype=torch.long), label

In [165]:
#testint stuff out

max_len = 512

patient_id = list(patient_visits.keys())[0]
codes = patient_visits[patient_id]

# Initialize sequence with [CLS] token
sequence = [token2idx['[CLS]']]

# Add each code to the sequence and append [SEP] after each visit
for visit in codes:
    sequence.extend([token2idx.get(code, token2idx['[UNK]']) for code in visit])
    sequence.append(token2idx['[SEP]'])

# Cut or pad the sequence to the maximum length
if len(sequence) > max_len:
    sequence = sequence[:max_len]
else:
    sequence.extend([token2idx['[PAD]']] * (max_len - len(sequence)))

# Create position indices (positional embeddings)
position_indices = list(range(len(sequence)))

mask = [1 if token != token2idx['[PAD]'] else 0 for token in sequence]
print(sequence)

'''
A bunch of 3 ([UNK]) because patient_visits includes full ICD Codes but token2idx has truncated version
Need to do version matching before feeding into model.
'''

[1, 3, 3, 3, 3, 459, 3, 3, 3, 2, 3, 3, 3, 3, 459, 3, 964, 3, 2, 3, 3, 3, 3, 3, 3, 459, 964, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 964, 3, 3, 459, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [163]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

full_dataset = NextVisit(token2idx, labels, patient_visits, max_len)

train_idx, test_idx = train_test_split(range(len(full_dataset)), test_size=0.2, random_state=42)

train_dataset = Subset(full_dataset, train_idx)
test_dataset = Subset(full_dataset, test_idx)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False) 

In [None]:
model_config = {
    'vocab_size': len(BertVocab['token2idx'].keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(ageVocab.keys()), # number of vocab for age embedding
    'max_position_embedding': global_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.2, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.22, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embedding'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')


In [19]:
class BertEmbeddings(nn.Module):
    def __init__(self, config, feature_dict):
        super(BertEmbeddings, self).__init__()
        self.feature_dict = feature_dict
        
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.posi_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size).\
            from_pretrained(embeddings=self._init_posi_embedding(config.max_position_embeddings, config.hidden_size))

        self.LayerNorm = Bert.modeling.BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, word_ids, posi_ids=None,):

        if posi_ids is None:
            posi_ids = torch.zeros_like(word_ids)

        word_embed = self.word_embeddings(word_ids)
        posi_embeddings = self.posi_embeddings(posi_ids)
        
        embeddings = word_embed
        
        if self.feature_dict['posi']:
            embeddings = embeddings + posi_embeddings
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
    
    def _init_posi_embedding(self, max_position_embedding, hidden_size):
        def even_code(pos, idx):
            return np.sin(pos/(10000**(2*idx/hidden_size)))

        def odd_code(pos, idx):
            return np.cos(pos/(10000**(2*idx/hidden_size)))

        # initialize position embedding table
        lookup_table = np.zeros((max_position_embedding, hidden_size), dtype=np.float32)

        # reset table parameters with hard encoding
        # set even dimension
        for pos in range(max_position_embedding):
            for idx in np.arange(0, hidden_size, step=2):
                lookup_table[pos, idx] = even_code(pos, idx)
        # set odd dimension
        for pos in range(max_position_embedding):
            for idx in np.arange(1, hidden_size, step=2):
                lookup_table[pos, idx] = odd_code(pos, idx)

        return torch.tensor(lookup_table)

In [20]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.embeddings = 

        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        output_0 = self.embeddings(ids, mask, token_type_ids)
        _, output_1= self.l1(output_0, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
model = BERTClass()


NameError: name 'config' is not defined