In [11]:
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from torch.utils.data.dataset import Dataset
import os
import torch
import torch.nn as nn
import pytorch_pretrained_bert as Bert
import sklearn.metrics as skm
import math
from torch.utils.data.dataset import Dataset
import random
import numpy as np
import torch
import time
import transformers

In [12]:
#Data stuff
diagnoses_file_path = r'mimic_data/diagnoses_icd.csv.gz'
map_file_path = r'TransformerEHR/data/physionet.org/files/mimiciii-demo/1.4/D_ICD_DIAGNOSES.csv'


diagnoses_df = pd.read_csv(diagnoses_file_path)
print(diagnoses_df.columns)
map_df = pd.read_csv(map_file_path)

#list of patient id's that have been diagnosed with something
#make everything sequential and not patient_id key based
patient_ids = diagnoses_df['subject_id'].unique().tolist()

#2d array where each nested list is the hadm_id for each visit
visits = diagnoses_df.groupby('subject_id')['hadm_id'].apply(lambda x: list(set(x))).tolist()

#3d array contains a list of visits with respective ICD9 code per visit
patient_visits = (
    diagnoses_df.groupby(['subject_id', 'hadm_id'])['icd_code'].apply(list).groupby(level=0).apply(list).tolist()
)

#dict of {icd9_code : short_title}
#not all icd9_codes which are present in DIAGNOSES_ICD.csv are present in D_ICD_DIAGNOSES.csv, so not all codes will have a title
icd9_to_title = pd.Series(map_df['short_title'].values, index=map_df['icd9_code']).to_dict()

print("Patient ID:", patient_ids[53])
print("num of visits for patient: " , len(visits[53]))
for visit in range(len(visits[53])):
    print(f"\t{visit}-th visit id:", visits[53][visit])
    print(f"\t{visit}-th visit diagnosis codes:", patient_visits[53][visit])
    print(f"\t{visit}-th visit diagnosis short titles:",
[icd9_to_title.get(label, label) for label in patient_visits[53][visit]])


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'], dtype='object')
Patient ID: 10002769
num of visits for patient:  2
	0-th visit id: 28314592
	0-th visit diagnosis codes: ['45342', '70713', '45981', '4019', '2724', 'V1251']
	0-th visit diagnosis short titles: ['Ac DVT/emb distl low ext', 'Ulcer of ankle', 'Venous insufficiency NOS', 'Hypertension NOS', 'Hyperlipidemia NEC/NOS', 'Hx-ven thrombosis/embols']
	1-th visit id: 25681387
	1-th visit diagnosis codes: ['45981', '70713', '4019', '2720', 'V1251', '4928', 'V1582', 'V113']
	1-th visit diagnosis short titles: ['Venous insufficiency NOS', 'Ulcer of ankle', 'Hypertension NOS', 'Pure hypercholesterolem', 'Hx-ven thrombosis/embols', 'Emphysema NEC', 'History of tobacco use', 'Hx of alcoholism']


In [13]:
#Descriptive Statistics

#Total rows with icd 9/10
count_icd_version_10 = (diagnoses_df['icd_version'] == 10).sum()
count_icd_version_9 = (diagnoses_df['icd_version'] == 9).sum()

print("Number of rows with icd_version = 10:", count_icd_version_10)
print("Number of rows with icd_version = 9:", count_icd_version_9)

#Num of unique ICD 9/10 codes
unique_icd9_codes = diagnoses_df[diagnoses_df['icd_version'] == 9]['icd_code'].nunique()
unique_icd10_codes = diagnoses_df[diagnoses_df['icd_version'] == 10]['icd_code'].nunique()

print("Number of unique ICD-9 codes:", unique_icd9_codes)
print("Number of unique ICD-10 codes:", unique_icd10_codes)

#num patients with atleast 1 ICD 10 code
icd10_df = diagnoses_df[diagnoses_df['icd_version'] == 9]
unique_patients_with_icd10 = icd10_df['subject_id'].unique()
num_patients_with_icd10 = len(unique_patients_with_icd10)
print("Number of patients with at least one ICD-10 code:", num_patients_with_icd10)

#num patients with both ICD 9 / 10 codes
grouped = diagnoses_df.groupby('subject_id')['icd_version'].agg(set)
patients_with_both = grouped[grouped.apply(lambda x: {9, 10}.issubset(x))]
print("Number of patients with both ICD-9 and ICD-10 codes:", len(patients_with_both))

# num pateients with ONLY ICD 10 codes
patient_versions = diagnoses_df.groupby('subject_id')['icd_version'].unique()
patients_with_only_icd10 = patient_versions[patient_versions.apply(lambda x: set(x) == {9})]
num_patients_only_icd10 = len(patients_with_only_icd10)
print("Number of patients with only ICD-10 codes:", num_patients_only_icd10)

Number of rows with icd_version = 10: 1989449
Number of rows with icd_version = 9: 2766877
Number of unique ICD-9 codes: 9072
Number of unique ICD-10 codes: 16757
Number of patients with at least one ICD-10 code: 124550
Number of patients with both ICD-9 and ICD-10 codes: 24123
Number of patients with only ICD-10 codes: 100427


In [88]:
# Total num of usable patients
patient_versions = diagnoses_df.groupby('subject_id')['icd_version'].unique()

patients_with_only_icd9 = patient_versions[patient_versions.apply(lambda x: set(x) == {9})].index

icd9_patients_df = diagnoses_df[diagnoses_df['subject_id'].isin(patients_with_only_icd9)]
visit_counts = icd9_patients_df.groupby('subject_id').size()

patients_more_than_three_visits = visit_counts[visit_counts > 3]
num_patients = len(patients_more_than_three_visits)
print("Number of patients with only ICD-9 codes and more than 3 visits:", num_patients)

Number of patients with only ICD-9 codes and more than 3 visits: 83227


In [109]:
#preparing the label and feature splits 

patients_with_icd10 = diagnoses_df[diagnoses_df['icd_version'] == 10]['subject_id'].unique()

#
icd9_only_df = diagnoses_df[~diagnoses_df['subject_id'].isin(patients_with_icd10)]

#Finds patients with only icd9 codes
icd9_only_df = icd9_only_df[icd9_only_df['icd_version'] == 9]
visit_counts = icd9_only_df.groupby('subject_id')['hadm_id'].nunique()

patients_more_than_three_visits = visit_counts[visit_counts > 3].index

#final DataFrame of patients with only ICD-9 codes and more than three visits
final_df = icd9_only_df[icd9_only_df['subject_id'].isin(patients_more_than_three_visits)]

patient_visits = final_df.groupby(['subject_id', 'hadm_id'])['icd_code'].apply(list).reset_index()
patient_visits = patient_visits.groupby('subject_id')['icd_code'].apply(list)

features = []
labels = []
splits = []

for visits in patient_visits:
    if len(visits) > 3:
        split_index = len(visits) // 2
        splits.append(split_index)
        features.append([visit for visit in visits[:split_index]])
        labels.append([code for sublist in visits[split_index:] for code in sublist])

#output for one set of features and labels
if features and labels:
    print("Split Index: ", splits[:10])
    print("Example Features: ", features[0])
    print("Example Labels: " , labels[0])

Split Index:  [2, 2, 2, 2, 6, 2, 2, 2, 2, 2]
Example Features:  [['5723', '78959', '5715', '07070', '496', '29680', '30981', 'V1582'], ['07071', '78959', '2875', '2761', '496', '5715', 'V08', '3051']]
Example Labels:  ['07054', '78959', 'V462', '5715', '2767', '2761', '496', 'V08', '3051', '78791', '45829', '07044', '7994', '2761', '78959', '2767', '3051', 'V08', 'V4986', 'V462', '496', '29680', '5715']


In [14]:
from transformers import BertTokenizer
class ICDCodeDataset(Dataset):
    def __init__(self, features, labels, tokenizer, max_len=512):
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        #Encode features
        feature_encoded = self.tokenizer.encode_plus(
            ' '.join(self.features[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        #Encode labels
        labels_encoded = self.tokenizer.encode_plus(
            ' '.join(self.labels[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': feature_encoded['input_ids'].flatten(),
            'attention_mask': feature_encoded['attention_mask'].flatten(),
            'labels': labels_encoded['input_ids'].flatten()
        }

#initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#create dataset
dataset = ICDCodeDataset(features, labels, tokenizer)

In [15]:
from torch.utils.data import random_split

#train and test split
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
test_size = total_samples - train_size 


train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print("FULL Dataset: {}".format(len(dataset)))
print("TRAIN Dataset: {}".format(len(train_dataset)))
print("TEST Dataset: {}".format(len(test_dataset)))

FULL Dataset: 27873
TRAIN Dataset: 22298
TEST Dataset: 5575


In [16]:
#dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [18]:
# TODO i need full vocabulary     

In [None]:
model_config = {
    'vocab_size': len(BertVocab['token2idx'].keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(ageVocab.keys()), # number of vocab for age embedding
    'max_position_embedding': global_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.2, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.22, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embedding'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')


In [19]:
class BertEmbeddings(nn.Module):
    def __init__(self, config, feature_dict):
        super(BertEmbeddings, self).__init__()
        self.feature_dict = feature_dict
        
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.posi_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size).\
            from_pretrained(embeddings=self._init_posi_embedding(config.max_position_embeddings, config.hidden_size))

        self.LayerNorm = Bert.modeling.BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, word_ids, posi_ids=None,):

        if posi_ids is None:
            posi_ids = torch.zeros_like(word_ids)

        word_embed = self.word_embeddings(word_ids)
        posi_embeddings = self.posi_embeddings(posi_ids)
        
        embeddings = word_embed
        
        if self.feature_dict['posi']:
            embeddings = embeddings + posi_embeddings
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
    
    def _init_posi_embedding(self, max_position_embedding, hidden_size):
        def even_code(pos, idx):
            return np.sin(pos/(10000**(2*idx/hidden_size)))

        def odd_code(pos, idx):
            return np.cos(pos/(10000**(2*idx/hidden_size)))

        # initialize position embedding table
        lookup_table = np.zeros((max_position_embedding, hidden_size), dtype=np.float32)

        # reset table parameters with hard encoding
        # set even dimension
        for pos in range(max_position_embedding):
            for idx in np.arange(0, hidden_size, step=2):
                lookup_table[pos, idx] = even_code(pos, idx)
        # set odd dimension
        for pos in range(max_position_embedding):
            for idx in np.arange(1, hidden_size, step=2):
                lookup_table[pos, idx] = odd_code(pos, idx)

        return torch.tensor(lookup_table)

In [20]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.embeddings = 

        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        output_0 = self.embeddings(ids, mask, token_type_ids)
        _, output_1= self.l1(output_0, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
model = BERTClass()


NameError: name 'config' is not defined