In [4]:
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from torch.utils.data.dataset import Dataset
import os
import torch
import torch.nn as nn
import pytorch_pretrained_bert as Bert
import sklearn.metrics as skm
import math
from torch.utils.data.dataset import Dataset
import random
import numpy as np
import torch
import time
import transformers

ModuleNotFoundError: No module named 'pytorch_pretrained_bert'

In [50]:
#Data stuff
diagnoses_file_path = r'mimic_data/diagnoses_icd.csv.gz'
map_file_path = r'TransformerEHR/data/physionet.org/files/mimiciii-demo/1.4/D_ICD_DIAGNOSES.csv'


diagnoses_df = pd.read_csv(diagnoses_file_path)
map_df = pd.read_csv(map_file_path)

#list of patient id's that have been diagnosed with something
#make everything sequential and not patient_id key based
patient_ids = diagnoses_df['subject_id'].unique().tolist()

#2d array where each nested list is the hadm_id for each visit
visits = diagnoses_df.groupby('subject_id')['hadm_id'].apply(lambda x: list(set(x))).tolist()

#3d array contains a list of visits with respective ICD9 code per visit
patient_visits = (
    diagnoses_df.groupby(['subject_id', 'hadm_id'])['icd_code'].apply(list).groupby(level=0).apply(list).tolist()
)

#dict of {icd9_code : short_title}
#not all icd9_codes which are present in DIAGNOSES_ICD.csv are present in D_ICD_DIAGNOSES.csv, so not all codes will have a title
icd9_to_title = pd.Series(map_df['short_title'].values, index=map_df['icd9_code']).to_dict()

print("Patient ID:", patient_ids[53])
print("num of visits for patient: " , len(visits[53]))
for visit in range(len(visits[53])):
    print(f"\t{visit}-th visit id:", visits[53][visit])
    print(f"\t{visit}-th visit diagnosis codes:", patient_visits[53][visit])
    print(f"\t{visit}-th visit diagnosis short titles:",
[icd9_to_title.get(label, label) for label in patient_visits[53][visit]])


Patient ID: 10002769
num of visits for patient:  2
	0-th visit id: 28314592
	0-th visit diagnosis codes: ['45342', '70713', '45981', '4019', '2724', 'V1251']
	0-th visit diagnosis short titles: ['Ac DVT/emb distl low ext', 'Ulcer of ankle', 'Venous insufficiency NOS', 'Hypertension NOS', 'Hyperlipidemia NEC/NOS', 'Hx-ven thrombosis/embols']
	1-th visit id: 25681387
	1-th visit diagnosis codes: ['45981', '70713', '4019', '2720', 'V1251', '4928', 'V1582', 'V113']
	1-th visit diagnosis short titles: ['Venous insufficiency NOS', 'Ulcer of ankle', 'Hypertension NOS', 'Pure hypercholesterolem', 'Hx-ven thrombosis/embols', 'Emphysema NEC', 'History of tobacco use', 'Hx of alcoholism']


In [22]:
#filtering out patients that dont have more than 3 visits
num_patients_more_than_three_visits = sum(len(visit_list) > 3 for visit_list in visits)
print(f"Number of patients with more than three visits: {num_patients_more_than_three_visits}")

features = []
labels = []

for visits in patient_visits:
    if len(visits) > 3:
        split_index = len(visits) // 2
        features.append([code for visit in visits[:split_index] for code in visit])
        labels.append([code for visit in visits[split_index:] for code in visit])

#example features and labels
print("Features example:", features[0])
print("Labels example:", labels[0])

Number of patients with more than three visits: 27873
Features example: ['5723', '78959', '5715', '07070', '496', '29680', '30981', 'V1582', '07071', '78959', '2875', '2761', '496', '5715', 'V08', '3051']
Labels example: ['07054', '78959', 'V462', '5715', '2767', '2761', '496', 'V08', '3051', '78791', '45829', '07044', '7994', '2761', '78959', '2767', '3051', 'V08', 'V4986', 'V462', '496', '29680', '5715']


In [23]:
from transformers import BertTokenizer
class ICDCodeDataset(Dataset):
    def __init__(self, features, labels, tokenizer, max_len=512):
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        #Encode features
        feature_encoded = self.tokenizer.encode_plus(
            ' '.join(self.features[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        #Encode labels
        labels_encoded = self.tokenizer.encode_plus(
            ' '.join(self.labels[index]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': feature_encoded['input_ids'].flatten(),
            'attention_mask': feature_encoded['attention_mask'].flatten(),
            'labels': labels_encoded['input_ids'].flatten()
        }

#initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#create dataset
dataset = ICDCodeDataset(features, labels, tokenizer)

In [34]:
from torch.utils.data import random_split

#train and test split
total_samples = len(dataset)
train_size = int(0.8 * total_samples)
test_size = total_samples - train_size 


train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print("FULL Dataset: {}".format(len(dataset)))
print("TRAIN Dataset: {}".format(len(train_dataset)))
print("TEST Dataset: {}".format(len(test_dataset)))

FULL Dataset: 27873
TRAIN Dataset: 22298
TEST Dataset: 5575


In [25]:
#dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [17]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.embeddings = BertEmbeddings(config=config)

        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        output_0 = self.embeddings(ids, mask, token_type_ids)
        _, output_1= self.l1(output_0, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
model = BERTClass()


NameError: name 'BertEmbeddings' is not defined