In [1]:
import torchtext

# Tokenizer: torchtext's 'basic_english' preset
tokenizer = torchtext.data.get_tokenizer('basic_english')

# Load the raw file and tokenize it, treating every line as one record
with open('data/chunk_1.csv', 'r') as f:
    lines = f.read().splitlines()
tokenized_data = [tokenizer(line) for line in lines]

# Vocabulary built over every token seen in the file
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_data)

# Replace each token by its vocabulary index, keeping one list per record
sequences = [
    [vocab[token] for token in token_sequence]
    for token_sequence in tokenized_data
]

In [2]:
# Show the index-encoded form of (at most) the first five records
for position in range(min(5, len(sequences))):
    print(sequences[position])

[34, 40, 30, 40346, 32, 1, 12, 0, 45, 38, 41, 37, 21, 47, 20, 0, 46, 0, 43, 4, 19, 16, 10, 7, 4, 2, 9, 8, 3, 0, 48, 90, 10, 7, 4, 2, 9, 8, 3, 85, 10, 7, 4, 2, 9, 8, 3, 87, 4, 62, 64, 15, 65, 91, 0, 63, 35, 11, 31, 42, 24, 1, 12, 0, 44, 23, 39, 27, 1, 5, 0, 6, 25, 89, 26, 16, 10, 7, 4, 2, 9, 8, 3, 28, 16, 10, 7, 4, 2, 9, 8, 3, 36, 14306, 29, 93450, 33, 774, 395, 463, 13, 73, 1907, 395, 463, 13, 22, 54155, 67, 73902, 51, 1, 5, 0, 6, 49, 88, 50, 69, 68, 2, 14, 3, 52, 69, 68, 2, 14, 3, 54, 11, 53, 280, 76, 80, 99, 114, 284, 57, 1, 5, 0, 6, 55, 118, 56, 75, 74, 94, 95, 2, 14, 3, 58, 75, 74, 94, 95, 2, 14, 3, 60, 11, 59, 280, 76, 80, 99, 114, 284]
[34, 40, 30, 36121, 32, 1, 12, 0, 45, 38, 41, 37, 21, 47, 20, 0, 46, 0, 43, 4, 19, 16, 10, 7, 4, 2, 9, 8, 3, 0, 48, 90, 10, 7, 4, 2, 9, 8, 3, 85, 10, 7, 4, 2, 9, 8, 3, 87, 4, 62, 64, 15, 65, 112, 91, 0, 63, 35, 11, 31, 42, 24, 1, 12, 0, 44, 23, 39, 27, 1, 5, 0, 6, 25, 89, 26, 16, 10, 7, 4, 2, 9, 8, 3, 28, 16, 10, 7, 4, 2, 9, 8, 3, 36, 14305, 29, 13

In [3]:
# Print the first 5 sequences as tokens.
# The index-to-token list is fetched once up front: calling vocab.get_itos()
# inside the comprehension rebuilt the full vocabulary list for every token.
index_to_token = vocab.get_itos()
for sequence in sequences[:5]:
    print([index_to_token[index] for index in sequence])

['[resourcetype]', 'careplan', '[id]', '76d3b4b2-398f-4373-4957-599959c26e87', '[meta][profile][0]', 'http', '//hl7', '.', 'org/fhir/us/core/structuredefinition/us-core-careplan', '[text][status]', 'generated', '[text][div]', '<div', 'xmlns=http', '//www', '.', 'w3', '.', 'org/1999/xhtml>care', 'plan', 'for', 'infectious', 'disease', 'care', 'plan', '(', 'record', 'artifact', ')', '.', '<br/>activities', '<ul><li>infectious', 'disease', 'care', 'plan', '(', 'record', 'artifact', ')', '</li><li>infectious', 'disease', 'care', 'plan', '(', 'record', 'artifact', ')', '</li></ul><br/>care', 'plan', 'is', 'meant', 'to', 'treat', 'covid-19', '.', '</div>', '[status]', 'completed', '[intent]', 'order', '[category][0][coding][0][system]', 'http', '//hl7', '.', 'org/fhir/us/core/codesystem/careplan-category', '[category][0][coding][0][code]', 'assess-plan', '[category][1][coding][0][system]', 'http', '//snomed', '.', 'info/sct', '[category][1][coding][0][code]', '736376001', '[category][1][codi

In [4]:
import os
import torch
from torch import nn
from torch.utils.data import random_split, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device('cpu') unconditionally.)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Model dimensions
embedding_dim, hidden_dim = 64, 128

# Training schedule
num_epochs, batch_size = 1, 16

# Vocabulary size is derived from the vocabulary built earlier and echoed for reference
vocab_size = len(vocab)
print(vocab_size)

105506


In [6]:
# Define the model
class RNNModel(nn.Module):
    """Next-token model: embedding -> nn.RNN (batch_first) -> linear projection.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits per position.
        embedding_dim: size of each token embedding (RNN input size).
        hidden_dim: size of the RNN hidden state (input size of the
            projection layer).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Sub-module names (embedding / rnn / fc) are part of the state_dict
        # keys used by saved checkpoints, so they must not be renamed.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return per-position logits over the vocabulary.

        Args:
            x: integer tensor of token indices; the training code passes
                batches shaped (batch, seq_len).

        Returns:
            Tensor whose last dimension has size ``vocab_size``. The final
            RNN hidden state is discarded.
        """
        embedded = self.embedding(x)
        hidden_states = self.rnn(embedded)[0]
        return self.fc(hidden_states)

def pad_collate(batch, target_padding_value=-100):
    """Collate variable-length (input, target) pairs into two padded batches.

    Args:
        batch: iterable of ``(input_tensor, target_tensor)`` pairs, each a
            1-D tensor of token indices.
        target_padding_value: fill value for padded target positions.
            Defaults to -100, the default ``ignore_index`` of
            ``nn.CrossEntropyLoss``, so padded positions do not contribute
            to the loss.

    Returns:
        ``(sequences_pad, targets_pad)``, both shaped (batch, max_len).
    """
    sequences, targets = zip(*batch)
    # Inputs keep 0 as filler: any valid index works because the matching
    # target positions are ignored by the loss.
    sequences_pad = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Targets must NOT be padded with 0: index 0 is a real vocabulary token
    # ('.'), and padding with it trains the model to emit '.' after every
    # sequence.
    targets_pad = pad_sequence(targets, batch_first=True, padding_value=target_padding_value)
    return sequences_pad, targets_pad

# Build the network and place its parameters on the selected device
model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
).to(device)

In [7]:
# Set the loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Resume model and optimizer state from a previously saved checkpoint, if any.
# map_location makes a checkpoint written on a GPU loadable on a CPU-only
# machine (and vice versa).
checkpoint_path = 'saved_models/checkpoint2.pth'
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Convert the sequences to index tensors on the target device.
# as_tensor accepts both lists and tensors, so re-running this cell is safe,
# and the explicit dtype keeps empty sequences from becoming float tensors.
sequences = [
    torch.as_tensor(sequence, dtype=torch.long, device=device)
    for sequence in sequences
]

# Next-token prediction: the input is the sequence without its last token,
# the target is the same sequence shifted left by one. Sequences with fewer
# than two tokens yield no (input, target) pair and are skipped.
trainable_sequences = [sequence for sequence in sequences if len(sequence) >= 2]
inputs = [sequence[:-1] for sequence in trainable_sequences]
targets = [sequence[1:] for sequence in trainable_sequences]

# Combine the inputs and targets into a single dataset
dataset = list(zip(inputs, targets))

# Split the dataset into a training set (80%) and a validation set (20%).
# The split is seeded: weights are resumed from a checkpoint, so a different
# split on every run would leak earlier training samples into validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Longest input sequence in the training split (0 if the split is empty)
sequence_length = max((len(sequence) for sequence, _ in train_dataset), default=0)
# Set sequence length
#sequence_length = 100
# Pad or truncate all sequences to this length  FOR USE ONLY WITH GAN
#train_dataset = [sequence[:sequence_length] for sequence in train_dataset]


# Create data loaders for the training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=pad_collate)

In [None]:
# Train the model
for epoch in range(num_epochs):
    # Make sure the model is in training mode (validation below switches it to eval)
    model.train()
    for i, (sequence, target) in enumerate(train_loader):
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(sequence)

        # Flatten so every position is one classification example
        output = output.view(-1, output.shape[-1])  # shape: (sequence_length * batch_size, num_classes)
        target = target.view(-1)  # shape: (sequence_length * batch_size,)

        # Compute the loss
        loss = loss_function(output, target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        print("Step: " + str(i) + " loss: " + str(loss.item()))

    # Note: this reports the loss of the last training batch, not an epoch average
    print(f'Epoch {epoch+1}, Training Loss: {loss.item()}')

    # Validate the model
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for sequence, target in val_loader:
            output = model(sequence)

            # Flatten exactly as in the training step
            output = output.view(-1, output.shape[-1])  # shape: (sequence_length * batch_size, num_classes)
            target = target.view(-1)  # shape: (sequence_length * batch_size,)

            loss = loss_function(output, target)
            val_loss += loss.item()

    # val_loss is a plain Python float (sum of .item() values), so it must not
    # be followed by .item(); guard against an empty validation loader too.
    num_val_batches = len(val_loader)
    mean_val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
    print(f'Epoch {epoch+1}, Validation Loss: {mean_val_loss}')

#The training loop should execute below

In [11]:
import os

saved_model_dir = "./saved_models/"

# Create the checkpoint directory if needed. exist_ok=True replaces the
# separate exists() check, so there is no window between checking and creating.
os.makedirs(saved_model_dir, exist_ok=True)


In [12]:

# Persist the model weights together with the optimizer state; the same file
# is read back by the checkpoint-loading code earlier in the notebook.
checkpoint_state = dict(
    model_state_dict=model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
)
torch.save(checkpoint_state, 'saved_models/checkpoint2.pth')

In [28]:
n_sentences = 10
sentence_len = 100
sentence = '[resourcetype]'
last_word = sentence

# Fetch the index-to-token list once instead of rebuilding it for every token
index_to_token = vocab.get_itos()

# Inference only: switch to eval mode and skip autograd bookkeeping
model.eval()
with torch.no_grad():
    # NOTE: `sentence` is deliberately not reset between outer iterations, so
    # each printed line is the previous one extended by `sentence_len` tokens.
    for _ in range(n_sentences):
        for _ in range(sentence_len):
            # Build the input on the same device as the model; a CPU tensor
            # fails when the model lives on the GPU.
            token = torch.tensor([vocab[last_word]], device=device)
            # Each step sees only the previous token (no hidden state is carried over)
            pred = model(token)
            last_word = index_to_token[pred.argmax(dim=1)[0].item()]
            sentence += " " + last_word
        print(sentence)

[resourcetype] careplan [id] 65f7e062-3c12-feef-6e60-83841775b37a patient/01a0bddd-16ab-9923-ca26-f7e6efe10dc3 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
[resourcetype] careplan [id] 65f7e062-3c12-feef-6e60-83841775b37a patient/01a0bddd-16ab-9923-ca26-f7e6efe10dc3 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
[resourcetype] careplan [id] 65f7e062-3c12-feef-6e60-83841775b37a patient/01a0bddd-16ab-9923-ca26-f7e6efe10dc3 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .