In [1]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

[nltk_data] Downloading package punkt to /Users/nhungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
import spacy
import string

In [3]:
raw_data = pd.read_csv('medicaltranscriptions.zip')

In [4]:
raw_data.columns

Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')

In [5]:
top_5_specialties = list(raw_data['medical_specialty'].value_counts()[:5].index)
data = raw_data.loc[raw_data['medical_specialty'].isin(top_5_specialties)]

### Clean Text

In [6]:
# Clean Data
import clean_text


[nltk_data] Downloading package punkt to /Users/nhungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
list(data['transcription'].apply(type).value_counts().index)

[str, float]

**Note** We replace missing transcription with description

In [26]:
data['transcription'].isnull().sum()
data.loc[data['transcription'].apply(type)==float]

16

In [30]:
data['transcription'].fillna(data['description'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [33]:
data.loc[data.description == data.transcription].shape

(16, 6)

In [34]:
data['clean_transcription'] = clean_text.cleanText(data['transcription'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Get Train/Val/Test set

In [53]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x1a29be9550>

In [55]:
train_data, val_data = train_test_split(data, test_size=0.40, random_state=42)
val_data, test_data = train_test_split(data, test_size=0.50, random_state=40)

In [56]:
train_data.index = np.arange(len(train_data))
val_data.index = np.arange(len(val_data))
test_data.index = np.arange(len(test_data))

In [61]:
train_data.shape

(1571, 7)

In [58]:
def getTextTarget(data):
    text = data['clean_transcription']
    target = data['medical_specialty']
    return text, target

In [62]:
train_text, train_target = getTextTarget(train_data)
val_text, val_target = getTextTarget(val_data)
test_text, test_target = getTextTarget(test_data)

In [233]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_target)

LabelEncoder()

In [236]:
train_target = le.transform(train_target)
val_target = le.transform(val_target)
test_target = le.transform(test_target)

### Tokenize

In [79]:
class Tokenizer(object):
    def __init__(self, dataset):
        res = self.tokenize_dataset(dataset)
        self.token_dataset = res[0]
        self.all_tokens = res[1]
    
    def tokenize(self, sentence):
        punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        tokens = word_tokenize(sentence)
        return [token for token in tokens if (token not in punctuations)]
    
    def tokenize_dataset(self, dataset):
        token_dataset = []
        all_tokens = []
        # we are keeping track of all tokens in dataset 
        # in order to create vocabulary later

        for sample in dataset:
            tokens = self.tokenize(sample)
            token_dataset.append(tokens)
            all_tokens += tokens

        return token_dataset, all_tokens

In [81]:
train_tokenizer = Tokenizer(train_text)
train_tokens = train_tokenizer.token_dataset
all_tokens = train_tokenizer.all_tokens

In [84]:
test_tokenizer = Tokenizer(test_text)
test_tokens = test_tokenizer.token_dataset
val_tokenizer = Tokenizer(val_text)
val_tokens = val_tokenizer.token_dataset

In [86]:
len(val_tokens)

1309

In [90]:
data_root = '/Users/nhungle/Desktop/repos/Data-Science-Projects/MedTranscript/data'

In [91]:
import pickle as pkl
pkl.dump(val_tokens, open(os.path.join(data_root, "val_data_tokens.p"), "wb"))
pkl.dump(test_tokens, open(os.path.join(data_root, "test_data_tokens.p"), "wb"))
pkl.dump(train_tokens, open(os.path.join(data_root, "train_data_tokens.p"), "wb"))
pkl.dump(all_tokens, open(os.path.join(data_root, "all_data_tokens.p"), "wb"))


### Dictionary

In [93]:
class Dictionary(object):
    def __init__(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        res = self.buildVocab(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
        # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        self.id2token = res[1]
        # token2id: dictionary where keys represent tokens and corresponding values represent indices
        self.token2id = res[0]
    
    def buildVocab(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        token_counter = Counter(all_tokens)
        vocab, count = zip(*token_counter.most_common(max_vocab_size))
        id2token = list(vocab)
        token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
        id2token = ['<pad>', '<unk>'] + id2token
        token2id['<pad>'] = PAD_IDX 
        token2id['<unk>'] = UNK_IDX
        return token2id, id2token
    
    def __len__(self):
        return len(self.id2token)
 

In [94]:
from collections import Counter

max_vocab_size = 30000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1
corpus = Dictionary(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
id2token = corpus.id2token
token2id = corpus.token2id

In [96]:
corpus.__len__()

30002

In [99]:
#token2id

### Convert token (word) to ids

For each dataset, each sample of tokens (i.e., words) will be represented as index of that word in the dictionary

In [106]:
class Token2IndexDataset(object):
    def __init__(self, tokens_data):
        self.indices_data = self.token2index_dataset(tokens_data)
        
    def token2index_dataset(self, tokens_data):
        indices_data = []
        for tokens in tokens_data:
            index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
            indices_data.append(index_list)
        return indices_data

In [163]:
train_data_indices = Token2IndexDataset(train_tokens).indices_data
val_data_indices= Token2IndexDataset(val_tokens).indices_data
test_data_indices= Token2IndexDataset(test_tokens).indices_data

In [126]:
# idx = 10
# print(train_tokens[idx])
# print(train_data_indices[idx])

In [104]:
# Lets check the dictionary by loading random token from it
import random
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 2103 ; token serum
Token serum; token id 2103


### Embedding

When using deep learning methods on NLP tasks, we usually utilize [word embedding](https://en.wikipedia.org/wiki/Word_embedding). To put it briefly, word embedding represent words, or tokens, in a vocabulary as a distributed numerical vector. There are a lot of methods to obtain a word embedding, with some of the most famous shallow models being Word2Vec, GloVe, and FastText while the deeper models are BERT, RoBERTa, T5. It is not difficult to find a general purpose word embedding trained by one of the aforementioned methods on the Internet that's been trained with a massive amount of data. It is usually a good idea to use these pre-trained embedding to save yourself some time and computing resource.

In [112]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/b3/54/1d7294672110d5c0565cabc4b99ed952ced9a2dc2ca1d59ad1b34303a6de/gensim-3.8.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.7MB)
[K    100% |████████████████████████████████| 24.7MB 1.4MB/s 
[?25hCollecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/5d/13/a2db017db801d0157fdc41814658396e6ae398d06adf69d73df1c8175b5d/smart_open-1.11.1.tar.gz (105kB)
[K    100% |████████████████████████████████| 112kB 10.5MB/s 
Building wheels for collected packages: smart-open
  Running setup.py bdist_wheel for smart-open ... [?25ldone
[?25h  Stored in directory: /Users/nhungle/Library/Caches/pip/wheels/51/88/6c/2bcd305b87c1e62cbcbf419e782ad9fc612cd4bbb71557502d
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.1 smart-open-1.11.1


In [115]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [116]:
_ = glove2word2vec(os.path.join(data_root,'glove.6B.50d.txt'), 'tmp_file')
glove_embedding = KeyedVectors.load_word2vec_format('tmp_file')

**Note** Dictionary (i.e., glove_embedding.vocab) is a dictionary, where keys are words, and values are arrays of position of that key in the 50 dimensional space.

In [125]:
len(glove_embedding['love'])

50

### Find similar words

The word embedding vectors can help us find words with similar meanings. Word similarities can be measured by [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). The function below looks up the most similar words to a given word:

In [121]:
glove_embedding.similar_by_word('love', topn=5)

[('dream', 0.8429608941078186),
 ('life', 0.8403438925743103),
 ('dreams', 0.8398563861846924),
 ('loves', 0.8361135721206665),
 ('me', 0.8351818323135376)]

### Word arithmetic

In [122]:
glove_embedding.similar_by_word(glove_embedding['love'] - 
                                glove_embedding['better'] + 
                                glove_embedding['best'], topn=1)

[('love', 0.8820392489433289)]

In [133]:
class Token2VectorDataset(object):
    def __init__(self, tokens_data, embedding,):
        self.tokens_data = tokens_data
        self.embedding = embedding
        self.UNK_IDX = UNK_IDX
        self.indices_data = self.token2vector_dataset()
        
    def token2vector_dataset(self):
        indices_data = []
        for tokens in self.tokens_data:
            index_list = [self.embedding[token] if token in self.embedding.vocab else UNK_IDX
                          for token in tokens]
            indices_data.append(index_list)
        return indices_data

In [134]:
train_data_vectors = Token2VectorDataset(train_tokens, glove_embedding).indices_data

In [135]:
val_data_vectors = Token2VectorDataset(val_tokens, glove_embedding).indices_data
test_data_vectors = Token2VectorDataset(test_tokens, glove_embedding).indices_data

In [259]:
#train_data_vectors[0]

### Data Loader

In [None]:
lengths = [len(i) for i in train_data_indices]
np.mean(lengths)

In [158]:
MAX_SENTENCE_LENGTH = 600

In [237]:

import numpy as np
import torch
from torch.utils.data import Dataset

class MedTranscriptDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]


In [238]:
# train_dataset = MedTranscriptDataset(train_data_indices, train_target)
# idx = 0
# print("x {};\n y {}".format(train_dataset[idx][0], train_dataset[idx][2]))
# print("len sentence", train_dataset[idx][1])
# print(train_target[idx])

In [243]:
def medtranscript_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
#     batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    #print(data_list)
    #print(length_list)
    #print(label_list)
    return [torch.from_numpy(np.array(data_list)),
            torch.LongTensor(length_list),
            torch.LongTensor(label_list)]

In [249]:
BATCH_SIZE = 32

train_dataset = MedTranscriptDataset(train_data_indices, train_target)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

val_dataset = MedTranscriptDataset(val_data_indices, val_target)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

test_dataset = MedTranscriptDataset(test_data_indices, test_target)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=False)

In [251]:
data, lengths, labels = next(iter(test_loader))
print(data)
print(labels)

tensor([[   94,    70,  1720,  ...,    12,   795,   784],
        [   39, 26780,   725,  ...,     0,     0,     0],
        [   94,    70, 11367,  ...,     0,     0,     0],
        ...,
        [   94,    70,    21,  ...,   212,   870,    12],
        [ 2001,     5,    39,  ...,     0,     0,     0],
        [   39,   651,    18,  ...,    52,     3,    28]])
tensor([4, 4, 4, 4, 4, 2, 0, 0, 4, 4, 1, 0, 4, 0, 1, 1, 4, 4, 3, 2, 0, 3, 3, 0,
        1, 3, 4, 1, 1, 0, 0, 0])


In [260]:
embedded_train_dataset = MedTranscriptDataset(train_data_vectors, train_target)

In [261]:
idx = 0
print("x {};\n y {}".format(embedded_train_dataset[idx][0], embedded_train_dataset[idx][2]))
print("len sentence", embedded_train_dataset[idx][1])
print(train_target[idx])

x [array([ 2.4444e-01, -1.6053e-01,  6.6681e-02,  1.0138e+00,  1.1697e+00,
       -8.3291e-01, -3.1749e-01, -1.4103e-03,  4.6096e-02, -1.6956e+00,
        3.9535e-01,  9.7831e-01, -1.5315e+00,  4.1920e-02,  1.8251e-01,
        4.3828e-01,  1.8746e-02,  4.3894e-01,  5.7750e-02, -7.6984e-02,
        1.9547e-01,  1.2758e-01, -5.6895e-01, -4.8464e-01, -8.1186e-01,
       -2.3198e+00, -2.0311e-01, -4.0964e-01, -1.7205e+00,  1.0985e+00,
        2.2903e+00, -7.2011e-01, -2.7931e-01, -3.6026e-01,  1.3228e-02,
       -4.3625e-01, -2.4718e-01, -4.3086e-01,  7.8072e-01,  2.4224e-01,
       -2.7596e-01,  1.3542e+00, -6.4568e-01, -7.7009e-01,  6.6655e-01,
        2.1232e-02, -1.0171e+00,  7.4007e-01,  1.9035e-01,  1.0739e+00],
      dtype=float32), array([ 0.062616, -0.60173 , -0.67431 ,  0.87347 ,  0.1886  ,  1.1076  ,
        0.20502 ,  0.48316 ,  0.32292 , -0.37977 , -0.24198 , -0.022846,
       -0.35329 , -0.64825 ,  0.9643  , -0.46012 , -1.0898  , -0.4807  ,
        0.7422  , -0.51278 ,  0.479

In [255]:
BATCH_SIZE = 32

embedded_train_dataset = MedTranscriptDataset(train_data_vectors, train_target)
embedded_train_loader = torch.utils.data.DataLoader(dataset=embedded_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

embedded_val_dataset = MedTranscriptDataset(val_data_vectors, val_target)
embedded_val_loader = torch.utils.data.DataLoader(dataset=embedded_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=True)

embedded_test_dataset = MedTranscriptDataset(test_data_vectors, test_target)
embedded_test_loader = torch.utils.data.DataLoader(dataset=embedded_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=medtranscript_collate_func,
                                           shuffle=False)

In [256]:
data, lengths, labels = next(iter(embedded_test_loader))
print(data)
print(labels)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.