In [1]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

[nltk_data] Downloading package punkt to /Users/nhungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
import spacy
import string

In [3]:
raw_data = pd.read_csv('medicaltranscriptions.zip')

In [4]:
raw_data.columns

Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')

In [5]:
top_5_specialties = list(raw_data['medical_specialty'].value_counts()[:5].index)
data = raw_data.loc[raw_data['medical_specialty'].isin(top_5_specialties)]

### Clean Text

In [6]:
# Clean Data
import clean_text


[nltk_data] Downloading package punkt to /Users/nhungle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
list(data['transcription'].apply(type).value_counts().index)

[str, float]

**Note** We replace missing transcription with description

In [26]:
data['transcription'].isnull().sum()
data.loc[data['transcription'].apply(type)==float]

16

In [30]:
data['transcription'].fillna(data['description'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [33]:
data.loc[data.description == data.transcription].shape

(16, 6)

In [34]:
data['clean_transcription'] = clean_text.cleanText(data['transcription'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Get Train/Val/Test set

In [53]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x1a29be9550>

In [55]:
train_data, val_data = train_test_split(data, test_size=0.40, random_state=42)
val_data, test_data = train_test_split(data, test_size=0.50, random_state=40)

In [56]:
train_data.index = np.arange(len(train_data))
val_data.index = np.arange(len(val_data))
test_data.index = np.arange(len(test_data))

In [61]:
train_data.shape

(1571, 7)

In [58]:
def getTextTarget(data):
    text = data['clean_transcription']
    target = data['medical_specialty']
    return text, target

In [62]:
train_text, train_target = getTextTarget(train_data)
val_text, val_target = getTextTarget(val_data)
test_text, test_target = getTextTarget(test_data)

### Tokenize

In [79]:
class Tokenizer(object):
    def __init__(self, dataset):
        res = self.tokenize_dataset(dataset)
        self.token_dataset = res[0]
        self.all_tokens = res[1]
    
    def tokenize(self, sentence):
        punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        tokens = word_tokenize(sentence)
        return [token for token in tokens if (token not in punctuations)]
    
    def tokenize_dataset(self, dataset):
        token_dataset = []
        all_tokens = []
        # we are keeping track of all tokens in dataset 
        # in order to create vocabulary later

        for sample in dataset:
            tokens = self.tokenize(sample)
            token_dataset.append(tokens)
            all_tokens += tokens

        return token_dataset, all_tokens

In [81]:
train_tokenizer = Tokenizer(train_text)
train_tokens = train_tokenizer.token_dataset
all_tokens = train_tokenizer.all_tokens

In [84]:
test_tokenizer = Tokenizer(test_text)
test_tokens = test_tokenizer.token_dataset
val_tokenizer = Tokenizer(val_text)
val_tokens = val_tokenizer.token_dataset

In [86]:
len(val_tokens)

1309

In [90]:
data_root = '/Users/nhungle/Desktop/repos/Data-Science-Projects/MedTranscript/data'

In [91]:
import pickle as pkl
pkl.dump(val_tokens, open(os.path.join(data_root, "val_data_tokens.p"), "wb"))
pkl.dump(test_tokens, open(os.path.join(data_root, "test_data_tokens.p"), "wb"))
pkl.dump(train_tokens, open(os.path.join(data_root, "train_data_tokens.p"), "wb"))
pkl.dump(all_tokens, open(os.path.join(data_root, "all_data_tokens.p"), "wb"))


### Dictionary

In [93]:
class Dictionary(object):
    def __init__(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        res = self.buildVocab(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
        # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        self.id2token = res[1]
        # token2id: dictionary where keys represent tokens and corresponding values represent indices
        self.token2id = res[0]
    
    def buildVocab(self, all_tokens, max_vocab_size, PAD_IDX, UNK_IDX):
        token_counter = Counter(all_tokens)
        vocab, count = zip(*token_counter.most_common(max_vocab_size))
        id2token = list(vocab)
        token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
        id2token = ['<pad>', '<unk>'] + id2token
        token2id['<pad>'] = PAD_IDX 
        token2id['<unk>'] = UNK_IDX
        return token2id, id2token
    
    def __len__(self):
        return len(self.id2token)
 

In [94]:
from collections import Counter

max_vocab_size = 30000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1
corpus = Dictionary(all_tokens, max_vocab_size, PAD_IDX, UNK_IDX)
id2token = corpus.id2token
token2id = corpus.token2id

In [96]:
corpus.__len__()

30002

In [99]:
#token2id