In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
#import scikitplot.plotters as skplt
import nltk
from collections import Counter
import time
import tensorflow as tf
from matplotlib.patches import Patch
from matplotlib.markers import MarkerStyle
import seaborn as sns
from sklearn.manifold import TSNE
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import random

In [3]:
import json as json
import pandas as pd

## Get the Data

with open("data/icd2pmid.json", 'r') as f:
    Data_pmid =  json.load(f)

heart_pmid = Data_pmid['heart failure']

with open("data/cvddocs.json", 'r') as f:
    Data_title =  json.load(f)

# creating a list with title and pmid
title = []
for item in Data_title:
    #print(item)
    title.append({"pmid": item['pmid'], "title": item['title']})

# converting list to dataframe
title_df = pd.DataFrame(title)

title_df = title_df[title_df['pmid'].isin(heart_pmid)]
title_df = title_df.reset_index(drop=True)
title_df = title_df.drop_duplicates('pmid', keep = False)

## Data Cleaning

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def clean_text(text):
    #Convert to lower case
    text = text.lower()
    #Lemmatizing the text
    lemma = WordNetLemmatizer()
    normalized = " ".join(lemma.lemmatize(word, pos = "v") for word in text.split())
    #Removing White spaces
    normalized = normalized.replace('\d+', '')
    normalized = normalized.strip()
    #Tokenize and extract words that are alpha-numeric
    tokens = word_tokenize(normalized)
    cleaned = [word for word in tokens if word.isalpha()]
    #Create a dictionary of stem-words such as "at" and 
    #"the" that don't contribute to meaning and remove them from the list
    stop_words = set(stopwords.words('english'))
    words = [w for w in cleaned if not w in stop_words]
    #Remove punctuations
    #exclude = set(string.punctuation)
    #punc_free = [ch for ch in stop_words if ch not in exclude]
    return words

title_df["title"] = [clean_text(text) for text in title_df["title"]]
title_df.head()

Unnamed: 0,pmid,title
48470,10694616,"[result, revascularization, patients, severe, ..."
48471,25746522,"[diastolic, dysfunction]"
48473,3963944,"[clinical, experience, timolol, maleate, monot..."
48477,29524314,"[semaphorin, level, heart, failure, patients, ..."
48478,12008175,"[effect, perindopril, aldosterone, production,..."


In [4]:
import itertools
from collections import Counter
title_list = itertools.chain(title_df['title'])
title_list = list(title_list)
flattened = [val for sublist in title_list for val in sublist]
counts = Counter(flattened)
df_counts = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df_counts.columns = ['word', 'count']
df_counts = df_counts.sort_values(['count'], ascending=[False])
df_counts = df_counts.reset_index(drop=True)
df_counts.head(10)

Unnamed: 0,word,count
0,heart,30633
1,failure,27044
2,patients,13845
3,cardiac,7206
4,chronic,6328
5,ventricular,5479
6,leave,4384
7,effect,3863
8,study,3848
9,disease,3736


In [5]:
df_counts['rel_freq'] = df_counts['count']/sum(df_counts['count'])

In [6]:
keep_words = df_counts[df_counts['count'] > 100]
keep_words = keep_words['word']

In [7]:
keep_words = list(keep_words)
def list_comp(text):
    return [x for x in text if x in keep_words]

title_df["title"] = [list_comp(text) for text in title_df["title"]]
title_df.reset_index(inplace = True) 
title_df.head(10)

Unnamed: 0,index,pmid,title
0,48470,10694616,"[result, revascularization, patients, severe, ..."
1,48471,25746522,"[diastolic, dysfunction]"
2,48473,3963944,"[clinical, experience, hypertension]"
3,48477,29524314,"[level, heart, failure, patients, potential, n..."
4,48478,12008175,"[effect, aldosterone, production, fail, human,..."
5,48480,21080863,"[change, blood, pressure, acute, heart, failur..."
6,48481,29202359,"[intravenous, administration, rat, chronic, he..."
7,48486,16555861,"[mechanisms, management, digoxin]"
8,48490,21140063,"[heart, energy, metabolism, role, treatment, h..."
9,48491,12385167,"[status, outcomes, disease, management, progra..."


In [8]:
from w2vec_helper import preprocess,token_lookup,create_lookup_tables

In [9]:
all_txt = " ".join([wd for wd in flattened])
            
words = preprocess(all_txt)
print("Total words: {}".format(len(words)))
print("Unique words: {}".format(len(set(words))))
print(words[:30])

Total words: 461420
Unique words: 4692
['result', 'revascularization', 'patients', 'severe', 'leave', 'ventricular', 'dysfunction', 'diastolic', 'dysfunction', 'clinical', 'experience', 'monotherapy', 'hypertension', 'level', 'heart', 'failure', 'patients', 'potential', 'novel', 'biomarker', 'acute', 'heart', 'failure', 'effect', 'perindopril', 'aldosterone', 'production', 'fail', 'human', 'heart']


In [10]:
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

In [11]:
from collections import Counter

threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]

In [12]:
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index. '''
    
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])
    
    return list(target_words)

In [13]:
def get_batches(words, batch_size, window_size=5):
    ''' Create a generator of word batches as a tuple (inputs, targets) '''
    
    n_batches = len(words)//batch_size
    
    # only full batches
    words = words[:n_batches*batch_size]
    
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y

In [14]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

In [15]:
n_vocab = len(int_to_vocab)
n_embedding = 200 # Number of embedding features 
with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

In [16]:
# Number of negative labels to sample
n_sampled = 100
with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    
    # Calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      labels, embed,
                                      n_sampled, n_vocab)
    
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [17]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [18]:
# If the checkpoints directory doesn't exist:
!mkdir checkpoints

mkdir: cannot create directory ‘checkpoints’: File exists


In [19]:
epochs = 10
batch_size = 1000
window_size = 10

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        batches = get_batches(train_words, batch_size, window_size)
        start = time.time()
        for x, y in batches:
            
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            
            if iteration % 100 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss/100),
                      "{:.4f} sec/batch".format((end-start)/100))
                loss = 0
                start = time.time()
                
            if iteration % 1000 == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
            
            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

Epoch 2/10 Iteration: 100 Avg. Training loss: 4.9495 0.0253 sec/batch
Epoch 4/10 Iteration: 200 Avg. Training loss: 4.8479 0.0015 sec/batch
Epoch 5/10 Iteration: 300 Avg. Training loss: 4.7520 0.0269 sec/batch
Epoch 7/10 Iteration: 400 Avg. Training loss: 4.6756 0.0030 sec/batch
Epoch 8/10 Iteration: 500 Avg. Training loss: 4.5989 0.0283 sec/batch
Epoch 10/10 Iteration: 600 Avg. Training loss: 4.5003 0.0045 sec/batch


In [20]:
with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    embed_mat = sess.run(embedding)

INFO:tensorflow:Restoring parameters from checkpoints/text8.ckpt


In [21]:
print(embed_mat)

[[-0.38370183  0.62634724 -0.01755597 ...  0.15212269 -0.5455985
   0.22186518]
 [-0.3499129  -0.3837383   0.73370355 ...  0.833947    0.14912339
  -0.75480413]
 [ 0.01575112 -0.44451842 -0.19250113 ...  0.00424699 -0.28120068
   0.13040693]
 ...
 [-0.3695202  -0.99509495  0.5127186  ...  0.8638124   0.3259946
  -0.28194782]
 [ 0.78613424 -0.06840792  0.68068725 ...  0.5955178  -0.7683486
  -0.11095252]
 [-0.59153724 -0.62557    -0.19154978 ... -0.01748623  0.07661288
  -0.8053517 ]]


In [22]:
embed_mat.shape

(4692, 200)

In [23]:
embed_mat[4691,]

array([-0.59153724, -0.62557   , -0.19154978,  0.9077609 , -0.4875117 ,
       -0.05857721, -0.7775072 , -0.39937663, -0.9316475 ,  0.75631523,
       -0.09152687, -0.52588445, -0.83428675, -0.56277335,  0.24134283,
        0.48273927,  0.6694711 , -0.07902664, -0.6406285 , -0.7445849 ,
        0.77208906,  0.953192  , -0.24833632,  0.58251107, -0.2996608 ,
       -0.3894344 , -0.5872499 ,  0.90599215, -0.34316316,  0.24182503,
       -0.9237751 ,  0.5776705 ,  0.8524521 ,  0.6696428 , -0.6918856 ,
       -0.45416665, -0.70951474,  0.10644202,  0.42990303,  0.85514194,
        0.44066688, -0.3827289 , -0.02647946,  0.69514537, -0.2562705 ,
        0.43391243, -0.95366085, -0.16764194, -0.96034676, -0.47389036,
       -0.39404026, -0.19322473, -0.23624629, -0.6699079 ,  0.29834384,
        0.48599827, -0.70355326,  0.43779343,  0.32674915,  0.0185207 ,
        0.75192124,  0.62641305, -0.49355423, -0.9114425 ,  0.9456332 ,
       -0.32953522,  0.862535  , -0.5511835 , -0.23035178, -0.66

In [24]:
unique_words =list(set(words))
unique_words

['attend',
 'construct',
 'intravenous',
 'core',
 'tai',
 'triatriatum',
 'address',
 'us',
 'neurons',
 'stable',
 'macrovascular',
 'bradykinin',
 'cardiologists',
 'match',
 'drosophila',
 'agonist',
 'swiss',
 'granulomatosis',
 'interdisciplinary',
 'present',
 'biologic',
 'injuries',
 'ntprobnp',
 'emergency',
 'classic',
 'molecule',
 'class',
 'doppler',
 'neonate',
 'chf',
 'india',
 'contemporary',
 'ligands',
 'vital',
 'actual',
 'cool',
 'timing',
 'italiano',
 'require',
 'isoproterenol',
 'cannabinoid',
 'inducible',
 'combination',
 'intracardiac',
 'evidence',
 'color',
 'mobilize',
 'bicycle',
 'transfer',
 'immunologic',
 'thrombus',
 'adjustments',
 'cyst',
 'g',
 'bioelectrical',
 'soul',
 'furosemide',
 'ectopic',
 'similarities',
 'nervous',
 'attack',
 'propensity',
 'preoperative',
 'get',
 'readmissions',
 'california',
 'wearable',
 'efficient',
 'maneuver',
 'titin',
 'relationships',
 'dipyridamole',
 'oxygen',
 'salt',
 'hospital',
 'dehiscence',
 'orien

In [25]:
index = [unique_words.index(x) for x in title_df['title'][1]]
np.sum(embed_mat[index])

-0.24326706

In [None]:
# k means determine k
distortions = []
K = range(1, 20)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(embed_mat)
    kmeanModel.fit(embed_mat)
    distortions.append(sum(np.min(cdist(embed_mat, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / len(embed_mat))

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(embed_mat)

print("Top terms per cluster:")

order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

## t-SNE for Word2Vec

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
w = set(words)
words_df = pd.DataFrame(list(w), columns=['words']) 
words_df.head()

In [None]:
words_df['cluster'] = model.fit_predict(embed_mat)

In [None]:
sns.set(font_scale=1.8)

In [None]:
from sklearn.manifold import TSNE

In [None]:
clrs = []
for row,cluster in words_df[['cluster']].T.iteritems():
    val = np.array(cluster)
    #cluster = max(val)
    for item in val:
        if  item == 0:
            grp =  'navy'
        elif item == 1:
            grp =  'green'
        elif item == 2:
            grp = 'firebrick'
        elif item == 3:
            grp = 'mediumslateblue'
        elif item == 4:
            grp = 'darkgoldenrod'
        elif item == 5:
            grp = 'deepskyblue'
        elif item == 6:
            grp = 'red'
        elif item == 7:
            grp = 'yellowgreen'
        elif item == 8:
            grp = 'yellow'
        elif item == 9:
            grp = 'violet'
    clrs.append(grp)

In [None]:
data = np.array(embed_mat)
Xtsne2d = TSNE(n_components=2).fit_transform(data)
Xtsne2d.shape

In [None]:
x_min, x_max = np.min(Xtsne2d, axis=0), np.max(Xtsne2d, axis=0)
Xtsne2d = (Xtsne2d - x_min) / (x_max - x_min)

In [None]:
Xtsne2d[0:5]

In [None]:
PN = Xtsne2d.shape[0]

In [None]:
X2d =[]
Y2d = []
for i in range(PN):
    X2d.append(Xtsne2d[i][0])
    Y2d.append(Xtsne2d[i][1])

In [None]:
plt.figure(figsize = [12,10])
plt.grid(True)
#plt.axhline(y=0, color='k')
#plt.axvline(x=0, color='k')
plt.title('Two dimensional manifold of ICD 11 Cardiovascular Disease Titles')
plt.xlabel("Dimension 1", fontsize=20)
plt.ylabel("Dimension 2", fontsize=20)
plt.scatter(X2d,Y2d,color = clrs, marker ='.')
#plt.savefig('tSNE2d.pdf')