I followed this tutorial:
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pandas as pd

In [2]:
import pickle

# SparkSession is used below but is not provided by the star import from
# pyspark.sql.types, so import it explicitly to keep the cell runnable on a
# fresh kernel.
from pyspark.sql import SparkSession

# A local Spark master URL has the form "local", "local[N]" or "local[*]";
# "local3" cannot be parsed as a master URL. Use three local worker threads.
conf = SparkConf().setAppName("word2vec_keras").setMaster("local[3]")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local[3]").appName("word2vec_keras").getOrCreate()

# Pickle files are binary: open with "rb" and let `with` close the handle.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open("./data/ICD9CODES.p", "rb") as icd9_file:
    ICD9CODES = pickle.load(icd9_file)

# One row per hospital admission: an id, one 0/1 column per code, and the note text.
df_hadm_top10 = pd.read_csv("./data/DATA_HADM_TOP10.csv", escapechar='\\')
print(df_hadm_top10.head())

       id  4019  2724  25000  4280  41401  53081  51881  42731  5849  5990  \
0  117760     0     0      0     0      0      1      1      0     0     0   
1  129030     1     1      0     0      0      1      0      0     0     0   
2  172040     0     0      0     0      1      0      0      0     1     0   
3  156170     0     0      1     1      0      0      0      1     1     0   
4  199180     0     0      1     1      1      0      0      0     0     0   

                                                text  
0  "Admission Date:  [**2118-12-14**]            ...  
1  Admission Date:  [**2137-8-31**]              ...  
2  Admission Date:  [**2174-1-6**]              D...  
3  Admission Date:  [**2102-6-9**]              D...  
4  Admission Date:  [**2164-7-2**]       Discharg...  


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

# Words to exclude from the word2vec vocabulary: NLTK's English stop words
# followed by the entries of ICD9CODES.
STOPWORDS_WORD2VEC = stopwords.words('english') + ICD9CODES

import re

# Patterns are compiled once and written as raw strings so that backslash
# escapes reach the regex engine unchanged.
_BRACKETED_STAR_RE = re.compile(r'\[\*\*[^\]]*\*\*\]')  # "[** ... **]" spans
_ANGLE_TAG_RE = re.compile(r'<[^>]*>')                  # "<...>" tags
_NON_WORD_RUN_RE = re.compile(r'[\W]+')                 # runs of non-word characters
_SPACE_DIGITS_RE = re.compile(r' \d+')                  # a space followed by digits


def preprocessor_word2vec(text):
    """Normalise one note before tokenisation.

    Steps, applied in order:
      1. remove every ``[** ... **]`` span,
      2. remove every ``<...>`` tag,
      3. lower-case the text and collapse each run of non-word characters
         into a single space,
      4. replace each space followed by one or more digits with a single space.

    Args:
        text: the raw note as a string.

    Returns:
        The cleaned string. It may contain consecutive spaces and a trailing
        space; callers that need tidy whitespace must strip it themselves.
    """
    text = _BRACKETED_STAR_RE.sub('', text)
    text = _ANGLE_TAG_RE.sub('', text)
    text = _NON_WORD_RUN_RE.sub(' ', text.lower())
    text = _SPACE_DIGITS_RE.sub(' ', text)

    return text

# Clean every note with the word2vec preprocessor before tokenising.
texts = df_hadm_top10['text'].apply(preprocessor_word2vec)

# Fit the tokenizer on the cleaned notes, then encode each note as a list of
# integer word ids.
toke = Tokenizer()
toke.fit_on_texts(texts)
sequence = toke.texts_to_sequences(texts)

# word -> id vocabulary built by the tokenizer, and its inverse id -> word
# lookup, e.g. {1: 'the', 2: 'a', ...}.
word_index = toke.word_index
reverse_word_index = {idx: word for word, idx in word_index.items()}
index_list = word_index.values()

print('Found %s unique tokens.' % len(word_index))

# Pad the encoded notes (default pad_sequences arguments) into one 2-D array.
data = pad_sequences(sequence)

#labels = to_categorical(np.asarray(labels))
#print('Shape of data tensor:', data.shape)
#print('Shape of label tensor:', labels.shape)

Found 117458 unique tokens.


In [None]:
import random

def separate(seed, N, train_frac=0.50, val_frac=0.25):
    """Shuffle the positions ``0 .. N-1`` and cut them into train/val/test lists.

    Args:
        seed: seed for the shuffle; the same seed and ``N`` always give the
            same split.
        N: number of samples to split.
        train_frac: fraction of the shuffled positions assigned to training
            (default 0.50).
        val_frac: fraction assigned to validation (default 0.25). Whatever
            remains after training and validation goes to the test set.

    Returns:
        A tuple ``(idx_train, idx_val, idx_test)`` of lists of integer
        positions; together they contain every position exactly once.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to
            more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1")

    idx = list(range(N))
    # A private generator yields the same permutation as
    # `random.seed(seed); random.shuffle(idx)` but leaves the process-wide
    # random state untouched for the rest of the notebook.
    random.Random(seed).shuffle(idx)

    train_end = int(N * train_frac)
    val_end = int(N * (train_frac + val_frac))

    return idx[:train_end], idx[train_end:val_end], idx[val_end:]


# Reproducible split of the row positions (fixed seed).
idx_train, idx_val, idx_test = separate(1234, df_hadm_top10.shape[0])

# Every column except the admission id and the note text is a label column.
labels = [col for col in df_hadm_top10.columns if col not in ('id', 'text')]

# idx_* are row *positions* (they also index the NumPy array `data` below),
# so select rows with .iloc. Label-based .loc only matches positions while
# the frame keeps its default 0..N-1 index.
label_frame = df_hadm_top10[labels]
train_label = label_frame.iloc[idx_train]
val_label = label_frame.iloc[idx_val]
test_label = label_frame.iloc[idx_test]

# Same positions applied to the padded token-id matrix.
train_sequence = data[idx_train]
val_sequence = data[idx_val]
test_sequence = data[idx_test]


In [32]:
# split the data into a training set and a validation set
#indices = np.arange(data.shape[0])
#np.random.shuffle(indices)
#data = data[indices]
#labels = labels[indices]
#nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

#x_train = data[:-nb_validation_samples]
#y_train = labels[:-nb_validation_samples]
#x_val = data[-nb_validation_samples:]
#y_val = labels[-nb_validation_samples:]

Preparing the embedding layer

In [9]:
import numpy as np
import os

# Maps each word in the pre-trained file to its vector (float32 NumPy array).
embeddings_index = {}
# `with` closes the file even if a line fails to parse.
# NOTE(review): if the file was saved in the word2vec text format it may
# begin with a "<vocab_size> <dim>" header line, which this loop would store
# as a one-element vector — confirm against the file.
with open(os.path.join("./data/", 'model_word2vec.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no word to read, so skip it.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 33837 word vectors.


In [10]:
# Drop stop words and ICD9 codes from the pre-trained vocabulary,
# so their rows stay all-zero when we create embedding_matrix.

from nltk.corpus import stopwords

STOPWORDS_WORD2VEC = stopwords.words('english') + ICD9CODES
# Membership is tested once per vocabulary word, so use a set rather than
# scanning the list each time.
# Assumes every entry is hashable (e.g. a string) — TODO confirm for ICD9CODES.
_excluded_words = set(STOPWORDS_WORD2VEC)
# Pre-trained words that are neither stop words nor ICD9 codes.
keys_updated = [word for word in embeddings_index.keys() if word not in _excluded_words]
index2word_set = set(keys_updated)

We leverage our embeddings_index dictionary and our word_index to compute our embedding matrix:

In [12]:
EMBEDDING_DIM = 100  # dimensions of the word2vec model

# One row per tokenizer id plus one extra row (presumably id 0, reserved for
# padding — confirm against the tokenizer). Rows start as zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for word, i in word_index.items():
    if word not in index2word_set:
        # No usable pre-trained vector: the row stays all-zero.
        continue
    embedding_matrix[i] = embeddings_index.get(word)

Finally, we load this embedding matrix into an Embedding layer. Note that we set trainable=False to prevent the weights from being updated during training.

In [13]:
from keras.layers import Embedding

# Maximum sequence length: the number of token ids in the longest encoded note.
MAX_SEQUENCE_LENGTH = max(len(encoded_note) for encoded_note in sequence)

# Embedding layer initialised from embedding_matrix and created with
# trainable=False.
vocab_size = len(word_index) + 1
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [26]:
import numpy as np
# Quick look at the first rows of each label split.
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(train_label.head())
print(val_label.head())
print(test_label.head())

# NOTE(review): a truncated statement `train_label = n` stood here. `n` is not
# defined anywhere, so it raised NameError and would have overwritten the
# training labels. It has been removed; if a NumPy conversion was intended,
# add it explicitly (e.g. via `train_label.values`) — confirm the intent.

# First five rows of each padded token-id matrix.
print(train_sequence[:5, :])
print(val_sequence[:5, :])
print(test_sequence[:5, :])



           id  4019  2724  25000  4280  41401  53081  51881  42731  5849  5990
22653  112108     1     0      0     1      0      0      0      0     0     0
14729  136683     1     0      0     0      0      0      0      0     0     0
17620  187756     0     0      1     1      0      1      0      1     0     0
25598  108901     0     1      0     1      0      0      0      1     0     1
48887  138443     0     0      0     0      0      0      0      0     0     0
           id  4019  2724  25000  4280  41401  53081  51881  42731  5849  5990
2763   103972     0     0      0     0      0      0      0      0     0     0
36538  167599     0     0      0     0      0      0      0      0     0     0
824    144673     0     0      0     0      0      0      1      0     0     0
10865  189107     0     0      0     0      1      0      0      0     0     0
50721  116601     1     0      0     0      0      0      0      0     1     0
           id  4019  2724  25000  4280  41401  53081