In [None]:
# Load all required modules
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.layers import SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop words used by sent2vec to drop uninformative tokens.
# Stored as a set: every token is tested for membership, and a set lookup is
# O(1) whereas the list returned by nltk is scanned linearly each time.
stop_words = set(stopwords.words('english'))

In [2]:
# Read the raw conversations and their category labels from disk
train = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Peek at the first five rows to check the columns that were loaded
train.head(n=5)

Unnamed: 0,field,categories,converse
0,0,PRESCRIPTION,patients aware that he needs rov for refill na...
1,1,ASK_A_DOCTOR,mom wants to know if the drugname needs some d...
2,2,ASK_A_DOCTOR,patients to discuss drugname she says she has ...
3,3,MISCELLANEOUS,fyi nortryptline medication patient prescripti...
4,4,MISCELLANEOUS,letter of patient establishment request name s...


In [4]:
# Keep only the rows that actually have conversation text, then report
# how many rows/columns remain.
has_text = train['converse'].notnull()
train = train[has_text]
print(train.shape)

(57244, 3)


In [5]:
# Map each category string to an integer id with scikit-learn's LabelEncoder.
# The fitted encoder is kept so ids can be mapped back to category names.
lbl_enc = preprocessing.LabelEncoder()
category_names = train['categories'].values
y = lbl_enc.fit_transform(category_names)

In [None]:
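# Test - train split (earlier attempt, kept for reference only; superseded by
# the split on train['converse'] in the next cell).
# NOTE: this version passed the label column (train.categories.values) as the
# features, so it must not be re-enabled as-is.
#xtrain, xvalid, ytrain, yvalid = train_test_split(train.categories.values, y,
#                                                  stratify=y,
#                                                  random_state=42,
#                                                  test_size=0.1)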

In [6]:
# Stratified 80/20 split of the conversation text and its encoded labels;
# the fixed random_state makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['converse'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Show the size of the training and validation splits, one per line
print(xtrain.shape, xvalid.shape, sep='\n')

(45795,)
(11449,)


In [8]:
# load the GloVe vectors in a dictionary:
# each line is "<token> <v1> <v2> ...": the token becomes the key and the
# remaining fields are parsed into a float32 vector.
embeddings_index = {}
# `with` guarantees the file handle is released even if parsing a line raises.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline) instead of
            # failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

400000it [03:09, 2106.18it/s]


Found 400000 word vectors.


In [9]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Return a unit-length sentence vector built from word embeddings.

    The input is stringified, lower-cased and tokenized with ``word_tokenize``.
    Stop words and non-alphabetic tokens are dropped, the embeddings of the
    remaining tokens found in ``embeddings_index`` are summed, and the sum is
    divided by its L2 norm.

    Parameters
    ----------
    s : object
        The sentence; converted with ``str(s)`` before processing.

    Returns
    -------
    numpy.ndarray
        The normalized sum of the word vectors, or ``np.zeros(300)`` when no
        token has an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    # Explicit membership test instead of a bare `except:`, which would also
    # hide unrelated errors (and KeyboardInterrupt) raised inside the loop.
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if not vectors:
        return np.zeros(300)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; dividing would yield NaNs that
        # propagate into the scaler and the network.
        return np.zeros(300)
    return v / norm



In [10]:
# Turn every training / validation sentence into its GloVe sentence vector
# (tqdm wraps the inputs to show progress).
xtrain_glove = list(map(sent2vec, tqdm(xtrain)))
xvalid_glove = list(map(sent2vec, tqdm(xvalid)))

100%|███████████████████████████████████| 45795/45795 [02:39<00:00, 287.67it/s]
100%|███████████████████████████████████| 11449/11449 [00:37<00:00, 303.28it/s]


In [11]:
# Convert the lists of per-sentence vectors into numpy arrays
xtrain_glove, xvalid_glove = np.array(xtrain_glove), np.array(xvalid_glove)

In [12]:
# Standardize the features before feeding a neural net. The scaler is fitted
# on the training vectors only and then applied to both splits.
scl = preprocessing.StandardScaler().fit(xtrain_glove)
xtrain_glove_scl = scl.transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [13]:
# we need to binarize the labels for the neural net
# Pin the one-hot width to the number of classes the encoder was fitted on.
# Without num_classes each call infers the width from the largest id present
# in that split, so train and validation could end up with different shapes.
n_classes = len(lbl_enc.classes_)
ytrain_enc = np_utils.to_categorical(ytrain, num_classes=n_classes)
yvalid_enc = np_utils.to_categorical(yvalid, num_classes=n_classes)

In [14]:
import keras.backend as K # This 'K' can be used to create user defined functions in keras

def recall(y_true, y_pred):
    """Custom Keras metric: recall aggregated over everything passed in.

    Arguments:
        y_true - Actual labels
        y_pred - Predicted labels (the model outputs Keras passes to a metric)

    Returns the count of rounded, clipped ``y_true * y_pred`` entries divided
    by the count of rounded, clipped ``y_true`` entries; ``K.epsilon()`` is
    added to the denominator to avoid division by zero.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (actual_positives + K.epsilon())

In [15]:
# create a simple 3 layer sequential neural net
# The output width is taken from the one-hot targets rather than hard-coded,
# so the network cannot fall out of sync with the label encoding.
n_classes = ytrain_enc.shape[1]

model = Sequential()

# hidden block 1: 300-dimensional input -> 300 ReLU units, dropout, batch norm
model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# hidden block 2: 300 ReLU units, dropout, batch norm
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# output: one unit per class, softmax over the classes
model.add(Dense(n_classes))
model.add(Activation('softmax'))

# compile the model; accuracy and the custom recall metric are tracked
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy', recall])

In [18]:
# Train the dense network on the scaled sentence vectors and evaluate it on
# the validation split; verbose=2 prints one summary line per epoch.
model.fit(
    x=xtrain_glove_scl,
    y=ytrain_enc,
    validation_data=(xvalid_glove_scl, yvalid_enc),
    epochs=20,
    batch_size=64,
    verbose=2,
)

Train on 45795 samples, validate on 11449 samples
Epoch 1/20
 - 21s - loss: 0.4296 - acc: 0.8329 - recall: 0.8065 - val_loss: 0.5952 - val_acc: 0.7865 - val_recall: 0.7589
Epoch 2/20
 - 21s - loss: 0.4198 - acc: 0.8366 - recall: 0.8116 - val_loss: 0.5901 - val_acc: 0.7874 - val_recall: 0.7614
Epoch 3/20
 - 21s - loss: 0.4114 - acc: 0.8389 - recall: 0.8145 - val_loss: 0.5920 - val_acc: 0.7885 - val_recall: 0.7560
Epoch 4/20
 - 16s - loss: 0.3997 - acc: 0.8433 - recall: 0.8191 - val_loss: 0.6054 - val_acc: 0.7882 - val_recall: 0.7616
Epoch 5/20
 - 13s - loss: 0.3935 - acc: 0.8463 - recall: 0.8249 - val_loss: 0.6039 - val_acc: 0.7855 - val_recall: 0.7595
Epoch 6/20
 - 10s - loss: 0.3851 - acc: 0.8493 - recall: 0.8275 - val_loss: 0.6116 - val_acc: 0.7849 - val_recall: 0.7582
Epoch 7/20
 - 10s - loss: 0.3771 - acc: 0.8520 - recall: 0.8322 - val_loss: 0.6116 - val_acc: 0.7856 - val_recall: 0.7612
Epoch 8/20
 - 10s - loss: 0.3720 - acc: 0.8525 - recall: 0.8330 - val_loss: 0.6197 - val_acc: 0.

<keras.callbacks.History at 0xd8a82d1b38>

In [19]:
# Use LSTM

# length every integer sequence is brought to by pad_sequences
max_len = 70

# using keras tokenizer here, fitted on the text of both splits
token = text.Tokenizer(num_words=None)
token.fit_on_texts(list(xtrain) + list(xvalid))
word_index = token.word_index

# texts -> integer id sequences -> zero padded arrays, for each split
xtrain_seq = token.texts_to_sequences(xtrain)
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)

xvalid_seq = token.texts_to_sequences(xvalid)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

In [20]:
# Build the embedding lookup table for the dataset vocabulary: row i holds the
# GloVe vector of the word with tokenizer index i. Rows for words missing from
# GloVe (and the extra row 0) stay all-zero.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

100%|█████████████████████████████████| 39288/39288 [00:00<00:00, 93700.34it/s]


In [21]:
# A simple LSTM with glove embeddings and two dense layers
# The output width is taken from the one-hot targets rather than hard-coded,
# so the network cannot fall out of sync with the label encoding.
n_classes = ytrain_enc.shape[1]

model = Sequential()
# frozen embedding layer initialised from the GloVe matrix
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

# NOTE(review): Dropout(0.8) drops 80% of only 100 units in each dense block;
# this looks very aggressive for layers this small - confirm it is intended.
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(100, activation='relu'))
model.add(Dropout(0.8))

# output: one unit per class, softmax over the classes
model.add(Dense(n_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',metrics=['accuracy', recall], optimizer='adam')

In [22]:
# Train the LSTM on the padded id sequences and evaluate it on the
# validation split; verbose=2 prints one summary line per epoch.
model.fit(
    x=xtrain_pad,
    y=ytrain_enc,
    validation_data=(xvalid_pad, yvalid_enc),
    epochs=5,
    batch_size=512,
    verbose=2,
)

Train on 45795 samples, validate on 11449 samples
Epoch 1/5
 - 197s - loss: 1.6662 - acc: 0.2598 - recall: 0.0148 - val_loss: 1.4332 - val_acc: 0.4153 - val_recall: 0.1342
Epoch 2/5
 - 191s - loss: 1.4457 - acc: 0.3850 - recall: 0.1380 - val_loss: 1.2012 - val_acc: 0.5462 - val_recall: 0.3024
Epoch 3/5
 - 196s - loss: 1.2909 - acc: 0.4825 - recall: 0.2467 - val_loss: 1.1450 - val_acc: 0.5788 - val_recall: 0.3002
Epoch 4/5
 - 199s - loss: 1.1974 - acc: 0.5408 - recall: 0.2987 - val_loss: 1.0194 - val_acc: 0.6363 - val_recall: 0.3587
Epoch 5/5
 - 201s - loss: 1.1319 - acc: 0.5842 - recall: 0.3422 - val_loss: 0.9488 - val_acc: 0.6667 - val_recall: 0.3944


<keras.callbacks.History at 0xd8a8602160>