In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn
import pandas as pd
import numpy as np
from collections import Counter
from utils.Word2VecEncoder import Word2VecEncoder

Using Theano backend.


In [2]:
train = pd.read_csv('GP Prescribing - July 2017 (csv).csv', encoding = "ISO-8859-1")

In [3]:
test = pd.read_csv('GP Prescribing - August 2017 (csv).csv', encoding = "ISO-8859-1")

In [4]:
train = train.drop(['Gross Cost (£)', 'Unnamed: 17','Unnamed: 18'], 1)

In [5]:
train = train.dropna(axis=0)

In [6]:
#Get categorical data attributes, only for VMP_NM, AMP_NM and Presentation
categoric_values = ['VMP_NM', 'AMP_NM', 'Presentation']

#Initialize word2vec class with downsampling true, set a small sample size of 5000 in this case
w2v = Word2VecEncoder(train, categoric_values, down_sample = True, sample_size =len(train))

In [7]:
#Encode word2vec using default parameters
# w2v.encode()

In [8]:
# #Now to look at some info about the word2vec 
# #First outputing what the vector space model considers similar words for some examples in its vocabulary
# vocab = list(w2v.model.wv.vocab.keys())
# print(vocab)

In [9]:
# #Plot words in a scatterplot
# import matplotlib.pyplot as plt

# ax, fig  = w2v.plotWords()
# plt.show()

In [10]:
# w2v.saver(model_name = "w2v_model_July_2017")

In [11]:
np.mean([len(i.split(' ')) for i in train['VMP_NM']])

4.6402131279921557

In [12]:
context_size = 10
words = []
words_index = []
for i, line in enumerate(train['VMP_NM']):
    tokens = line.split()
    if '-' in tokens or None in tokens:
        words.extend('<unk>')
    else:
        words.extend(tokens)

In [13]:
vocab_size = 1000

word_counter = Counter(words).most_common(vocab_size - 1)
word_to_idx = {'<unk>': 0}
for i, _ in enumerate(word_counter):
    word_to_idx[_[0]] = i + 1

In [14]:
vocab_list = word_to_idx.keys()

In [15]:
context_size = 10
words = []
words_index = []
for i, line in enumerate(train['VMP_NM']):
    tokens = line.split()
    # NOTE Currently, only sentences with a fixed size are chosen
    # to account for fixed convolutional layer size.
    if len(tokens) > int(context_size - 2) or '-' in tokens:
        words.append((['<pad>'] * int(int(context_size - 2) - 1)) + ['<s>'] + ['Unknown'] + ['</s>'])
        words_index.append(i)
    elif len(tokens) < int(context_size - 2):
        words.append((['<pad>'] * int(int(context_size - 2) - len(tokens))) + ['<s>'] + tokens + ['</s>'])
        words_index.append(i)
    else:
        words.append((['<s>'] + tokens + ['</s>']))
        words_index.append(i)

In [16]:
words

[['<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Unknown',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Colostomy',
  'bags',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Colostomy',
  'bags',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<s>',
  'Dexamethasone',
  '3.3mg/1ml',
  'solution',
  'for',
  'injection',
  'ampoules',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Disposal',
  'unit',
  'for',
  'hypodermic',
  'equipment',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Disposal',
  'unit',
  'for',
  'hypodermic',
  'equipment',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<s>',
  'Elasticated',
  'viscose',
  'stockinette',
  'blue',
  'line',
  '7.5cm',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Unknown',
  '</s>'],
 ['<pad>',
  '<pad>',
  '<pad>',
  '<s>',
  'Generic',
  'Elleste',
  'Duet'

In [17]:
words = np.array(words)[words_index]

In [18]:
encoded_words = []
for word_list in words:
    encoded_word_list = []
    for word in word_list:
        if word in vocab_list:
            encoded_word_list.append(word_to_idx[word])
        else:
            encoded_word_list.append(0)
    encoded_words.append(encoded_word_list)

In [19]:
len(encoded_words)

416088

In [20]:
train['Actual Cost (£)'].head()

1      24.59
8     238.32
9     529.08
11     16.04
12      3.05
Name: Actual Cost (£), dtype: float64

In [21]:
y_data = np.array(train['Actual Cost (£)'])[words_index]

In [22]:
len(y_data)

416088

In [39]:
def create_batches(data, y_data, batch_size):
    num_batches = int(len(data) / (batch_size * context_size))
    data = data[:int(num_batches * batch_size * context_size)]
    
    xdata = data
    ydata = y_data[:int(num_batches * batch_size * context_size)]
    
    x_batches = np.split(np.array(xdata), context_size*num_batches)
    y_batches = np.split(ydata, context_size*num_batches)
    return x_batches, y_batches, num_batches

In [40]:
x_batches, y_batches, num_batches = create_batches(encoded_words, y_data, 128)

In [43]:
x_batches[0].shape

(128, 10)

In [59]:
# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [60]:
n_steps = x_batches[0].shape[1]
n_inputs = 1
n_neurons = 150
n_outputs = 1
n_layers = 3

learning_rate = 0.001

In [61]:
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)

In [62]:
reset_graph(seed=42)

In [63]:
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None])

lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
              for layer in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)
top_layer_h_state = states[-1][1]
logits = tf.layers.dense(top_layer_h_state, n_outputs, name="softmax")
loss = tf.reduce_mean(tf.square(logits - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
    
init = tf.global_variables_initializer()

In [None]:
n_epochs = 100
batch_size = 128

x_batches, y_batches, num_batches = create_batches(encoded_words, y_data, 128)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(num_batches):
            X_batch, y_batch = x_batches[iteration], y_batches[iteration]
            X_batch = X_batch.reshape((batch_size, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_train = loss.eval(feed_dict={X: X_batch, y: y_batch})
        print("Epoch", epoch, "Train loss =", loss_train)

Epoch 0 Train loss = 37497.3
Epoch 1 Train loss = 37123.6
Epoch 2 Train loss = 37079.4
Epoch 3 Train loss = 37072.8
Epoch 4 Train loss = 37071.6
Epoch 5 Train loss = 37071.4
Epoch 6 Train loss = 37071.3
Epoch 7 Train loss = 37071.3
Epoch 8 Train loss = 37071.3
Epoch 9 Train loss = 37071.3
Epoch 10 Train loss = 37071.2
Epoch 11 Train loss = 37071.2
Epoch 12 Train loss = 37071.3
Epoch 13 Train loss = 37071.3
Epoch 14 Train loss = 37071.2
Epoch 15 Train loss = 37071.2
Epoch 16 Train loss = 37071.3
Epoch 17 Train loss = 37071.2
Epoch 18 Train loss = 37071.2
Epoch 19 Train loss = 37071.2
Epoch 20 Train loss = 37071.2
Epoch 21 Train loss = 37071.2
Epoch 22 Train loss = 37071.2
Epoch 23 Train loss = 37071.2
Epoch 24 Train loss = 37071.2
Epoch 25 Train loss = 37071.2
Epoch 26 Train loss = 37071.2
Epoch 27 Train loss = 37071.2
Epoch 28 Train loss = 37071.2
Epoch 29 Train loss = 37071.2
Epoch 30 Train loss = 37071.2
Epoch 31 Train loss = 37071.2
Epoch 32 Train loss = 37071.2
Epoch 33 Train loss 