In [1]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [10]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
#from inspect import signature
from sklearn.metrics import roc_curve, auc

In [3]:
imdb = tf.keras.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)


# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()} 
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3





train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=120)

test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=120)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [4]:
maxSeqLength = 120

def getTrainBatch():
  labels = []
  arr = train_data[0:]
  for i in range(train_data.shape[0]):
    if(train_labels[i] == 1):
      labels.append([0,1])
    else:
      labels.append([1,0])
  return arr, labels


def getTestBatch():
  labels = []
  arr = test_data
  for i in range(train_data.shape[0]):
    if(test_labels[i] == 1):
      labels.append([0,1])
    else:
      labels.append([1,0])

  return arr, labels

In [5]:
embedding_size = 1

inputs = tf.placeholder(tf.int32, [None,120], name='word_ids')

# This is where the embedding vectors live
# This will be modified by the optimization unless trainable=False
# I choose random normal distribution but you can try other distributions
embeddings = tf.random_normal(shape=(25000, embedding_size))

# this will return the embedding lookup
embedded = tf.nn.embedding_lookup(embeddings, inputs)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
transformed = sess.run(embedded, {inputs: train_data})
transformed = transformed.reshape(25000, 120)
print (transformed.shape)


(25000, 120)


In [6]:
# vanilla rnn
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 1000

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [25000, numClasses])
input_data = tf.placeholder(tf.int32, [25000, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, 300]),dtype=tf.float32)
data = tf.nn.embedding_lookup(transformed,input_data)

lstmCell = tf.nn.rnn_cell.BasicRNNCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))


loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

train_acc = []
train_loss = []

val_acc = []
val_loss = []


for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    #print (nextBatch)
    _,acc,los = sess.run([optimizer, accuracy, loss], {input_data: nextBatch, labels: nextBatchLabels})
    print ("train accuracy: {}  train loss: {}".format(acc, los))
    train_acc.append(acc)
    train_loss.append(los)
    #Write summary to Tensorboard
    if (i % 50 == 0):
        #summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        #writer.add_summary(summary, i)
        nextBatch, nextBatchLabels = getTestBatch();
        _,acc,los,preds = sess.run([optimizer, accuracy, loss, prediction], {input_data: nextBatch, labels: nextBatchLabels})
        print ("____________________________________________")
        print("Test accuracy: {}   Test loss: {} ".format(acc, los))
        print ("____________________________________________")
        val_acc.append(acc)
        val_loss.append(los)
    #Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)

print (preds.shape)


Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



  num_elements)


train accuracy: 0.4920400083065033  train loss: 3.4153575897216797
____________________________________________
Test accuracy: 0.5026000142097473   Test loss: 3.0201950073242188 
____________________________________________
train accuracy: 0.5043200254440308  train loss: 2.989389657974243
train accuracy: 0.5003200173377991  train loss: 2.8631956577301025
train accuracy: 0.508840024471283  train loss: 2.6636111736297607
train accuracy: 0.5049600005149841  train loss: 2.6054022312164307
train accuracy: 0.5041999816894531  train loss: 2.5522398948669434
train accuracy: 0.5136799812316895  train loss: 2.4750757217407227
train accuracy: 0.5124800205230713  train loss: 2.4640727043151855
train accuracy: 0.5135200023651123  train loss: 2.391267776489258
train accuracy: 0.5217999815940857  train loss: 2.350724935531616
train accuracy: 0.5233200192451477  train loss: 2.340313196182251
train accuracy: 0.5213599801063538  train loss: 2.325680732727051
train accuracy: 0.5244399905204773  train los

In [22]:
from inspect import signature

a = test_labels.reshape(25000)
#b = np.argmax(preds,1)
b = [i[1] for i in preds]
#print (b)
precision, recall, _ = precision_recall_curve(a, b)

average_precision = average_precision_score(a, b)

auc = auc(recall, precision)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.3,
         where='post')
#plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUPRC={0:0.2f}'.format(
          auc))

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(a, b)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(a.ravel(), b)
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[0], tpr[0], color='darkorange',
         lw=lw, label='AUC = %0.2f' % roc_auc[0])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

print (test_labels)
print (preds.shape)
a = test_labels.reshape(25000)
#b = np.argmax(preds,1)
b = [i[1] for i in preds]
print (b)
precision, recall, _ = precision_recall_curve(a, b)

average_precision = average_precision_score(a, b)

auc = auc(recall, precision)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.3,
         where='post')
#plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUPRC={0:0.2f}'.format(
          auc))

TypeError: ignored

In [23]:
#GRU
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 1000

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [25000, numClasses])
input_data = tf.placeholder(tf.int32, [25000, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, 300]),dtype=tf.float32)
data = tf.nn.embedding_lookup(transformed,input_data)

lstmCell = tf.nn.rnn_cell.GRUCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))


loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

train_acc = []
train_loss = []

val_acc = []
val_loss = []


for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    #print (nextBatch)
    _,acc,los = sess.run([optimizer, accuracy, loss], {input_data: nextBatch, labels: nextBatchLabels})
    print ("train accuracy: {}  train loss: {}".format(acc, los))
    train_acc.append(acc)
    train_loss.append(los)
    #Write summary to Tensorboard
    if (i % 50 == 0):
        #summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        #writer.add_summary(summary, i)
        nextBatch, nextBatchLabels = getTestBatch();
        _,acc,los,preds = sess.run([optimizer, accuracy, loss, prediction], {input_data: nextBatch, labels: nextBatchLabels})
        print ("____________________________________________")
        print("Test accuracy: {}   Test loss: {} ".format(acc, los))
        print ("____________________________________________")
        val_acc.append(acc)
        val_loss.append(los)
    #Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
    #writer.close()

print (test_labels)
print (preds.shape)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


  num_elements)


train accuracy: 0.5092399716377258  train loss: 1.8348352909088135
____________________________________________
Test accuracy: 0.5153200030326843   Test loss: 1.7740966081619263 
____________________________________________
train accuracy: 0.5231199860572815  train loss: 1.667412519454956
train accuracy: 0.5232399702072144  train loss: 1.5416234731674194
train accuracy: 0.52183997631073  train loss: 1.5058790445327759
train accuracy: 0.5171200037002563  train loss: 1.500592827796936
train accuracy: 0.5292400121688843  train loss: 1.4332512617111206
train accuracy: 0.5284000039100647  train loss: 1.3878194093704224
train accuracy: 0.531000018119812  train loss: 1.3758331537246704
train accuracy: 0.5367199778556824  train loss: 1.3520352840423584
train accuracy: 0.5383599996566772  train loss: 1.3125123977661133
train accuracy: 0.5425599813461304  train loss: 1.282914400100708
train accuracy: 0.5397999882698059  train loss: 1.2657110691070557
train accuracy: 0.5460799932479858  train los

KeyboardInterrupt: ignored

In [None]:
print (test_labels)
print (preds.shape)
a = test_labels.reshape(25000)
#b = np.argmax(preds,1)
b = [i[1] for i in preds]
#print (b)
precision, recall, _ = precision_recall_curve(a, b)

average_precision = average_precision_score(a, b)

auc = auc(recall, precision)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.3,
         where='post')
#plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUPRC={0:0.2f}'.format(
          auc))


In [25]:
# LSTM
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 1000

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [25000, numClasses])
input_data = tf.placeholder(tf.int32, [25000, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, 300]),dtype=tf.float32)
data = tf.nn.embedding_lookup(transformed,input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))


loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

train_acc = []
train_loss = []

val_acc = []
val_loss = []


for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    #print (nextBatch)
    _,acc,los = sess.run([optimizer, accuracy, loss], {input_data: nextBatch, labels: nextBatchLabels})
    print ("train accuracy: {}  train loss: {}".format(acc, los))
    train_acc.append(acc)
    train_loss.append(los)
    #Write summary to Tensorboard
    if (i % 50 == 0):
        #summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        #writer.add_summary(summary, i)
        nextBatch, nextBatchLabels = getTestBatch();
        _,acc,los,preds = sess.run([optimizer, accuracy, loss, prediction], {input_data: nextBatch, labels: nextBatchLabels})
        print ("____________________________________________")
        print("Test accuracy: {}   Test loss: {} ".format(acc, los))
        print ("____________________________________________")
        val_acc.append(acc)
        val_loss.append(los)
    #Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
    #writer.close()

print (test_labels)
print (preds.shape)
a = test_labels.reshape(25000)
#b = np.argmax(preds,1)
b = [i[1] for i in preds]
#print (b)
precision, recall, _ = precision_recall_curve(a, b)

average_precision = average_precision_score(a, b)

auc = auc(recall, precision)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.3,
         where='post')
#plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUPRC={0:0.2f}'.format(
          auc))

  num_elements)


train accuracy: 0.4946799874305725  train loss: 1.2008792161941528
____________________________________________
Test accuracy: 0.5090000033378601   Test loss: 1.2350044250488281 
____________________________________________
train accuracy: 0.5090000033378601  train loss: 1.1018799543380737
train accuracy: 0.5079200267791748  train loss: 1.0928336381912231
train accuracy: 0.5131999850273132  train loss: 1.0772737264633179
train accuracy: 0.5166000127792358  train loss: 1.039305329322815
train accuracy: 0.526639997959137  train loss: 0.9952205419540405
train accuracy: 0.5334799885749817  train loss: 0.9821171760559082
train accuracy: 0.5382400155067444  train loss: 0.9720572829246521


KeyboardInterrupt: ignored

In [26]:
print (test_labels)
print (preds.shape)
a = test_labels.reshape(25000)
#b = np.argmax(preds,1)
b = [i[1] for i in preds]
#print (b)
precision, recall, _ = precision_recall_curve(a, b)

average_precision = average_precision_score(a, b)

auc = auc(recall, precision)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.3,
         where='post')
#plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUPRC={0:0.2f}'.format(
          auc))

[0 1 1 ... 0 0 0]
(25000, 2)


TypeError: ignored