In [2]:
import tarfile
import re
import urllib.request
import os
import random

class ImdbMovieReviews:
    """
    Iterable over the training split of the Stanford IMDB movie review dataset.

    The dataset is offered by Stanford University's AI department:
    http://ai.stanford.edu/~amaas/data/sentiment/. It comes as a compressed tar archive where
    positive and negative reviews are stored as text files in two separate folders. Each review
    is tokenised with ``TOKEN_REGEX`` (runs of ASCII letters, plus a few punctuation marks as
    single tokens) and converted to lower case.

    Iterating yields ``(tokens, label)`` pairs, where ``tokens`` is a list of strings and
    ``label`` is ``True`` for files under ``aclImdb/train/pos/`` and ``False`` for files under
    ``aclImdb/train/neg/``. Every other archive member is ignored.
    """
    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    DEFAULT_CACHE_PATH = './imdb'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, cache_path=None, url=None):
        """
        Make sure the archive is available locally, downloading it if necessary.

        Args:
            cache_path: File path (not a directory) where the archive is stored.
                Defaults to ``DEFAULT_CACHE_PATH``.
            url: Location to download the archive from when ``cache_path`` does not
                exist yet. Defaults to ``DEFAULT_URL``.
        """
        self._cache_dir = cache_path or self.DEFAULT_CACHE_PATH
        self._url = url or self.DEFAULT_URL

        if not os.path.isfile(self._cache_dir):
            # Download to a side file and rename afterwards, so an interrupted
            # download never leaves a truncated archive at the cache path.
            partial_path = self._cache_dir + '.part'
            urllib.request.urlretrieve(self._url, partial_path)
            os.replace(partial_path, self._cache_dir)
        self.filepath = self._cache_dir

    def __iter__(self):
        """Yield ``(tokens, is_positive)`` for every training review in the archive."""
        with tarfile.open(self.filepath) as archive:
            for member in archive.getmembers():
                # Directory entries can share the prefix but have no content to read.
                if not member.isfile():
                    continue
                if member.name.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, member), True
                elif member.name.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, member), False

    def _read(self, archive, filename):
        """
        Read one archive member and return its lower-cased tokens.

        Args:
            archive: An open ``tarfile.TarFile``.
            filename: Member name or ``TarInfo`` of the review to read.

        Returns:
            List of lower-case token strings matched by ``TOKEN_REGEX``.
        """
        with archive.extractfile(filename) as file_:
            text = file_.read().decode('utf-8')
        return [token.lower() for token in type(self).TOKEN_REGEX.findall(text)]

In [3]:
import numpy as np
# spaCy is my favourite NLP framework; it has built-in word embeddings trained on Wikipedia.
from spacy.en import English

class Embedding:
    """
    Turn a token sequence into a fixed-size matrix of word vectors.

    Calling an instance returns an array of shape ``(length, dimensions)``. Row ``i`` holds
    the vector of the ``i``-th token; rows past the end of the sequence stay zero.
    """

    def __init__(self, length):
        """
        Args:
            length: Number of rows (time steps) in every returned matrix.
        """
        # spaCy makes using word vectors very easy.
        # The Lexeme, Token, Span and Doc classes all have a .vector property,
        # which is a 1-dimensional numpy array of 32-bit floats.
        self.parser = English()
        self._length = length
        # NOTE(review): assumes the loaded spaCy vectors are 300-dimensional -- confirm.
        self.dimensions = 300

    def __call__(self, sequence):
        """
        Embed ``sequence`` and zero-pad (or truncate) it to ``length`` rows.

        Args:
            sequence: Iterable of token strings.

        Returns:
            ``numpy`` array of shape ``(length, dimensions)``.
        """
        data = np.zeros((self._length, self.dimensions))
        # Truncate instead of failing when a sequence is longer than the padded length.
        tokens = list(sequence)[:self._length]
        # An empty list cannot be broadcast into a (0, dimensions) slice, so skip it.
        if tokens:
            # You can access known words from the parser's vocabulary.
            data[:len(tokens)] = [self.parser.vocab[w].vector for w in tokens]
        return data

In [23]:
from lazy import lazy

class SequenceClassificationModel:
    """
    Recurrent two-class sequence classifier built as a TensorFlow 1.x graph.

    The graph is assembled once in ``__init__`` on the current default graph. Each graph
    piece is a ``@lazy`` property, so it is created on first access and reused afterwards.

    ``params`` must provide ``seq_length``, ``embed_length``, ``rnn_cell``, ``rnn_hidden``,
    ``optimizer`` and ``gradient_clipping``.
    """

    # Directory that train() writes checkpoints to and resumes from.
    CHECKPOINT_DIR = 'checkpoints'

    def __init__(self, data, params):
        """
        Args:
            data: Unused; kept so existing positional calls keep working. Inputs are fed
                through the ``self.data`` placeholder instead.
            params: Hyper-parameter object (see class docstring).
        """
        self.params = params
        self._create_placeholders()
        # Touch every lazy property so the whole graph exists before variables are initialised.
        self.prediction
        self.cost
        self.error
        self.optimize
        # Counts train() calls; used to give each run its own summary directory.
        self.global_step = 0
        self._create_summaries()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _create_placeholders(self):
        """Create the input (``data``) and one-hot label (``target``) placeholders."""
        with tf.name_scope("data"):
            self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
            self.target = tf.placeholder(tf.float32, [None, 2])

    def _create_summaries(self):
        """Register scalar summaries for loss and error and merge them into ``self.summary``."""
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('error', self.error)
            self.summary = tf.summary.merge_all()

    @lazy
    def length(self):
        """
        Per-example count of time steps whose input vector has a non-zero entry.

        Padding rows are all zero and are not counted. A real token whose vector is all
        zero is not counted either.
        """
        with tf.name_scope("seq_length"):
            used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
            length = tf.reduce_sum(used, reduction_indices=1)
            length = tf.cast(length, tf.int32)
        return length

    @lazy
    def prediction(self):
        """Class probabilities: softmax over the RNN output at the last counted time step."""
        with tf.name_scope("recurrent_layer"):
            output, _ = tf.nn.dynamic_rnn(
                self.params.rnn_cell(self.params.rnn_hidden),
                self.data,
                dtype=tf.float32,
                sequence_length=self.length
            )
        last = self._last_relevant(output, self.length)

        with tf.name_scope("softmax_layer"):
            num_classes = int(self.target.get_shape()[1])
            weight = tf.Variable(tf.truncated_normal(
                [self.params.rnn_hidden, num_classes], stddev=0.01))
            bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy
    def cost(self):
        """Cross-entropy between ``target`` and ``prediction``, summed over the batch."""
        # Clip before the log so a probability of exactly 0 cannot turn the loss into NaN.
        clipped = tf.clip_by_value(self.prediction, 1e-10, 1.0)
        cross_entropy = -tf.reduce_sum(self.target * tf.log(clipped))
        return cross_entropy

    @lazy
    def error(self):
        """Fraction of examples in the batch whose arg-max prediction differs from the label."""
        self.mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))

    @lazy
    def optimize(self):
        """Training op: gradients of ``cost``, optionally clipped element-wise, then applied."""
        with tf.name_scope("optimization"):
            gradient = self.params.optimizer.compute_gradients(self.cost)
            if self.params.gradient_clipping:
                limit = self.params.gradient_clipping
                gradient = [
                    (tf.clip_by_value(g, -limit, limit), v)
                    if g is not None else (None, v)
                    for g, v in gradient]
            optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize

    @staticmethod
    def _last_relevant(output, length):
        """
        Select, for every sequence in the batch, the RNN output at step ``length - 1``.

        Args:
            output: RNN activations of shape ``sequences x time_steps x units``; the last
                two dimensions must be statically known.
            length: int32 vector with one length per sequence.

        Returns:
            Tensor of shape ``sequences x units``.
        """
        with tf.name_scope("last_relevant"):
            # Flatten the first two dimensions of the output activations (sequences x time_steps)
            # and build an index into the flat tensor for tf.gather().
            batch_size = tf.shape(output)[0]
            max_length = int(output.get_shape()[1])
            output_size = int(output.get_shape()[2])

            # The index is each sequence's start offset in the flat tensor plus length - 1,
            # i.e. its last valid time step. Clamp at 0 so an all-zero sequence does not
            # produce a negative index.
            last_step = tf.maximum(length - 1, 0)
            index = tf.range(0, batch_size) * max_length + last_step
            flat = tf.reshape(output, [-1, output_size])
            relevant = tf.gather(flat, index)
        return relevant

    def train(self, batches, save_prefix, save_every=10):
        """
        Run one optimisation step per batch, printing the batch error each time.

        Resumes from the newest checkpoint in ``CHECKPOINT_DIR`` when one exists. Every
        ``save_every`` batches a summary and a checkpoint are written, and a final
        checkpoint named ``save_prefix + '_final'`` is written when the batches run out.

        Args:
            batches: Iterable of ``(data, target)`` pairs matching the placeholders.
            save_prefix: File name prefix for checkpoints inside ``CHECKPOINT_DIR``.
            save_every: Interval, in batches, between summaries/checkpoints.
        """
        checkpoint_dir = self.CHECKPOINT_DIR
        saver = tf.train.Saver()
        os.makedirs(checkpoint_dir, exist_ok=True)
        # latest_checkpoint returns None when the directory holds no checkpoint yet.
        latest = tf.train.latest_checkpoint(checkpoint_dir)
        if latest is not None:
            saver.restore(self.sess, latest)

        summary_writer = tf.summary.FileWriter('graphs/run{}'.format(self.global_step), self.sess.graph)
        self.global_step += 1
        try:
            for index, batch in enumerate(batches):
                feed = {self.data: batch[0], self.target: batch[1]}
                error, _, summary_str = self.sess.run(
                    [self.error, self.optimize, self.summary], feed)
                print('{}: {:3.1f}%'.format(index + 1, 100 * error))
                if index % save_every == 0:
                    summary_writer.add_summary(summary_str, index)
                    summary_writer.flush()
                    save_path = os.path.join(checkpoint_dir, save_prefix)
                    print('saving...', save_path)
                    saver.save(self.sess, save_path, global_step=index)
            saver.save(self.sess, os.path.join(checkpoint_dir, save_prefix + '_final'))
        finally:
            summary_writer.close()

    def predict_proba(self, data):
        """
        Compute class probabilities for ``data``.

        Returns:
            A one-element list holding the probability array, because the fetch is passed
            to ``Session.run`` as a list.
        """
        feed = {self.data: data, }
        prediction = self.sess.run([self.prediction], feed)
        return prediction

    def close(self):
        """Close the session, then clear the default graph."""
        # Resetting the default graph while a session is still active is undefined behaviour.
        self.sess.close()
        tf.reset_default_graph()

In [7]:
def preprocess_batched(iterator, length, embedding, batch_size):
    """
    Group ``(text, label)`` samples into fixed-size embedded batches.

    Args:
        iterator: Iterable of ``(text, label)`` pairs; ``label`` is truthy for the
            first class.
        length: Number of time steps in each embedded sample.
        embedding: Callable mapping ``text`` to an array of shape
            ``(length, embedding.dimensions)``.
        batch_size: Number of samples per batch.

    Yields:
        ``(data, target)`` where ``data`` has shape
        ``(batch_size, length, embedding.dimensions)`` and ``target`` has shape
        ``(batch_size, 2)``, one-hot encoded as ``[1, 0]`` for truthy labels and
        ``[0, 1]`` otherwise. A trailing incomplete batch is dropped.
    """
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                # Under PEP 479 a StopIteration escaping a generator becomes a
                # RuntimeError, so finish the generator explicitly.
                return
            data[index] = embedding(text)
            target[index] = [1, 0] if label else [0, 1]
        yield data, target

In [8]:
# Load every (tokens, label) pair of the training split into memory.
reviews = [sample for sample in ImdbMovieReviews()]

In [9]:
# Shuffle in place with a fixed seed so the batch order is the same on every fresh run.
random.Random(42).shuffle(reviews)

In [10]:
# The longest review, in tokens, sets the padded sequence length for every sample.
length = max(len(tokens) for tokens, _ in reviews)
embedding = Embedding(length)

In [11]:
from attrdict import AttrDict

# Hyper-parameters read by SequenceClassificationModel and the batching cell.
params = AttrDict({
    # Recurrent layer
    'rnn_cell': tf.contrib.rnn.GRUCell,
    'rnn_hidden': 300,
    # Optimisation
    'optimizer': tf.train.RMSPropOptimizer(0.002),
    'batch_size': 20,
    'gradient_clipping': 100,
    # Input shape
    'seq_length': length,
    'embed_length': embedding.dimensions,
})

In [16]:
# Generator of embedded (data, target) batches; it is consumed once by training.
batches = preprocess_batched(
    iterator=reviews,
    length=length,
    embedding=embedding,
    batch_size=params.batch_size,
)

In [24]:
# Start from an empty default graph so re-running this cell does not stack duplicate ops.
tf.reset_default_graph()

# The constructor never uses its first argument, and no earlier cell defines `data`,
# so pass None rather than rely on a leftover kernel variable.
model = SequenceClassificationModel(None, params)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [25]:
# Train over all batches; checkpoints are written under the 'simple-rnn' prefix.
model.train(batches=batches, save_prefix='simple-rnn')

1: 35.0%
saving... checlkpoints/simple-rnn
2: 50.0%
3: 65.0%
4: 45.0%
5: 55.0%
6: 45.0%
7: 60.0%
8: 50.0%
9: 40.0%
10: 50.0%
11: 45.0%
saving... checlkpoints/simple-rnn
12: 25.0%
13: 50.0%
14: 40.0%
15: 50.0%
16: 55.0%
17: 55.0%
18: 35.0%
19: 45.0%
20: 35.0%
21: 55.0%
saving... checlkpoints/simple-rnn
22: 60.0%
23: 45.0%
24: 55.0%
25: 55.0%
26: 40.0%
27: 45.0%
28: 65.0%
29: 45.0%
30: 35.0%
31: 50.0%
saving... checlkpoints/simple-rnn
32: 25.0%
33: 35.0%
34: 65.0%
35: 45.0%
36: 45.0%
37: 50.0%
38: 45.0%
39: 60.0%
40: 55.0%
41: 60.0%
saving... checlkpoints/simple-rnn
42: 45.0%
43: 70.0%
44: 50.0%
45: 50.0%
46: 50.0%
47: 60.0%
48: 40.0%
49: 50.0%
50: 60.0%
51: 35.0%
saving... checlkpoints/simple-rnn
52: 45.0%
53: 40.0%
54: 35.0%
55: 50.0%
56: 70.0%
57: 40.0%
58: 45.0%
59: 40.0%
60: 45.0%
61: 55.0%
saving... checlkpoints/simple-rnn
62: 45.0%
63: 35.0%
64: 60.0%
65: 50.0%
66: 15.0%
67: 60.0%
68: 55.0%
69: 40.0%
70: 40.0%
71: 45.0%
saving... checlkpoints/simple-rnn
72: 50.0%
73: 50.0%
74: 25.

579: 20.0%
580: 20.0%
581: 25.0%
saving... checlkpoints/simple-rnn
582: 20.0%
583: 15.0%
584: 10.0%
585: 15.0%
586: 0.0%
587: 5.0%
588: 15.0%
589: 20.0%
590: 10.0%
591: 10.0%
saving... checlkpoints/simple-rnn
592: 5.0%
593: 5.0%
594: 10.0%
595: 10.0%
596: 15.0%
597: 5.0%
598: 30.0%
599: 15.0%
600: 30.0%
601: 30.0%
saving... checlkpoints/simple-rnn
602: 30.0%
603: 20.0%
604: 15.0%
605: 10.0%
606: 15.0%
607: 20.0%
608: 10.0%
609: 15.0%
610: 10.0%
611: 15.0%
saving... checlkpoints/simple-rnn
612: 20.0%
613: 0.0%
614: 15.0%
615: 20.0%
616: 15.0%
617: 25.0%
618: 5.0%
619: 15.0%
620: 30.0%
621: 20.0%
saving... checlkpoints/simple-rnn
622: 25.0%
623: 20.0%
624: 20.0%
625: 20.0%
626: 20.0%
627: 15.0%
628: 5.0%
629: 10.0%
630: 0.0%
631: 35.0%
saving... checlkpoints/simple-rnn
632: 15.0%
633: 10.0%
634: 25.0%
635: 30.0%
636: 10.0%
637: 20.0%
638: 15.0%
639: 40.0%
640: 25.0%
641: 20.0%
saving... checlkpoints/simple-rnn
642: 5.0%
643: 30.0%
644: 15.0%
645: 10.0%
646: 30.0%
647: 10.0%
648: 15.0%
64

1144: 10.0%
1145: 10.0%
1146: 15.0%
1147: 10.0%
1148: 10.0%
1149: 20.0%
1150: 5.0%
1151: 5.0%
saving... checlkpoints/simple-rnn
1152: 20.0%
1153: 5.0%
1154: 10.0%
1155: 30.0%
1156: 20.0%
1157: 20.0%
1158: 20.0%
1159: 20.0%
1160: 20.0%
1161: 15.0%
saving... checlkpoints/simple-rnn
1162: 15.0%
1163: 0.0%
1164: 15.0%
1165: 20.0%
1166: 5.0%
1167: 20.0%
1168: 20.0%
1169: 20.0%
1170: 20.0%
1171: 15.0%
saving... checlkpoints/simple-rnn
1172: 30.0%
1173: 15.0%
1174: 10.0%
1175: 30.0%
1176: 10.0%
1177: 5.0%
1178: 10.0%
1179: 30.0%
1180: 30.0%
1181: 10.0%
saving... checlkpoints/simple-rnn
1182: 0.0%
1183: 5.0%
1184: 5.0%
1185: 10.0%
1186: 5.0%
1187: 0.0%
1188: 15.0%
1189: 25.0%
1190: 20.0%
1191: 10.0%
saving... checlkpoints/simple-rnn
1192: 5.0%
1193: 5.0%
1194: 15.0%
1195: 15.0%
1196: 15.0%
1197: 20.0%
1198: 5.0%
1199: 20.0%
1200: 10.0%
1201: 15.0%
saving... checlkpoints/simple-rnn
1202: 0.0%
1203: 5.0%
1204: 20.0%
1205: 25.0%
1206: 15.0%
1207: 15.0%
1208: 10.0%
1209: 15.0%
1210: 5.0%
1211: 20.

