In [1]:
import html
import os
import pickle

import matplotlib.cm
import matplotlib.colors
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import display, HTML # to display in ipython notebook
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Conv1D, Dot, Bidirectional, Flatten

# Constants

In [2]:
# Notebook-wide configuration.
OOV_TOKEN = '<UNK>'  # token that stands in for words missing from the vocabulary
EPOCHS = 2  # number of epochs passed to model.fit
PADDED_LEN = 100  # encoded reviews are truncated / zero-padded to this many tokens
BATCH_SIZE = 32  # examples per padded batch
N_SAMPLES = 500000  # used with BATCH_SIZE to derive steps_per_epoch; presumably the number of training reviews -- confirm
N_CLASSES = 5  # depth of the one-hot star labels and width of the softmax output

# Load data and preprocess text

Load the prebuilt vocabulary list and the word counts from a pickle file

In [3]:
# tokenizer.pkl holds two objects pickled back to back; they are read in the
# same order they were written: first the vocabulary list, then `word_counts`.
# NOTE(review): pickle.load can execute arbitrary code -- only open a
# tokenizer.pkl that comes from a trusted source.
with open('tokenizer.pkl', 'rb') as f:
    vocab_list = pickle.load(f)
    word_counts = pickle.load(f)

Create a class that is responsible for tokenizing and encoding sentences

In [4]:
class TokenizerEncoder:
    """Convert raw text to lists of integer ids and ids back to words.

    Id 0 is reserved for the padding token and id 1 for the out-of-vocabulary
    token; the words of `vocab_list` receive ids 2, 3, ... in list order.
    """

    def __init__(self, vocab_list, oov_token, lower=True, 
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r'):
        """
        vocab_list: iterable of known words
        oov_token: token used for every word that is not in `vocab_list`
        lower: if True, text is lower-cased before tokenizing
        filters: characters that are replaced by a blank space before tokenizing
        """
        self.oov_token = oov_token
        self.lower = lower
        self.filters = filters

        # word -> id; ids start at 2 because 0 (padding) and 1 (OOV) are reserved
        self.word_index = {}
        for position, word in enumerate(vocab_list):
            self.word_index[word] = position + 2
        self.word_index[oov_token] = 1

        # id -> word, plus an empty string for the padding id so decoding works
        self.reverse_word_index = {0: ''}
        for word, idx in self.word_index.items():
            self.reverse_word_index[idx] = word

    def encode(self, text):
        """Tokenize `text` and return the list of ids of its tokens."""
        normalized = text.lower() if self.lower else text
        # every filtered character becomes a blank space
        blank_out = str.maketrans(self.filters, ' ' * len(self.filters))
        tokens = nltk.word_tokenize(normalized.translate(blank_out))

        encoded_tokens = []
        for token in tokens:
            # unknown words fall back to the out-of-vocabulary entry
            key = token if token in self.word_index else self.oov_token
            encoded_tokens.append(self.word_index[key])
        return encoded_tokens

    def decode(self, encoded_tokens):
        """Return the list of words corresponding to a sequence of ids."""
        return [self.reverse_word_index[idx] for idx in encoded_tokens]

Create a function that tokenizes the text and one-hot encodes the label of each element of the TensorFlow `Dataset`

In [5]:
tokenizer = TokenizerEncoder(vocab_list=vocab_list, oov_token=OOV_TOKEN)

def _encode_example(text, stars):
    """Eager-side worker for `tf_encode`; receives EagerTensors from tf.py_function."""
    # .numpy() yields the raw bytes of the review, .decode() turns them into a str
    token_ids = tokenizer.encode(text.numpy().decode('utf-8'))
    # The star rating is shifted down by one to become a zero-based class id,
    # cast from tf.float32 to tf.int64 and then one-hot encoded.
    one_hot_stars = tf.one_hot(tf.cast(stars - 1.0, tf.int64), N_CLASSES)
    return token_ids, one_hot_stars

def tf_encode(text, stars):
    """Tokenize the texts and one-hot encode the labels

    text: scalar tf.string tensor holding one review
    stars: scalar tf.float32 tensor holding its star rating
    Returns (token ids as tf.int64, one-hot label as tf.float32).
    """
    tf_text, tf_stars = tf.py_function(
        func=_encode_example,
        inp=[text, stars],
        Tout=(tf.int64, tf.float32)
    )
    # tf.py_function discards static shape information; restore it so the
    # downstream slicing / padded_batch / Keras input see a known rank.
    tf_text.set_shape([None])
    tf_stars.set_shape([N_CLASSES])
    return tf_text, tf_stars

Prepare the dataset

In [6]:
# Input pipeline: read (stars, text) rows from the CSV, swap them to
# (text, stars), encode, cut to PADDED_LEN tokens, shuffle, pad each batch
# to PADDED_LEN and repeat forever.
dataset = (
    tf.data.experimental.CsvDataset(
        os.path.join('data', 'reviews.csv'),
        [tf.float32, tf.string],
        header=True,
    )
    .map(lambda stars, text: (text, stars))
    .map(tf_encode)
    # truncate sequences longer than PADDED_LEN
    .map(lambda text, stars: (text[:PADDED_LEN], stars))
    .shuffle(buffer_size=5000)
    .padded_batch(batch_size=BATCH_SIZE, padded_shapes=([PADDED_LEN], [None]))
    .repeat()
)

# Model

In [7]:
VOCAB_SIZE = len(vocab_list) + 2 # 1 for padding token (zero), 1 for out-of-vocabulary token
EMBED_SIZE = 100 # output dimension of the Embedding layer (size of each word vector)

In [8]:
def attention_block(inputs, d_a, r):
    """Compute r sets of attention weights over the time steps of `inputs`.

    inputs: H, shape (None, PADDED_LEN, 2*u)
    d_a: number of filters of the hidden (tanh) projection
    r: number of attention distributions produced per sequence
    Returns A, shape (None, PADDED_LEN, r), normalized with a softmax over
    axis 1. The softmax op is named 'attention_weights'.
    """
    # A kernel-size-1 convolution is equivalent to the matrix multiplication
    # L = tanh(W_s1 * H); hidden has shape (None, PADDED_LEN, d_a).
    hidden = Conv1D(filters=d_a, kernel_size=1, padding='same', activation='tanh')(inputs)
    # Second projection gives the unnormalized scores, shape (None, PADDED_LEN, r).
    scores = Conv1D(filters=r, kernel_size=1, padding='same', activation='linear')(hidden)
    return tf.nn.softmax(scores, axis=1, name='attention_weights')

In [9]:
# Classifier graph: embedding -> bidirectional LSTM -> attention ->
# attention-weighted sentence embedding -> two dense layers -> softmax over N_CLASSES.
inp = Input(shape=(PADDED_LEN,)) # (None, PADDED_LEN)
# mask_zero=False: the padding id 0 is embedded like any other token (no masking).
embed = Embedding(input_dim=VOCAB_SIZE, 
                  output_dim=EMBED_SIZE, 
                  embeddings_initializer='glorot_uniform', 
                  mask_zero=False)(inp) # (None, PADDED_LEN, EMBED_SIZE)
bi_lstm = Bidirectional(LSTM(units=300, return_sequences=True))(embed) # (None, PADDED_LEN, 2*u)
attention = attention_block(bi_lstm, d_a=350, r=30) # (None, PADDED_LEN, r)
sentence_embedding = Dot(axes=[1, 1])([attention, bi_lstm]) # M = transpose(A) * H, shape (None, r, 2*u)
flatten = Flatten()(sentence_embedding) # (None, r * 2u)
fc1 = Dense(units=3000, activation='relu')(flatten)
fc2 = Dense(units=3000, activation='relu')(fc1)
out = Dense(units=N_CLASSES, activation='softmax')(fc2) # class probabilities, (None, N_CLASSES)

W0815 14:22:24.506874 11588 deprecation.py:506] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0815 14:22:24.539672 11588 deprecation.py:506] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0815 14:22:24.541020 11588 deprecation.py:506] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops

In [10]:
# Wrap the layer graph (inp -> out) in a Keras model and print its layer summary.
model = tf.keras.Model(inp, out)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 100)     1000200     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 600)     962400      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 100, 350)     210350      bidirectional[0][0]              
______________________________________________________________________________________________

# Train

In [None]:
# Plain SGD with gradient-norm clipping; categorical cross-entropy matches the
# one-hot labels produced by tf_encode.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.06, clipnorm=0.5)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# The dataset repeats indefinitely, so steps_per_epoch determines how many
# batches make up one epoch.
model.fit(dataset, epochs=EPOCHS, steps_per_epoch=int(np.ceil(N_SAMPLES/BATCH_SIZE)))

In [None]:
# Persist the weights; the visualization section reloads this same file.
model.save_weights('model_weights.h5')

# Visualize

In [11]:
def colorize(words, color_array):
    """Render words as an HTML string, shading each word's background by its weight.

    words: list of words (strings)
    color_array: array of numbers between 0 and 1 of length equal to words;
        each number is mapped through matplotlib's 'Reds' colormap
    Returns the concatenated <span> elements as one string ('' if there are no words).
    """
    cmap = matplotlib.cm.get_cmap('Reds')
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    spans = []
    for word, color in zip(words, color_array):
        hex_color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        # Escape the word so that tokens such as '<UNK>' are displayed literally
        # instead of being parsed by the browser as an (invisible) HTML tag.
        spans.append(template.format(hex_color, '&nbsp;' + html.escape(word) + '&nbsp;'))
    return ''.join(spans)

In [12]:
def visualize(words, weights):
    """
    Display the words with a background shade that reflects their weights.

    words: list of words (string)
    weights: list of corresponding weights
    """
    # MinMaxScaler works on 2-D input, so treat the weights as a single column.
    weight_column = np.array(weights).reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))
    # Rescale into [0.2, 0.8] and flatten back to one value per word.
    shades = scaler.fit_transform(weight_column).squeeze(axis=1)
    display(HTML(colorize(words, shades)))

In [13]:
# Restore the weights written by the training section so that the
# visualization can run without retraining.
model.load_weights('model_weights.h5')

In [14]:
# Re-read the same reviews CSV as a DataFrame; the code below uses its
# 'stars' and 'text' columns.
df = pd.read_csv(os.path.join('data', 'reviews.csv'))

Keep only the 1-star and 5-star reviews

In [15]:
# Keep the rows whose rating is exactly 1.0 or 5.0; everything else is dropped.
df = df[df['stars'].isin([1.0, 5.0])]

Take only the first `TEST_SIZE` samples

In [16]:
TEST_SIZE = 1000  # number of filtered reviews kept for visualization
# Slice the first TEST_SIZE rows; texts and stars stay aligned by position.
texts = df['text'].values[:TEST_SIZE]
stars = df['stars'].values[:TEST_SIZE]

In [17]:
# Turn every raw review into its list of token ids.
texts = [tokenizer.encode(review) for review in texts]

In [18]:
# Pad with zeros / truncate at the end of each sequence so that every row has
# exactly PADDED_LEN ids, matching the model's input length.
texts = tf.keras.preprocessing.sequence.pad_sequences(texts, 
                                                      maxlen=PADDED_LEN, 
                                                      dtype='int64', 
                                                      padding='post', 
                                                      truncating='post')

Optional: only choose samples with high confidence (probability > 0.99)

In [None]:
# y = model.predict(np.array(texts), batch_size=500, verbose=1)
# high_confidence_index = np.where(np.max(y, axis=1) > 0.99)
# texts = texts[high_confidence_index]
# stars = stars[high_confidence_index]

Visualize attention weights

In [19]:
# Second model that shares the classifier's input but outputs the tensor named
# 'attention_weights' (the softmax created in attention_block).
# NOTE(review): the tensor is looked up by name in the default graph;
# presumably re-running the model-building cell in the same kernel creates a
# differently named tensor -- confirm before relying on this after re-runs.
visualize_model = tf.keras.Model(model.input, tf.get_default_graph().get_tensor_by_name('attention_weights:0'))

In [20]:
# Index of the next test sample to visualize; the visualization cell below
# increments it each time it is run.
i=0

In [39]:
# Visualize sample i, then advance i so that re-running this cell shows the next sample.
test_text = texts[i]
# Predict on a batch of one and drop the batch axis again.
a = np.squeeze(visualize_model.predict(np.array([test_text])), axis=0)
# Sum each token's weights over the attention columns and normalize so the
# per-token weights add up to 1.
a_normalized = a.sum(axis=1) / a.sum()
visualize(tokenizer.decode(test_text), a_normalized)
i = i + 1