In [140]:
import numpy as np
from utils import *
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

X_train, Y_train = read_csv('data/train.csv') 
X_test, Y_test = read_csv('data/test.csv') 

maxLen = len(max(X_train, key=len).split())

word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    m = X.shape[0]
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        sentence_words = X[i].lower().split()
        j = 0
        for w in sentence_words:
            if w not in word_to_index:
                X_indices[i, j] = 0 # HACK - FIX SOON
            else:
                X_indices[i, j] = word_to_index[w]
            j = j+1
    return X_indices

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """

    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map["lemon"].shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim)) # curious why not transpose of this...
    # Sets each row "index" of the embedding matrix to be 
    # the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix]) # now it's pretrained!

    return embedding_layer

embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

def Model_V1(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Model-V1 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    sentence_indices = Input(shape = input_shape, dtype = np.int32)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagates sentence_indices through the embedding layer
    embeddings = embedding_layer(sentence_indices)

    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    LSTM1 = LSTM(128, return_sequences = True,name='LSTM1')(embeddings)
    # Adds dropout with probability 0.5
    X = Dropout(0.5)(LSTM1)
    # Another LSTM layer, but just returns one output
    LSTM2 = LSTM(128, return_sequences = True, name='LSTM2')(X)
    
    def get_last(X):
        return X[:,-1,:]
    
    LSTM2Last = Lambda(get_last, name='LSTM2-last')(LSTM2)
    Dropout2 = Dropout(0.5,name='Dropout2')(LSTM2Last)
    
    # Propagating through a Dense layer with sigmoid activation to get back a scalar
    Dense1 = Dense(1,name='Dense1')(Dropout2)
    X = Activation('sigmoid',name='output_layer')(Dense1)

    model = Model(inputs = sentence_indices, outputs = X)

    return model

model = Model_V1((maxLen,), word_to_vec_map, word_to_index)

# might want to change the metric here
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)

model.fit(X_train_indices, Y_train, epochs = 20, batch_size = 6, shuffle=True)

X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
loss, acc = model.evaluate(X_test_indices, Y_test)
model.save('my_model.h5')
print()
print("Test accuracy = ", acc)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Test accuracy =  0.8538043478260869


In [147]:
def visualize_model_bs(model, include_gradients=False):

    LSTM2 = model.get_layer('LSTM2')
    Dropout2 = model.get_layer('Dropout2')
    output_layer = model.get_layer('output_layer')

    inputs = []
    inputs.extend(model.inputs)

    outputs = []
    outputs.extend(model.outputs)
    outputs.append(LSTM2.output)
    outputs.append(LSTM2.cell.kernel_f)  # -- weights of the forget gates (assuming LSTM)
    #print (LSTM1.trainable_weights)

    if include_gradients:
        loss = K.mean(model.output)  # [batch_size, 1] -> scalar
        grads = K.gradients(loss, LSTM2.output)
        grads_norm = grads / (K.sqrt(K.mean(K.square(grads))) + 1e-5)
        outputs.append(grads_norm)

    all_function = K.function(inputs, outputs)
    output_function = K.function([Dropout2.input], model.outputs)
    print(Dropout2.input)
    return all_function, output_function

In [148]:
all_function, output_function = visualize_model_bs(model, include_gradients=True)

Tensor("LSTM2-last_1/strided_slice:0", shape=(?, 128), dtype=float32)


In [149]:
t = np.array(["You need not to have called me up so late at night",'As am is'])
X = sentences_to_indices(t, word_to_index, maxLen)
# -- Return scores, raw rnn values and gradients
# scores is equivalent to model.predict(X)
scores, rnn_values, rnn_gradients, W_i = all_function([X])
print(scores.shape, rnn_values.shape, rnn_gradients.shape, W_i.shape)

# -- score prediction
print("Scores:", scores)

# -- Return scores at each step in the time sequence
time_distributed_scores = map(lambda x: output_function([x]), rnn_values)
print("Time distributed (word-level) scores:", map(lambda x: x[0], time_distributed_scores))

(2, 1) (2, 22, 128) (128, 128) (1, 2, 22, 128)
Scores: [[0.8317234 ]
 [0.09714699]]
Time distributed (word-level) scores: [array([[0.5103478 ],
       [0.5637954 ],
       [0.6428501 ],
       [0.7160077 ],
       [0.7672935 ],
       [0.79743946],
       [0.81008464],
       [0.82299113],
       [0.8304071 ],
       [0.83671296],
       [0.83815575],
       [0.8378455 ],
       [0.8369964 ],
       [0.83584464],
       [0.83504534],
       [0.8343906 ],
       [0.8338339 ],
       [0.8333432 ],
       [0.83289826],
       [0.83248544],
       [0.83209586],
       [0.8317234 ]], dtype=float32), array([[0.49421164],
       [0.5000774 ],
       [0.49013227],
       [0.4685592 ],
       [0.43677354],
       [0.3991497 ],
       [0.35915443],
       [0.31949675],
       [0.2821362 ],
       [0.24833643],
       [0.21876137],
       [0.19359471],
       [0.1726637 ],
       [0.15556085],
       [0.14175455],
       [0.1306813 ],
       [0.12181053],
       [0.11468153],
       [0.10891619],

In [144]:
rnn_values

array([[[ 0.00742034,  0.03337668,  0.01174635, ...,  0.0531809 ,
          0.00585037,  0.05965093],
        [ 0.04074103,  0.04676349,  0.03048183, ...,  0.06056848,
          0.06381115,  0.09195539],
        [ 0.10616807,  0.01623522,  0.0449459 , ...,  0.033544  ,
          0.17325191,  0.09447686],
        ...,
        [ 0.21929474, -0.41297337,  0.06639624, ..., -0.07637198,
          0.6058215 , -0.24190804],
        [ 0.21930094, -0.41567263,  0.06757586, ..., -0.07720998,
          0.60561544, -0.24450427],
        [ 0.21928251, -0.41798908,  0.06862893, ..., -0.07794573,
          0.605298  , -0.24689142]],

       [[-0.02379844,  0.02844131, -0.01638879, ...,  0.01416098,
         -0.02209499,  0.00391491],
        [-0.01955068,  0.0694175 , -0.03651516, ...,  0.05530659,
         -0.05460837,  0.04872531],
        [-0.05751175,  0.12331344, -0.06953079, ...,  0.07751152,
         -0.08808772,  0.08417251],
        ...,
        [-0.44136357,  0.27019277, -0.45307258, ...,  

## Weight analysis
-> need to check papers

## Neuron Firing (example based)
-> given two inputs (correct, wrong) 

-> check how the neron fire

In [13]:
x1="I had breakfirst today"
x2="breakfirst had today I"
y1=1
y2=0

x_ana_indices = sentences_to_indices(np.array([x1,x2]), word_to_index, maxLen)

loss, acc=model.evaluate(x_ana_indices, [y1,y2])
print loss


1.5999397039413452


In [None]:
sentence='I like the color today'

## error analysis
-> dev, testing, training error rate, and analysis how to improve the RNN

In [94]:
from __future__ import print_function

from keras import backend as K
from keras.engine import Input, Model, InputSpec
from keras.layers import Dense, Activation, Dropout, Lambda
from keras.layers import Embedding, LSTM
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.utils.data_utils import get_file
from keras.datasets import imdb

import numpy as np
import random
import sys
import pdb

MAX_FEATURES = 20000
MAXLEN = 40
BATCH_SIZE = 32
STEP = 3
EMBEDDING_DIM = 32
RNN_HIDDEN_DIM = 32

# TODO: add normalization
# TODO: activation cluster?
# TODO: get forget gates from LSTMs (not sure how to grab temporary tensors inside loops)


def build_vocab():
    vocab = imdb.get_word_index()
    rev_vocab = {v: k for k, v in vocab.iteritems()}
    return vocab, rev_vocab


def vectorize(text, vocab, 
              maxlen=MAXLEN, start_char=1, oov_char=2, index_from=3):
    """ might not be consistent with vectorize_data. """
    if isinstance(text, basestring):
        text = [text]
    v = [[vocab.get(w, oov_char) for w in t.lower().split()] for t in text]
    return sequence.pad_sequences(v, maxlen=maxlen)


def vectorize_data(max_features=MAX_FEATURES, maxlen=MAXLEN, batch_size=BATCH_SIZE, limit=None):

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    if limit is None:
        return X_train, y_train, X_test, y_test
    else:
        return X_train[:limit], y_train[:limit], X_test[:limit], y_test[:limit]


def build_model(vocab_size, maxlen=MAXLEN, rnn_hidden_dim=RNN_HIDDEN_DIM):
    input_layer = Input(shape=(maxlen, ), name='input_layer', dtype='int32')
    embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, input_length=maxlen, trainable=True, mask_zero=False, name='embedding_layer')(input_layer)  # -- ideally mask_zero=False but can't work with lambda layers
    recurrent_layer = LSTM(output_dim=rnn_hidden_dim, activation='tanh', return_sequences=True, name='recurrent_layer')(embedding_layer)
    last_step_layer = Lambda(lambda x: x[:, -1, :], output_shape=(rnn_hidden_dim, ), name='last_step_layer')(recurrent_layer)  # only needed for visualization
    output_layer = Dense(1, activation='sigmoid', name='output_layer')(last_step_layer)

    optimizer = Adam(lr=0.001)
    model = Model(input=input_layer, output=output_layer)
    print('Compiling...')
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    model.summary()
    return model

if __name__ == '__main__':
    # -- train
    vocab, rev_vocab = build_vocab()
    X_train, y_train, X_test, y_test = vectorize_data(limit=1000)
    model = build_model(len(vocab))
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1, verbose=True, # validation_split=0.05)
              validation_data=(X_test, y_test))

    acc = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE)
    print('Test accuracy:', acc)

    # -- predict
    

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
X_train shape: (25000, 40)
X_test shape: (25000, 40)
Compiling...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 40)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 40, 32)            2834688   
_________________________________________________________________
recurrent_layer (LSTM)       (None, 40, 32)            8320      
_________________________________________________________________
last_step_layer (Lambda)     (None, 32)                0         
_________________________________________________________________
output_layer (Dense)         (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
__________________________________________



Train on 1000 samples, validate on 1000 samples
Epoch 1/1
Test accuracy: 0.6917306571006775


In [97]:
all_function, output_function = visualize_model(model, include_gradients=True)

t = "HOW COULD anything originate out of its opposite?".lower()
X = vectorize(t, vocab)

# -- Return scores, raw rnn values and gradients
# scores is equivalent to model.predict(X)
scores, rnn_values, rnn_gradients, W_i = all_function([X])
print(scores.shape, rnn_values.shape, rnn_gradients.shape, W_i.shape)

# -- score prediction
print("Scores:", scores)

# -- Return scores at each step in the time sequence
time_distributed_scores = map(lambda x: output_function([x]), rnn_values)
print("Time distributed (word-level) scores:", map(lambda x: x[0], time_distributed_scores))

pdb.set_trace()

(1, 1) (1, 40, 32) (32, 32) (1, 1, 40, 32)
Scores: [[0.498088]]
Time distributed (word-level) scores: [array([[0.5006162 ],
       [0.5007734 ],
       [0.5008125 ],
       [0.5007444 ],
       [0.50059927],
       [0.50040865],
       [0.5001987 ],
       [0.49998844],
       [0.49979004],
       [0.49961036],
       [0.49945253],
       [0.49931702],
       [0.49920276],
       [0.49910778],
       [0.49902967],
       [0.49896622],
       [0.49891493],
       [0.49887398],
       [0.49884135],
       [0.4988156 ],
       [0.49879533],
       [0.49877948],
       [0.4987672 ],
       [0.49875766],
       [0.49875036],
       [0.4987448 ],
       [0.49874052],
       [0.4987373 ],
       [0.49873495],
       [0.49873316],
       [0.49873185],
       [0.4987309 ],
       [0.49495125],
       [0.49456042],
       [0.4971958 ],
       [0.49947283],
       [0.49798134],
       [0.49419206],
       [0.49576503],
       [0.498088  ]], dtype=float32)]
--Call--
> /Library/Frameworks/Python.fr

KeyboardInterrupt: 

In [96]:
def visualize_model(model, include_gradients=False):
    recurrent_layer = model.get_layer('recurrent_layer')
    output_layer = model.get_layer('output_layer')

    inputs = []
    inputs.extend(model.inputs)

    outputs = []
    outputs.extend(model.outputs)
    outputs.append(recurrent_layer.output)
    outputs.append(recurrent_layer.cell.kernel_f)  # -- weights of the forget gates (assuming LSTM)

    if include_gradients:
        loss = K.mean(model.output)  # [batch_size, 1] -> scalar
        grads = K.gradients(loss, recurrent_layer.output)
        grads_norm = grads / (K.sqrt(K.mean(K.square(grads))) + 1e-5)
        outputs.append(grads_norm)

    all_function = K.function(inputs, outputs)
    output_function = K.function([output_layer.input], model.outputs)
    return all_function, output_function

def get_compare_embeddings(original_embeddings, tuned_embeddings, vocab, dimreduce_type="pca", random_state=0):
    """ Compare embeddings drift. """
    if dimreduce_type == "pca":
        from sklearn.decomposition import PCA
        dimreducer = PCA(n_components=2, random_state=random_state)
    elif dimreduce_type == "tsne":
        from sklearn.manifold import TSNE
        dimreducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise Exception("Wrong dimreduce_type.")

    reduced_original = dimreducer.fit_transform(original_embeddings)
    reduced_tuned = dimreducer.fit_transform(tuned_embeddings)

    def compare_embeddings(word):
        if word not in vocab:
            return None
        word_id = vocab[word]
        original_x, original_y = reduced_original[word_id, :]
        tuned_x, tuned_y = reduced_tuned[word_id, :]
        return original_x, original_y, tuned_x, tuned_y

    return compare_embeddings