In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [2]:
train_data = load_pandas(TrainOrTest.TRAIN)
test_data = load_pandas(TrainOrTest.TEST)

train_data['sents'] = train_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])
test_data['sents'] = test_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])

train_data.head()

Unnamed: 0,text,label,sents
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[Cheryl, :, Are, we, in, a, good, place, to, ..."
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[Our, friend, ,, General, Joe, Ballard, owns,..."
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[Outstanding, news, !], [Miki, Rakic, called,..."
3,Responding to separate emails from Uzra + Jeff...,1,"[[Responding, to, separate, emails, from, Uzra..."
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[Guy, from, Mexico, is, in, NY, and, is, coop..."


In [3]:
t = tf.keras.preprocessing.text.Tokenizer()
t.fit_on_texts([
    word
    for doc in train_data['sents']
    for sent in doc
    for word in sent
])

vocab_size = len(t.word_index) + 1

# integer encode the documents
def encode(texts):
    return [
        t.texts_to_sequences(doc) for doc in texts
    ]

encoded_train = encode(train_data['sents'])
encoded_test = encode(test_data['sents'])

max_sent_length = max(
    max(len(sent) for txt in encoded_train for sent in txt),
    max(len(sent) for txt in encoded_test for sent in txt)
)
max_doc_length = max(
    max(len(txt) for txt in encoded_train),
    max(len(txt) for txt in encoded_test)
)

vocab_size, max_sent_length, max_doc_length

(27431, 239, 32)

In [19]:
def pad_to_dense(M, sent_len, doc_len):
    maxlen = max(len(r) for r in M)

    Z = np.zeros((len(M), doc_len, sent_len))
    for docidx, doc in enumerate(M):
        for sentidx, sent in enumerate(doc):
            Z[docidx, sentidx, :len(sent)] += np.array(sent)
    return Z


def categorical_labels(labels):
    eye = [
        [1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0],
    ]
    
    result = []
    for item in labels:
        result.append(eye[int(item)])
        
    return result


train_tensor = tf.convert_to_tensor(pad_to_dense(encoded_train, max_sent_length, max_doc_length))
train_labels = tf.convert_to_tensor(categorical_labels(train_data['label']))
test_tensor = tf.convert_to_tensor(pad_to_dense(encoded_test, max_sent_length, max_doc_length))
test_labels = tf.convert_to_tensor(categorical_labels(test_data['label']))

train_tensor.shape, train_labels.shape, test_tensor.shape, test_labels.shape

(TensorShape([4000, 32, 239]),
 TensorShape([4000, 5]),
 TensorShape([800, 32, 239]),
 TensorShape([800, 5]))

# problem, why does it matter - metric

# previous work

# my approach

# outcome - good and bad

# what would I do if i have more time

audience is instructors for report, but other students for presentation

In [20]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print(embedding_matrix[72])

Found 400000 word vectors.
[-0.013767    0.33247     0.59895003 -0.51560998 -0.56200999  0.12329
 -0.13417     0.26010999  0.65116     0.024565    0.31990001  0.49118
  0.30761999  0.53333002  0.68014997 -0.25419    -0.068138    0.26390001
 -0.35962999  0.47576001  0.36392     0.23683999 -0.24312    -0.52683997
  0.15305001  0.032089   -0.11053    -0.71643001 -0.026425   -0.41872001
  0.18218     0.084099   -0.18880001  0.22899     0.30495     0.45337
  0.27868     0.054886   -0.046348    0.14313    -0.48341     0.27654001
  0.53847998 -0.66876     0.13568    -0.45659     0.20602    -0.67056
 -0.65925997 -1.09109998  0.24557    -0.14213     0.086415    0.85842001
 -0.016081   -2.7815001   0.40608001 -0.094489    1.77760005  0.85031998
 -0.34224001  0.39772001 -0.83965999  0.13606     1.18959999 -0.17331
  0.71847999  0.042783   -0.24022     0.07143    -0.41633001 -0.39236
  0.2579     -0.64804     0.64845002  0.47343999  0.036004   -0.39083001
 -1.37950003 -0.084084    0.81971997 -0.70

In [21]:
BATCH_SIZE = 2  # keep it small for GPU ram porpoises

embedding = tf.keras.layers.Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False,
                            input_shape=(max_doc_length, max_sent_length),
                           mask_zero=True)

embedding(train_tensor[:BATCH_SIZE]).shape

TensorShape([2, 32, 239, 100])

In [22]:
example_document = [
    word_tokenize(sent)
    for sent in sent_tokenize("\n".join([
        "Dear abby,"
        "I'm writing to tell you you suck."
        "Help me out of this mess.",
        "Bye"
    ]))
]

encoded_example = pad_to_dense(encode(example_document), max_sent_length, max_doc_length)
print(encoded_example)
encoded_example.shape

[[[  535.     0.     0. ...     0.     0.     0.]
  [11019.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]

 [[ 5008.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]]


(2, 32, 239)

In [23]:
tf.math.count_nonzero(encoded_example).numpy()

17

In [24]:
embedded_example = embedding(encoded_example)
embedded_example.shape

TensorShape([2, 32, 239, 100])

In [25]:
test_flatten = lambda t: tf.reshape(t, (BATCH_SIZE * max_doc_length, max_sent_length, -1))
test_lstm = tf.keras.layers.LSTM(1024)
test_reshape = lambda t: tf.reshape(t, (BATCH_SIZE, max_doc_length, 1024))

print(embedded_example.shape)
print(test_flatten(embedded_example).shape)
print(test_lstm(test_flatten(embedded_example)).shape)
print(test_reshape(test_lstm(test_flatten(embedded_example))).shape)

(2, 32, 239, 100)
(64, 239, 100)
(64, 1024)
(2, 32, 1024)


In [26]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        scores = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs):
        sequence_encoded, state_h, state_c = self.lstm(inputs)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights

In [27]:
test_flatten = lambda t: tf.reshape(t, (2 * max_doc_length, max_sent_length, -1))
test_lstm = AttentiveSequenceEncoder(1024, 128)
test_reshape = lambda t: tf.reshape(t, (2, max_doc_length, 1024))

print(embedded_example.shape)
print(test_flatten(embedded_example).shape)
print(test_lstm(test_flatten(embedded_example))[0].shape)
print(test_reshape(test_lstm(test_flatten(embedded_example))[0]).shape)

(2, 32, 239, 100)
(64, 239, 100)
(64, 1024)
(2, 32, 1024)


In [28]:
class DocumentClassifierModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, lstm_units,
                 batch_size, max_doc_length, max_sent_length):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            len(embedding_matrix[0]),
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(max_doc_length, max_sent_length),
            mask_zero=True)
        self.sentence_encoder = tf.keras.layers.LSTM(lstm_units)
        self.document_encoder = tf.keras.layers.LSTM(lstm_units)
        self.dense1 = tf.keras.layers.Dense(512, activation=tf.nn.tanh)
        self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
        
        self.batch_size = batch_size
        self.max_doc_length = max_doc_length
        self.max_sent_length = max_sent_length
    
    def flatten_docs(self, t):
        return tf.reshape(t, (self.batch_size * self.max_doc_length, self.max_sent_length, -1))
    
    def inflate_docs(self, t):
        return tf.reshape(t, (self.batch_size, self.max_doc_length, -1))
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.flatten_docs(x)
        x = self.sentence_encoder(x)
        x = self.inflate_docs(x)
        x = self.document_encoder(x)
        x = self.dense1(x)
        
        return self.dense2(x)

In [29]:
class AttentiveDocumentClassifierModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, lstm_units, attention_units,
                 batch_size, max_doc_length, max_sent_length):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            len(embedding_matrix[0]),
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(max_doc_length, max_sent_length),
            mask_zero=True)
        self.sentence_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.document_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.dense1 = tf.keras.layers.Dense(512, activation=tf.nn.tanh)
        self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
        
        self.batch_size = batch_size
        self.max_doc_length = max_doc_length
        self.max_sent_length = max_sent_length
    
    def flatten_docs(self, t):
        return tf.reshape(t, (self.batch_size * self.max_doc_length, self.max_sent_length, -1))
    
    def inflate_docs(self, t):
        return tf.reshape(t, (self.batch_size, self.max_doc_length, -1))
        
    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.flatten_docs(x)
        x, _ = self.sentence_encoder(x)
        x = self.inflate_docs(x)
        x, _ = self.document_encoder(x)
        x = self.dense1(x)
        
        return self.dense2(x)

In [30]:
doc_model = DocumentClassifierModel(vocab_size, embedding_matrix, 256,
                                    BATCH_SIZE, max_doc_length, max_sent_length)
attentive_doc_model = AttentiveDocumentClassifierModel(vocab_size, embedding_matrix, 256, 32,
                                                       BATCH_SIZE, max_doc_length, max_sent_length)

doc_model(encoded_example)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=79736, shape=(2, 5), dtype=float32, numpy=
array([[0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2]], dtype=float32)>

In [31]:
attentive_doc_model(encoded_example)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=87910, shape=(2, 5), dtype=float32, numpy=
array([[0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2]], dtype=float32)>

In [32]:
doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())

In [34]:
doc_model.fit(
    train_tensor[:100], train_labels[:100],
    batch_size=BATCH_SIZE,
    epochs=10)

Train on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd1295e0240>

In [35]:
attentive_doc_model.fit(
    train_tensor[:100], train_labels[:100],
    batch_size=BATCH_SIZE,
    epochs=10)

Train on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd120cc7f28>