In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os
import time

from tensorflow.python.ops import array_ops

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [2]:
train_data = load_pandas(TrainOrTest.TRAIN)
test_data = load_pandas(TrainOrTest.TEST)

print(train_data['label'].unique())

train_data.head()

['2' '3' '1']


Unnamed: 0,text,label
0,Cheryl:\n\nAre we in a good place to begin pap...,2
1,"Our friend, General Joe Ballard owns The Raven...",2
2,Outstanding news! Miki Rakic called about 10 m...,3
3,Responding to separate emails from Uzra + Jeff...,1
4,Guy from Mexico is in NY and is cooperating. D...,1


In [3]:
t = tf.keras.preprocessing.text.Tokenizer(oov_token='unk')
t.fit_on_texts([
    word
    for doc in train_data['text']
    for sent in sent_tokenize(doc)
    for word in word_tokenize(sent)
])

vocab_size = len(t.word_index) + 1

vocab_size, t.word_index['unk']

(27432, 1)

In [4]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word, embeddings_index['unk'])
    embedding_matrix[i] = embedding_vector
        
print(embedding_matrix[72])

Found 400000 word vectors.
[ 3.02399993e-02  4.46060002e-01  4.31659997e-01 -3.75279993e-01
  2.90679991e-01  2.30320007e-01  1.81250006e-01  4.02009994e-01
  1.35179996e-01 -1.95620000e-01  3.06389987e-01 -1.32390007e-01
  6.78969979e-01  4.22340006e-01  3.26370001e-01 -1.52810007e-01
  3.76980007e-01 -2.33030006e-01 -3.38169992e-01  3.05880010e-01
  4.49180007e-01 -8.36239994e-01  5.91459990e-01  2.49579996e-01
  3.99859995e-01 -5.01720011e-01 -2.35440001e-01 -1.46960005e-01
 -3.51440012e-01 -5.68520010e-01  8.95399973e-02  8.26120019e-01
 -2.65859991e-01  3.90300006e-01 -3.68489996e-02  4.82569993e-01
  7.16639996e-01  1.10040002e-01 -5.93540013e-01 -3.32159996e-01
 -2.57360011e-01 -3.45310003e-01 -2.63260007e-02 -2.37470001e-01
  1.96559995e-04 -2.74800003e-01  3.85120004e-01 -3.95810008e-01
  1.14040002e-01 -2.51740009e-01 -3.24699998e-01  8.96079987e-02
  2.49290004e-01  1.51269996e+00 -1.97620004e-01 -2.85089993e+00
 -5.38330019e-01 -4.71109986e-01  1.78590000e+00  7.81260014e-0

In [5]:
len(set(t.word_index.keys() - embeddings_index.keys()))

2858

In [6]:
example_document = """Dear abby,

I'm writing to tell you you suck. Help me out of this mess.

Bye"""

In [7]:
def tokenize(text, tok=None):
    return [
        [
            [
                token[0] if token else t.word_index['unk']
                for token in tok.texts_to_sequences(nltk.word_tokenize(sent))
            ]
            if tok else
            nltk.word_tokenize(sent)
            for sent in nltk.sent_tokenize(para)
        ]
        for para in text.splitlines()
        if len(para) > 0
    ]

print(tokenize(example_document))

[[['Dear', 'abby', ',']], [['I', "'m", 'writing', 'to', 'tell', 'you', 'you', 'suck', '.'], ['Help', 'me', 'out', 'of', 'this', 'mess', '.']], [['Bye']]]


In [8]:
print(tokenize(example_document, t))

[[[536, 11020, 1]], [[5, 108, 1026, 3, 223, 10, 10, 4456, 1], [156, 42, 49, 7, 19, 3063, 1]], [[5009]]]


In [9]:
train_data['tokenized'] = train_data['text'].map(lambda text: tokenize(text, t))
test_data['tokenized'] = test_data['text'].map(lambda text: tokenize(text, t))

train_data.head()

Unnamed: 0,text,label,tokenized
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[[430, 1]], [[21, 14, 8, 6, 64, 89, 3, 835, 4..."
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[[40, 365, 1, 507, 1920, 8025, 3383, 2, 8026,..."
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[[1851, 517, 1], [11497, 15395, 220, 47, 250,..."
3,Responding to separate emails from Uzra + Jeff...,1,"[[[3546, 3, 1397, 2668, 38, 11501, 1, 559, 1]]..."
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[[521, 38, 828, 11, 8, 1398, 4, 11, 5692, 1],..."


In [10]:
MAX_DOC_LENGTH = 0
MAX_PARA_LENGTH = 0
MAX_SENT_LENGTH = 0

for doc in train_data['tokenized'].append(test_data['tokenized']):
    MAX_DOC_LENGTH = max(MAX_DOC_LENGTH, len(doc))
    for para in doc:
        MAX_PARA_LENGTH = max(MAX_PARA_LENGTH, len(para))
        for sent in para:
            MAX_SENT_LENGTH = max(MAX_SENT_LENGTH, len(sent))
            
MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH

(12, 32, 255)

In [11]:
def pad_to_dense(M, doc_length=MAX_DOC_LENGTH, para_length=MAX_PARA_LENGTH, sent_length=MAX_SENT_LENGTH):
    Z = np.zeros((len(M), doc_length, para_length, sent_length))
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                sentnp = np.hstack(np.array(sent))
                Z[docidx, paraidx, sentidx, :len(sentnp)] += sentnp
    return Z

print(train_data['tokenized'][0])
padded_example = pad_to_dense(train_data['tokenized'][:1])
padded_example.shape

[[[430, 1]], [[21, 14, 8, 6, 64, 89, 3, 835, 4480, 3, 441, 7027, 2504, 57, 83, 15, 15389, 8, 799, 1]], [[28, 34, 1, 5, 33, 360, 14, 229, 75, 3, 404, 15, 799, 2071, 4, 5, 25, 48, 2, 2579, 302, 3, 270, 2, 4480, 1], [67, 25, 20, 2505, 3, 69, 12, 2138, 210, 1, 72, 25, 4204, 75, 3, 879, 2017, 12, 3222, 1, 1760, 1, 15390, 1, 294, 1, 4, 19, 2138, 120, 812, 25, 1761, 75, 167, 2, 205, 7, 2, 469, 15, 861, 396, 1, 3544, 1], [19, 399, 33, 157, 42, 250, 210, 41, 34, 7, 5191, 126, 6, 5192, 2418, 12, 6305, 15, 880, 1118, 1]], [[5, 85, 22, 2773, 15, 8022, 49, 3, 6305, 1, 2, 1850, 14, 48, 2, 66, 36, 11, 3, 20, 215, 1, 4, 14, 104, 22, 48, 1716, 1, 1918, 49, 59, 126, 14, 3088, 1, 22, 6, 15391, 15392, 1550, 1, 151, 14, 43, 341, 1802, 9, 2332, 12, 985, 41, 861, 1, 1], [27, 1, 28, 2, 2862, 21, 8, 89, 1, 3956, 4, 5, 33, 58, 3, 79, 49, 986, 8, 182, 3, 1919, 8, 11496, 676, 15, 880, 1803, 1]], [[31, 8023, 2665, 11, 968, 12, 880, 515, 29, 8024, 15, 2, 5688, 677, 1], [638, 281, 14, 5689, 1, 9, 397, 11, 341, 4, 39

(1, 12, 32, 255)

In [12]:
def mask_for(M, doc_length=MAX_DOC_LENGTH, para_length=MAX_PARA_LENGTH, sent_length=MAX_SENT_LENGTH):
    Z = np.zeros((len(M), doc_length, para_length, sent_length), dtype=bool)
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                for tokenidx, token in enumerate(sent):
                    Z[docidx, paraidx, sentidx, tokenidx] = True
    return Z

print(train_data['tokenized'][0])
masked_example = mask_for(train_data['tokenized'][:1])
masked_example.shape

[[[430, 1]], [[21, 14, 8, 6, 64, 89, 3, 835, 4480, 3, 441, 7027, 2504, 57, 83, 15, 15389, 8, 799, 1]], [[28, 34, 1, 5, 33, 360, 14, 229, 75, 3, 404, 15, 799, 2071, 4, 5, 25, 48, 2, 2579, 302, 3, 270, 2, 4480, 1], [67, 25, 20, 2505, 3, 69, 12, 2138, 210, 1, 72, 25, 4204, 75, 3, 879, 2017, 12, 3222, 1, 1760, 1, 15390, 1, 294, 1, 4, 19, 2138, 120, 812, 25, 1761, 75, 167, 2, 205, 7, 2, 469, 15, 861, 396, 1, 3544, 1], [19, 399, 33, 157, 42, 250, 210, 41, 34, 7, 5191, 126, 6, 5192, 2418, 12, 6305, 15, 880, 1118, 1]], [[5, 85, 22, 2773, 15, 8022, 49, 3, 6305, 1, 2, 1850, 14, 48, 2, 66, 36, 11, 3, 20, 215, 1, 4, 14, 104, 22, 48, 1716, 1, 1918, 49, 59, 126, 14, 3088, 1, 22, 6, 15391, 15392, 1550, 1, 151, 14, 43, 341, 1802, 9, 2332, 12, 985, 41, 861, 1, 1], [27, 1, 28, 2, 2862, 21, 8, 89, 1, 3956, 4, 5, 33, 58, 3, 79, 49, 986, 8, 182, 3, 1919, 8, 11496, 676, 15, 880, 1803, 1]], [[31, 8023, 2665, 11, 968, 12, 880, 515, 29, 8024, 15, 2, 5688, 677, 1], [638, 281, 14, 5689, 1, 9, 397, 11, 341, 4, 39

(1, 12, 32, 255)

In [13]:
embedding = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True)

embedded_example = embedding(padded_example)

embedded_example.shape

TensorShape([1, 12, 32, 255, 100])

In [14]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        scores = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights

In [15]:
# models

class SequenceEncoder(tf.keras.Model):
    def __init__(self, lstm_units, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, recurrent_dropout=0.001)
        
    def call(self, inputs, mask, training=False):
        return self.lstm(
            inputs=inputs,
            mask=mask,
            training=training)


class AttentiveSequenceEncoder(tf.keras.Model):
    def __init__(self, lstm_units, attention_units, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs, mask, training=False):
        sequence_encoded, state_h, state_c = self.lstm(
            inputs=inputs,
            mask=mask,
            training=training)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights
    

class DocumentEmbeddingClassifier(tf.keras.Model):
    def __init__(self, hidden_units, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hidden =  tf.keras.layers.Dense(hidden_units)
        self.predictor = tf.keras.layers.Dense(3)
        
    def call(self, inputs, training=False):
        x = self.hidden(inputs)
        x = self.predictor(x)
        
        if training:
            return x
        else:
            return tf.math.argmax(x)

In [16]:
BATCH_SIZE = 2

train_tensor = pad_to_dense(train_data['tokenized'])
test_tensor = pad_to_dense(test_data['tokenized'])
train_masks = mask_for(train_data['tokenized'])
test_masks = mask_for(test_data['tokenized'])

train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_tensor, train_masks, train_data['label'].to_numpy()))
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_tensor, test_masks, test_data['label'].to_numpy()))

train_dataset = train_dataset.shuffle(1000).batch(BATCH_SIZE)
test_dataset = test_dataset.shuffle(1000).batch(BATCH_SIZE)

[item.shape for item in next(iter(train_dataset))]

[TensorShape([2, 12, 32, 255]),
 TensorShape([2, 12, 32, 255]),
 TensorShape([2])]

In [17]:
sent_encoder = SequenceEncoder(100)
para_encoder = SequenceEncoder(100)
doc_encoder = SequenceEncoder(100)
classifier = DocumentEmbeddingClassifier(100)


@tf.function
def mask_reduce(mask_arr):
    return any(mask_arr)

def predict_doc_model(
        docs, masks,
        embedder, sent_encoder, para_encoder, doc_encoder, classifier,
        training=False):
    
    embedded = embedder(docs)
    
    sents = array_ops.reshape(
        embedded,
        (BATCH_SIZE * MAX_DOC_LENGTH * MAX_PARA_LENGTH, MAX_SENT_LENGTH, -1))
    sent_masks = array_ops.reshape(
        masks,
        (BATCH_SIZE * MAX_DOC_LENGTH * MAX_PARA_LENGTH, MAX_SENT_LENGTH))

    sent_encoded = sent_encoder(sents, sent_masks)
    sent_mask_reduced = tf.map_fn(mask_reduce, sent_masks)

    paras = array_ops.reshape(
        sent_encoded,
        (BATCH_SIZE * MAX_DOC_LENGTH, MAX_PARA_LENGTH, -1))
    para_masks = array_ops.reshape(
        sent_mask_reduced,
        (BATCH_SIZE * MAX_DOC_LENGTH, MAX_PARA_LENGTH))
            
    para_encoded = para_encoder(paras, para_masks)
    para_mask_reduced = tf.map_fn(mask_reduce, para_masks)
    
    docs = array_ops.reshape(
        para_encoded,
        (BATCH_SIZE, MAX_DOC_LENGTH, -1))
    docs_masks = array_ops.reshape(
        para_mask_reduced,
        (BATCH_SIZE, MAX_DOC_LENGTH))
    doc_encoded = doc_encoder(docs, docs_masks)
    
    return classifier(doc_encoded, training=training)


predict_doc_model(train_tensor[:2], train_masks[:2], embedding, sent_encoder, para_encoder, doc_encoder, classifier)

OperatorNotAllowedInGraphError: in converted code:

    <ipython-input-17-01d2d447258e>:9 mask_reduce  *
        return any(mask_arr)
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py:396 converted_call
        return py_builtins.overload_of(f)(*args)
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:547 __iter__
        self._disallow_iteration()
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:540 _disallow_iteration
        self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:518 _disallow_when_autograph_enabled
        " decorating it directly with @tf.function.".format(task))

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did not convert this function. Try decorating it directly with @tf.function.


In [None]:
def train_fn_for(predict_fn, embedder, sent_encoder, para_encoder, doc_encoder, classifier):
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    @tf.function
    def train_step(docs, masks, targets):
        loss = 0

        with tf.GradientTape() as tape:
            predictions = predict_fn(docs, masks, embedder, sent_encoder, para_encoder, doc_encoder, classifier)
            
            loss += loss_object(targets, predictions)

        variables = sent_encoder.trainable_variables +\
                        para_encoder.trainable_variables +\
                        doc_encoder.trainable_variables +\
                        classifier.trainable_variables
        gradients = tape.gradient(loss, variables)
    
        optimizer.apply_gradients(zip(gradients, variables))

        return loss
    
    return train_step
    
doc_model_train_step = train_fn_for(
    predict_doc_model,
    embedding,
    sent_encoder, para_encoder, doc_encoder,
    classifier)

EPOCHS = 1
for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    
    for (batch, (inp, mask, target)) in enumerate(train_dataset):
        total_loss += doc_model_train_step(inp, mask, target)
        
        #if batch % 2 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     total_loss.numpy()))
        
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))