In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [2]:
train_data = load_pandas(TrainOrTest.TRAIN)
test_data = load_pandas(TrainOrTest.TEST)

train_data['sents'] = train_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])
test_data['sents'] = test_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])

train_data['label_normed'] = pd.to_numeric(train_data['label']) / 5.0
test_data['label_normed'] = pd.to_numeric(test_data['label']) / 5.0

train_data.head()

Unnamed: 0,text,label,sents,label_normed
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[Cheryl, :, Are, we, in, a, good, place, to, ...",0.4
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[Our, friend, ,, General, Joe, Ballard, owns,...",0.4
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[Outstanding, news, !], [Miki, Rakic, called,...",0.6
3,Responding to separate emails from Uzra + Jeff...,1,"[[Responding, to, separate, emails, from, Uzra...",0.2
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[Guy, from, Mexico, is, in, NY, and, is, coop...",0.2


In [3]:
t = tf.keras.preprocessing.text.Tokenizer()
t.fit_on_texts([
    word
    for doc in train_data['sents']
    for sent in doc
    for word in sent
])

vocab_size = len(t.word_index) + 1

# integer encode the documents
def encode(texts):
    return [
        t.texts_to_sequences(doc) for doc in texts
    ]

encoded_train = encode(train_data['sents'])
encoded_test = encode(test_data['sents'])

max_sent_length = max(
    max(len(sent) for txt in encoded_train for sent in txt),
    max(len(sent) for txt in encoded_test for sent in txt)
)
max_doc_length = max(
    max(len(txt) for txt in encoded_train),
    max(len(txt) for txt in encoded_test)
)

vocab_size, max_sent_length, max_doc_length

(27431, 239, 32)

In [4]:
def pad_to_dense(M, sent_len, doc_len):
    maxlen = max(len(r) for r in M)

    Z = np.zeros((len(M), doc_len, sent_len))
    for docidx, doc in enumerate(M):
        for sentidx, sent in enumerate(doc):
            Z[docidx, sentidx, :len(sent)] += np.array(sent)
    return Z

train_tensor = pad_to_dense(encoded_train, max_sent_length, max_doc_length)
test_tensor = pad_to_dense(encoded_test, max_sent_length, max_doc_length)

train_tensor.shape, test_tensor.shape

((4000, 32, 239), (800, 32, 239))

# problem, why does it matter - metric

# previous work

# my approach

# outcome - good and bad

# what would I do if i have more time

audience is instructors for report, but other students for presentation

In [19]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print(embedding_matrix[0])

Found 400000 word vectors.
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [13]:
BATCH_SIZE = 2  # keep it small for GPU ram porpoises

embedding = tf.keras.layers.Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False,
                            input_shape=(max_doc_length, max_sent_length),
                           mask_zero=True)

embedding(train_tensor[:BATCH_SIZE]).shape

TensorShape([2, 32, 239, 100])

In [14]:
example_document = [
    word_tokenize(sent)
    for sent in sent_tokenize("\n".join([
        "Dear abby,"
        "I'm writing to tell you you suck."
        "Help me out of this mess.",
        "Bye"
    ]))
]

encoded_example = pad_to_dense(encode(example_document), max_sent_length, max_doc_length)
print(encoded_example)
encoded_example.shape

[[[  535.     0.     0. ...     0.     0.     0.]
  [11019.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]

 [[ 5008.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]]


(2, 32, 239)

In [15]:
embedded_example = embedding(encoded_example)
embedded_example.shape

TensorShape([2, 32, 239, 100])

In [16]:
test_flatten = lambda t: tf.reshape(t, (BATCH_SIZE * max_doc_length, max_sent_length, -1))
test_lstm = tf.keras.layers.LSTM(1024)
test_reshape = lambda t: tf.reshape(t, (BATCH_SIZE, max_doc_length, 1024))

print(embedded_example.shape)
print(test_flatten(embedded_example).shape)
print(test_lstm(test_flatten(embedded_example)).shape)
print(test_reshape(test_lstm(test_flatten(embedded_example))).shape)

(2, 32, 239, 100)
(64, 239, 100)
(64, 1024)
(2, 32, 1024)


In [17]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        scores = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs):
        sequence_encoded, state_h, state_c = self.lstm(inputs)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights

In [18]:
test_flatten = lambda t: tf.reshape(t, (2 * max_doc_length, max_sent_length, -1))
test_lstm = AttentiveSequenceEncoder(1024, 128)
test_reshape = lambda t: tf.reshape(t, (2, max_doc_length, 1024))

print(embedded_example.shape)
print(test_flatten(embedded_example).shape)
print(test_lstm(test_flatten(embedded_example))[0].shape)
print(test_reshape(test_lstm(test_flatten(embedded_example))[0]).shape)

(2, 32, 239, 100)
(64, 239, 100)
(64, 1024)
(2, 32, 1024)


In [20]:
class DocumentClassifierModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, lstm_units,
                 batch_size, max_doc_length, max_sent_length):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            len(embedding_matrix[0]),
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(max_doc_length, max_sent_length),
            mask_zero=True)
        self.sentence_encoder = tf.keras.layers.LSTM(lstm_units)
        self.document_encoder = tf.keras.layers.LSTM(lstm_units)
        self.dense1 = tf.keras.layers.Dense(1024, activation=tf.nn.tanh)
        self.dense2 = tf.keras.layers.Dense(1, activation=tf.nn.softmax)
        
        self.batch_size = batch_size
        self.max_doc_length = max_doc_length
        self.max_sent_length = max_sent_length
    
    def flatten_docs(self, t):
        return tf.reshape(t, (self.batch_size * self.max_doc_length, self.max_sent_length, -1))
    
    def inflate_docs(self, t):
        return tf.reshape(t, (self.batch_size, self.max_doc_length, -1))
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.flatten_docs(x)
        x = self.sentence_encoder(x)
        x = self.inflate_docs(x)
        x = self.document_encoder(x)
        x = self.dense1(x)
        
        return self.dense2(x)

In [21]:
class AttentiveDocumentClassifierModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, lstm_units, attention_units,
                 batch_size, max_doc_length, max_sent_length):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            len(embedding_matrix[0]),
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(max_doc_length, max_sent_length),
            mask_zero=True)
        self.sentence_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.document_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.dense1 = tf.keras.layers.Dense(1024, activation=tf.nn.tanh)
        self.dense2 = tf.keras.layers.Dense(1, activation=tf.nn.softmax)
        
        self.batch_size = batch_size
        self.max_doc_length = max_doc_length
        self.max_sent_length = max_sent_length
    
    def flatten_docs(self, t):
        return tf.reshape(t, (self.batch_size * self.max_doc_length, self.max_sent_length, -1))
    
    def inflate_docs(self, t):
        return tf.reshape(t, (self.batch_size, self.max_doc_length, -1))
        
    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.flatten_docs(x)
        x, _ = self.sentence_encoder(x)
        x = self.inflate_docs(x)
        x, _ = self.document_encoder(x)
        x = self.dense1(x)
        
        return self.dense2(x)

In [22]:
doc_model = DocumentClassifierModel(vocab_size, embedding_matrix, 1024,
                                    BATCH_SIZE, max_doc_length, max_sent_length)
attentive_doc_model = AttentiveDocumentClassifierModel(vocab_size, embedding_matrix, 1024, 32,
                                                       BATCH_SIZE, max_doc_length, max_sent_length)

doc_model(encoded_example)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=35282, shape=(2, 1), dtype=float32, numpy=
array([[1.],
       [1.]], dtype=float32)>

In [23]:
attentive_doc_model(encoded_example)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=43456, shape=(2, 1), dtype=float32, numpy=
array([[1.],
       [1.]], dtype=float32)>

In [24]:
doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.BinaryCrossentropy())
attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.BinaryCrossentropy())

In [None]:
doc_model.fit(
    train_tensor, train_data['label_normed'].to_numpy(),
    batch_size=BATCH_SIZE,
    epochs=10)

Train on 4000 samples
Epoch 1/10
  72/4000 [..............................] - ETA: 4:04:00 - loss: 8.9870 - accuracy: 0.0000e+00