In [40]:
import tensorflow as tf
import pandas as pd
import numpy as np

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [44]:
train_data = load_pandas(TrainOrTest.TRAIN)
test_data = load_pandas(TrainOrTest.TEST)

train_data['sents'] = train_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])
test_data['sents'] = test_data['text'].map(lambda t: [
    word_tokenize(sent)
    for sent in sent_tokenize(t)
])

train_data.head()

Unnamed: 0,text,label,sents
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[Cheryl, :, Are, we, in, a, good, place, to, ..."
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[Our, friend, ,, General, Joe, Ballard, owns,..."
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[Outstanding, news, !], [Miki, Rakic, called,..."
3,Responding to separate emails from Uzra + Jeff...,1,"[[Responding, to, separate, emails, from, Uzra..."
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[Guy, from, Mexico, is, in, NY, and, is, coop..."


In [53]:
t = tf.keras.preprocessing.text.Tokenizer()
t.fit_on_texts([
    word
    for doc in train_data['sents']
    for sent in doc
    for word in sent
])

vocab_size = len(t.word_index) + 1

# integer encode the documents
def encode(texts):
    return [
        t.texts_to_sequences(doc) for doc in texts
    ]

encoded_train = encode(train_data['sents'])
encoded_test = encode(test_data['sents'])

max_sent_length = max(
    max(len(sent) for txt in encoded_train for sent in txt),
    max(len(sent) for txt in encoded_test for sent in txt)
)
max_doc_length = max(
    max(len(txt) for txt in encoded_train),
    max(len(txt) for txt in encoded_test)
)

max_sent_length, max_doc_length

(239, 32)

In [56]:
def pad_to_dense(M, sent_len, doc_len):
    maxlen = max(len(r) for r in M)

    Z = np.zeros((len(M), doc_len, sent_len))
    for docidx, doc in enumerate(M):
        for sentidx, sent in enumerate(doc):
            Z[docidx, sentidx, :len(sent)] += np.array(sent)
    return Z

train_tensor = pad_to_dense(encoded_train, max_sent_length, max_doc_length)
test_tensor = pad_to_dense(encoded_test, max_sent_length, max_doc_length)

train_tensor.shape, test_tensor.shape

((4000, 32, 239), (800, 32, 239))

# problem, why does it matter - metric

# previous work

# my approach

# outcome - good and bad

# what would I do if i have more time

audience is instructors for report, but other students for presentation

In [75]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs):
        print(inputs.shape)
        encoded, state_h, state_c = self.lstm(inputs)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights
    


embedding = tf.keras.layers.Embedding(vocab_size, 32, input_shape=(max_doc_length, max_sent_length))
sent_encoder = tf.keras.layers.TimeDistributed(
    tf.keras.layers.TimeDistributed(
        AttentiveSequenceEncoder(64, 256)))

doc_encoder = tf.keras.layers.TimeDistributed(AttentiveSequenceEncoder(64, 256))

In [68]:
example_document = [
    word_tokenize(sent)
    for sent in sent_tokenize("\n".join([
        "Dear abby,"
        "I'm writing to tell you you suck."
        "Help me out of this mess.",
        "Bye"
    ]))
]

encoded_example = pad_to_dense(encode(example_document), max_sent_length, max_doc_length)
print(encoded_example)
encoded_example.shape

[[[  535.     0.     0. ...     0.     0.     0.]
  [11020.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]

 [[ 5008.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  ...
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]
  [    0.     0.     0. ...     0.     0.     0.]]]


(2, 32, 239)

In [69]:
embedding(encoded_example).shape

TensorShape([Dimension(2), Dimension(32), Dimension(239), Dimension(32)])

In [76]:
sent_encoder(embedding(encoded_example)).shape

(64, 32)


ValueError: Input 0 of layer lstm_5 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [64, 32]