In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Concatenate
from tensorflow.python.ops import array_ops
from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, Source, TrainOrTest

In [None]:
# Load both splits through gcdc_data.load_pandas, TRAIN first and TEST second.
train_data, test_data = (
    load_pandas(split) for split in (TrainOrTest.TRAIN, TrainOrTest.TEST)
)

# Show which label values occur in the training split.
print(train_data['label'].unique())

train_data.head()

In [None]:
# Build the vocabulary from the training split only. Every NLTK word token is
# handed to the Keras tokenizer as its own "text"; words that are not in the
# vocabulary are later mapped to the 'unk' entry.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='unk')
t.fit_on_texts(
    word
    for doc in train_data['text']
    for sent in sent_tokenize(doc)
    for word in word_tokenize(sent)
)

# One extra slot on top of the entries in word_index.
vocab_size = 1 + len(t.word_index)

vocab_size, t.word_index['unk']

In [None]:
EMBEDDING_DIM = 300

# Parse the pre-trained GloVe file: every line is "<word> <v1> ... <vN>".
embeddings_index = {}
glove_path = os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt')
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found {} word vectors.'.format(len(embeddings_index)))

# Row i of the matrix is the vector for the word whose tokenizer index is i.
# Words missing from GloVe fall back to GloVe's own 'unk' vector; rows that no
# word_index entry points at keep their initial zeros.
unk_vector = embeddings_index['unk']
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_matrix[i] = embeddings_index.get(word, unk_vector)

print(embedding_matrix[72])

In [None]:
# Number of vocabulary words that have no GloVe vector. Subtracting two
# dict key views already produces a set, so no extra conversion is needed.
len(t.word_index.keys() - embeddings_index.keys())

In [None]:
# Small multi-paragraph sample (blank lines separate paragraphs) used below to
# eyeball what tokenize() returns.
example_document = """Dear abby,

I'm writing to tell you you suck. Help me out of this mess.

Bye"""

In [None]:
def tokenize(text, tok=None):
    """Split ``text`` into paragraphs, sentences and tokens.

    Paragraphs are the non-empty lines of ``text``; sentences and words come
    from NLTK.

    Args:
        text: raw document string.
        tok: optional object with a ``texts_to_sequences`` method. When truthy,
            each sentence's word list is passed through it, so every word is
            replaced by whatever that method returns for it (for the Keras
            tokenizer: a list of ids per word). When falsy, the word strings
            are returned unchanged.

    Returns:
        A list of paragraphs, each a list of sentences, each a list of tokens.
    """
    def encode(words):
        return tok.texts_to_sequences(words) if tok else words

    paragraphs = []
    for para in text.splitlines():
        # Blank lines only separate paragraphs; they are not paragraphs.
        if len(para) == 0:
            continue
        paragraphs.append(
            [encode(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(para)]
        )
    return paragraphs


print(tokenize(example_document))

In [None]:
# Same sample document, this time encoded with the fitted tokenizer `t`.
print(tokenize(example_document, t))

In [None]:
# Store the nested paragraph/sentence/token encoding of every document in a
# new 'tokenized' column; `tok=t` is forwarded to tokenize() for each text.
train_data['tokenized'] = train_data['text'].apply(tokenize, tok=t)
test_data['tokenized'] = test_data['text'].apply(tokenize, tok=t)

train_data.head()

In [None]:
# Largest paragraph count per document, sentence count per paragraph and
# token count per sentence over BOTH splits; these fix the padded tensor shape.
MAX_DOC_LENGTH = 0
MAX_PARA_LENGTH = 0
MAX_SENT_LENGTH = 0

# Series.append was removed in pandas 2.0; pd.concat works on every version.
for doc in pd.concat([train_data['tokenized'], test_data['tokenized']]):
    MAX_DOC_LENGTH = max(MAX_DOC_LENGTH, len(doc))
    for para in doc:
        MAX_PARA_LENGTH = max(MAX_PARA_LENGTH, len(para))
        for sent in para:
            # A token may itself be a list of ids (zero, one or several per
            # word), so the number of ids that ends up in the padded tensor can
            # differ from len(sent). Size the axis for whichever is larger.
            id_count = sum(
                len(tok) if isinstance(tok, (list, tuple)) else 1 for tok in sent
            )
            MAX_SENT_LENGTH = max(MAX_SENT_LENGTH, len(sent), id_count)

MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH

In [None]:
def pad_to_dense(M, sent_len=MAX_SENT_LENGTH, para_len=MAX_PARA_LENGTH, doc_len=MAX_DOC_LENGTH):
    """Pack nested tokenized documents into one zero-padded 4-D array.

    Args:
        M: sequence of documents. Each document is a list of paragraphs, each
            paragraph a list of sentences, each sentence a list of tokens. A
            token is either a single id or a list/tuple of ids (possibly
            empty), as produced per word by ``texts_to_sequences``.
        sent_len: size of the last axis (ids per sentence). Sentences with
            more ids are truncated to this length.
        para_len: size of the sentence axis (sentences per paragraph).
        doc_len: size of the paragraph axis (paragraphs per document).

    Returns:
        float64 array of shape ``(len(M), doc_len, para_len, sent_len)`` with
        the ids of each sentence left-aligned and every unused slot set to 0.

    Raises:
        IndexError: if a document has more than ``doc_len`` paragraphs or a
            paragraph has more than ``para_len`` sentences.
    """
    Z = np.zeros((len(M), doc_len, para_len, sent_len))
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                # Flatten by hand: the per-word id lists are ragged, and
                # np.array() on ragged input raises on recent NumPy versions.
                ids = [
                    token_id
                    for token in sent
                    for token_id in (token if isinstance(token, (list, tuple)) else (token,))
                ][:sent_len]
                Z[docidx, paraidx, sentidx, :len(ids)] = ids
    return Z

print(train_data['tokenized'][0])
pad_to_dense(train_data['tokenized'][:1])

In [None]:
def dense_mask(M, sent_len=MAX_SENT_LENGTH, para_len=MAX_PARA_LENGTH, doc_len=MAX_DOC_LENGTH):
    """Build the boolean mask that marks real (non-padding) token slots.

    The mask covers exactly the slots that hold an id once a sentence is
    flattened: a token that is a list/tuple contributes ``len(token)`` slots
    (zero for an empty list), any other token contributes one slot.

    Args:
        M: nested documents -> paragraphs -> sentences -> tokens, in the same
            layout accepted by ``pad_to_dense``.
        sent_len: size of the last axis; longer sentences are clipped to it.
        para_len: size of the sentence axis.
        doc_len: size of the paragraph axis.

    Returns:
        bool array of shape ``(len(M), doc_len, para_len, sent_len)`` that is
        True on the leading slots of each sentence that carry an id.

    Raises:
        IndexError: if a document has more than ``doc_len`` paragraphs or a
            paragraph has more than ``para_len`` sentences.
    """
    Z = np.zeros((len(M), doc_len, para_len, sent_len), dtype=bool)
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                # Count ids, not tokens: words the tokenizer drops occupy no
                # slot in the padded tensor and must not be marked as real.
                id_count = sum(
                    len(token) if isinstance(token, (list, tuple)) else 1
                    for token in sent
                )
                # Slicing clips at sent_len, so over-long sentences are safe.
                Z[docidx, paraidx, sentidx, :id_count] = True
    return Z

dense_mask(train_data['tokenized'][:1])

In [None]:
# Dense, zero-padded id tensors for both splits (train first, then test).
train_tensor, test_tensor = (
    pad_to_dense(frame['tokenized']) for frame in (train_data, test_data)
)

train_tensor.shape, test_tensor.shape

In [None]:
def categorical_labels(labels):
    """One-hot encode class labels 1, 2 and 3.

    Args:
        labels: iterable of values convertible with ``int()``; each must be
            1, 2 or 3 after conversion.

    Returns:
        float64 array of shape ``(len(labels), 3)``; row ``k`` has a 1.0 in
        column ``label - 1``. Empty input yields shape ``(0, 3)``.

    Raises:
        IndexError: if any label falls outside 1..3. Labels of 0 or below are
            rejected too instead of silently wrapping round to the last
            classes through negative indexing.
    """
    indices = np.array([int(item) - 1 for item in labels], dtype=int)

    out_of_range = (indices < 0) | (indices > 2)
    if out_of_range.any():
        bad = sorted({int(index) + 1 for index in indices[out_of_range]})
        raise IndexError('labels must be 1, 2 or 3; got {}'.format(bad))

    # Row lookup in the identity matrix turns each index into its one-hot row.
    return np.eye(3)[indices]

# One-hot label matrices for both splits; the 'label' column is converted to
# numbers first (train first, then test).
train_labels, test_labels = (
    categorical_labels(pd.to_numeric(frame['label']))
    for frame in (train_data, test_data)
)

train_labels.shape, test_labels.shape

In [None]:
# Token-level masks: True where a sentence slot holds a real token.
train_sent_mask = dense_mask(train_data['tokenized'])
test_sent_mask = dense_mask(test_data['tokenized'])

# A sentence slot is real when it contains at least one real token. The
# vectorised ndarray.any replaces a per-slice Python call and gives the same
# boolean array.
train_para_mask = train_sent_mask.any(axis=3)
test_para_mask = test_sent_mask.any(axis=3)

# A paragraph slot is real when it contains at least one real sentence.
train_doc_mask = train_para_mask.any(axis=2)
test_doc_mask = test_para_mask.any(axis=2)

(train_sent_mask.shape, train_para_mask.shape, train_doc_mask.shape), (test_sent_mask.shape, test_para_mask.shape, test_doc_mask.shape)

In [None]:
BATCH_SIZE = 2

# Each element is ((ids, sentence mask, paragraph mask, document mask), label).
# Both pipelines shuffle with a 1000-element buffer and then batch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(
        ((train_tensor, train_sent_mask, train_para_mask, train_doc_mask), train_labels))
    .shuffle(1000)
    .batch(BATCH_SIZE)
)
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(
        ((test_tensor, test_sent_mask, test_para_mask, test_doc_mask), test_labels))
    .shuffle(1000)
    .batch(BATCH_SIZE)
)

# Pull one batch to check the shapes of all five tensors.
(a, b, c, d), e = next(iter(train_dataset))
a.shape, b.shape, c.shape, d.shape, e.shape

In [None]:
# Stand-alone frozen embedding layer initialised from the GloVe matrix, used
# here only to inspect what an embedded batch looks like.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
)

embedded_example = embedding(train_tensor[:2])

print(embedded_example.shape)
# Vector of the first token of the first sentence of the first document.
embedded_example[0][0][0][0]

In [None]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    """Additive attention pooling over the time axis.

    Scores every timestep with ``V(tanh(W(values)))``, normalises the scores
    with a softmax over axis 1 and returns the weighted sum of ``values``.
    """

    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, values, mask=None):
        """Pool ``values`` along axis 1.

        Args:
            values: tensor indexed ``(batch, time, features)``.
            mask: optional ``(batch, time)`` tensor, truthy on real timesteps.
                Masked timesteps are pushed out of the softmax so they get
                (numerically) zero weight. A row with no real timestep falls
                back to uniform weights instead of producing NaNs.

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            ``(batch, features)`` and ``(batch, time, 1)``.
        """
        # (batch, time, 1)
        scores = self.V(tf.nn.tanh(self.W(values)))

        if mask is not None:
            keep = tf.cast(mask, scores.dtype)[..., tf.newaxis]
            # Large finite penalty rather than -inf, so a fully masked row
            # stays finite. NOTE(review): -1e9 assumes float32 scores.
            scores += (1.0 - keep) * -1e9

        # (batch, time, 1), sums to 1 along the time axis
        attention_weights = tf.nn.softmax(scores, axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

    def compute_mask(self, inputs, mask=None):
        # The time axis is reduced away, so no mask applies to the outputs.
        return None


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    """Bidirectional LSTM followed by attention pooling over its outputs."""

    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        # NOTE(review): the tiny recurrent_dropout looks like a trick to keep
        # Keras off the cuDNN kernel for masked input; confirm before removing.
        self.lstm = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001, return_sequences=True))
        # Not used in call(); kept so existing attribute access keeps working.
        self.concat = Concatenate()
        self.attention = BahdanauAttentionLayer(attention_units)

    def call(self, inputs, mask=None, training=None):
        """Encode a batch of sequences into one vector each.

        Args:
            inputs: tensor indexed ``(batch, time, features)``.
            mask: optional ``(batch, time)`` tensor, truthy on real timesteps.
                It is applied both to the LSTM and to the attention softmax.
            training: forwarded to the LSTM (controls its recurrent dropout).

        Returns:
            ``(output, attention_weights)`` as returned by the attention layer.
        """
        encoded = self.lstm(inputs, mask=mask, training=training)
        output, attention_weights = self.attention(encoded, mask=mask)

        return output, attention_weights

    def compute_mask(self, inputs, mask=None):
        # Outputs are pooled vectors; they carry no time-step mask.
        return None

In [None]:
# NOTE: BATCH_SIZE is also assigned in the dataset cell earlier in the
# notebook; the two values must stay in sync.
BATCH_SIZE = 2  # keep it low
EPOCHS = 1  # quick smoke run; the trailing "#00" in the original hints at 100 for a full run

In [None]:
class DocModel(tf.keras.Model):
    """Three-level hierarchical BiLSTM document classifier.

    Token ids are embedded with the frozen ``embedding_matrix``. One BiLSTM
    encodes every sentence from its tokens, a second encodes every paragraph
    from its sentence vectors and a third encodes the document from its
    paragraph vectors. A tanh hidden layer, dropout and a 3-way softmax
    produce the class probabilities.

    Args:
        lstm_units: units per direction in each BiLSTM.
        hidden_units: width of the tanh hidden layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.
    """

    def __init__(self, lstm_units, hidden_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        # NOTE(review): the tiny recurrent_dropout looks like a trick to keep
        # Keras off the cuDNN kernel for masked input; confirm before removing.
        self.sent_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))
        self.para_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))
        self.doc_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)`` with
                shapes ``(B, D, P, S)``, ``(B, D, P, S)``, ``(B, D, P)`` and
                ``(B, D)``, where D/P/S are MAX_DOC_LENGTH / MAX_PARA_LENGTH /
                MAX_SENT_LENGTH.
            training: enables dropout when true.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        # Fold batch, paragraph and sentence axes together so every sentence
        # is one sequence. -1 lets the batch size vary (e.g. a short final
        # batch) instead of relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, MAX_SENT_LENGTH, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, MAX_SENT_LENGTH))

        sent_embedded = self.sent_encoder(embedded, mask=sent_mask)
        sent_embedded = tf.reshape(
            sent_embedded, (-1, MAX_PARA_LENGTH, sent_embedded.shape[-1]))
        para_mask = tf.reshape(para_mask, (-1, MAX_PARA_LENGTH))

        para_embedded = self.para_encoder(sent_embedded, mask=para_mask)
        para_embedded = tf.reshape(
            para_embedded, (-1, MAX_DOC_LENGTH, para_embedded.shape[-1]))

        x = self.doc_encoder(para_embedded, mask=doc_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        return self.classifier(x)

doc_model = DocModel(150, 100, 0.5)
doc_model.compile(optimizer='adam', metrics=['accuracy'],
                  loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
doc_model(next(iter(train_dataset))[0])

doc_model.summary()

In [None]:
# Train the three-level BiLSTM model; test_dataset is scored after every epoch.
doc_model_hist = doc_model.fit(train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for doc_model.
doc_model_hist.history

In [None]:
class AttentiveDocModel(tf.keras.Model):
    """Three-level hierarchical classifier with attention pooling per level.

    Same layout as the plain three-level model (tokens -> sentences ->
    paragraphs -> document), but every level is an AttentiveSequenceEncoder,
    which returns a pooled vector together with its attention weights.

    Args:
        lstm_units: units per direction in each encoder's BiLSTM.
        hidden_units: width of the tanh hidden layer.
        attention_units: width of each attention scoring layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.

    After a call with ``training`` false, the attention weights of the last
    batch are available as ``sent_weights``, ``para_weights`` and
    ``doc_weights``.
    """

    def __init__(self, lstm_units, hidden_units, attention_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        self.sent_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.para_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.doc_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)`` with
                shapes ``(B, D, P, S)``, ``(B, D, P, S)``, ``(B, D, P)`` and
                ``(B, D)``, where D/P/S are MAX_DOC_LENGTH / MAX_PARA_LENGTH /
                MAX_SENT_LENGTH.
            training: enables dropout when true; when false the attention
                weights are stored on the model.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        # -1 lets the batch size vary (e.g. a short final batch) instead of
        # relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, MAX_SENT_LENGTH, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, MAX_SENT_LENGTH))

        sent_embedded, sent_weights = self.sent_encoder(embedded, mask=sent_mask)
        sent_embedded = tf.reshape(
            sent_embedded, (-1, MAX_PARA_LENGTH, sent_embedded.shape[-1]))
        para_mask = tf.reshape(para_mask, (-1, MAX_PARA_LENGTH))

        para_embedded, para_weights = self.para_encoder(sent_embedded, mask=para_mask)
        para_embedded = tf.reshape(
            para_embedded, (-1, MAX_DOC_LENGTH, para_embedded.shape[-1]))

        x, doc_weights = self.doc_encoder(para_embedded, mask=doc_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        if not training:
            # Keep the weights of the most recent inference batch for inspection.
            self.sent_weights = sent_weights
            self.para_weights = para_weights
            self.doc_weights = doc_weights

        return self.classifier(x)


attentive_doc_model = AttentiveDocModel(150, 100, 50, 0.5)
attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'],
                            loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
attentive_doc_model(next(iter(train_dataset))[0])

attentive_doc_model.summary()

In [None]:
# Train the three-level attentive model; test_dataset is scored after every epoch.
attentive_doc_model_hist = attentive_doc_model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for attentive_doc_model.
attentive_doc_model_hist.history

In [None]:
class SmallAttentiveDocModel(tf.keras.Model):
    """Two-level attentive classifier: tokens -> sentences -> document.

    The paragraph level is skipped: all sentence slots of a document are laid
    out as one sequence of ``MAX_DOC_LENGTH * MAX_PARA_LENGTH`` sentence
    vectors, which the document encoder pools directly.

    Args:
        lstm_units: units per direction in each encoder's BiLSTM.
        hidden_units: width of the tanh hidden layer.
        attention_units: width of each attention scoring layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.

    After a call with ``training`` false, the attention weights of the last
    batch are available as ``sent_weights`` and ``doc_weights``.
    """

    def __init__(self, lstm_units, hidden_units, attention_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        self.sent_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.doc_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)`` with
                shapes ``(B, D, P, S)``, ``(B, D, P, S)``, ``(B, D, P)`` and
                ``(B, D)``. The incoming ``doc_mask`` is not used; the
                document-level mask is rebuilt from ``para_mask``.
            training: enables dropout when true; when false the attention
                weights are stored on the model.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        # -1 lets the batch size vary (e.g. a short final batch) instead of
        # relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, MAX_SENT_LENGTH, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, MAX_SENT_LENGTH))

        sent_embedded, sent_weights = self.sent_encoder(embedded, mask=sent_mask)
        sent_embedded = tf.reshape(
            sent_embedded,
            (-1, MAX_DOC_LENGTH * MAX_PARA_LENGTH, sent_embedded.shape[-1]))
        # One mask entry per sentence slot of the flattened document.
        doc_mask = tf.reshape(para_mask, (-1, MAX_DOC_LENGTH * MAX_PARA_LENGTH))

        x, doc_weights = self.doc_encoder(sent_embedded, mask=doc_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        if not training:
            # Keep the weights of the most recent inference batch for inspection.
            self.sent_weights = sent_weights
            self.doc_weights = doc_weights

        return self.classifier(x)


small_att_doc_model = SmallAttentiveDocModel(150, 100, 50, 0.5)
small_att_doc_model.compile(optimizer='adam', metrics=['accuracy'],
                            loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
small_att_doc_model(next(iter(train_dataset))[0])

small_att_doc_model.summary()

In [None]:
# Train the two-level attentive model; test_dataset is scored after every epoch.
small_att_doc_model_hist = small_att_doc_model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for small_att_doc_model.
small_att_doc_model_hist.history

In [None]:
class SmallDocModel(tf.keras.Model):
    """Two-level BiLSTM classifier: tokens -> sentences -> document.

    The paragraph level is skipped: all sentence slots of a document are laid
    out as one sequence of ``MAX_DOC_LENGTH * MAX_PARA_LENGTH`` sentence
    vectors, which the document BiLSTM encodes directly.

    Args:
        lstm_units: units per direction in each BiLSTM.
        hidden_units: width of the tanh hidden layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.
    """

    def __init__(self, lstm_units, hidden_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        # NOTE(review): the tiny recurrent_dropout looks like a trick to keep
        # Keras off the cuDNN kernel for masked input; confirm before removing.
        self.sent_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))
        self.doc_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)`` with
                shapes ``(B, D, P, S)``, ``(B, D, P, S)``, ``(B, D, P)`` and
                ``(B, D)``. The incoming ``doc_mask`` is not used; the
                document-level mask is rebuilt from ``para_mask``.
            training: enables dropout when true.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        # -1 lets the batch size vary (e.g. a short final batch) instead of
        # relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, MAX_SENT_LENGTH, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, MAX_SENT_LENGTH))

        sent_embedded = self.sent_encoder(embedded, mask=sent_mask)
        sent_embedded = tf.reshape(
            sent_embedded,
            (-1, MAX_DOC_LENGTH * MAX_PARA_LENGTH, sent_embedded.shape[-1]))
        # One mask entry per sentence slot of the flattened document.
        doc_mask = tf.reshape(para_mask, (-1, MAX_DOC_LENGTH * MAX_PARA_LENGTH))

        x = self.doc_encoder(sent_embedded, mask=doc_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        return self.classifier(x)

small_doc_model = SmallDocModel(150, 100, 0.5)
small_doc_model.compile(optimizer='adam', metrics=['accuracy'],
                        loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
small_doc_model(next(iter(train_dataset))[0])

small_doc_model.summary()

In [None]:
# Train the two-level BiLSTM model; test_dataset is scored after every epoch.
small_doc_model_hist = small_doc_model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for small_doc_model.
small_doc_model_hist.history

In [None]:
class TinyAttentiveDocModel(tf.keras.Model):
    """Flat attentive classifier: one encoder over all token slots.

    The whole document is laid out as a single sequence of
    ``MAX_DOC_LENGTH * MAX_PARA_LENGTH * MAX_SENT_LENGTH`` token slots and
    pooled by one AttentiveSequenceEncoder.

    Args:
        lstm_units: units per direction in the encoder's BiLSTM.
        hidden_units: width of the tanh hidden layer.
        attention_units: width of the attention scoring layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.

    After a call with ``training`` false, the attention weights of the last
    batch are available as ``doc_weights``.
    """

    def __init__(self, lstm_units, hidden_units, attention_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        self.doc_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)``; only
                ``ids`` and ``sent_mask`` (both ``(B, D, P, S)``) are used.
            training: enables dropout when true; when false the attention
                weights are stored on the model.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        flat_len = MAX_DOC_LENGTH * MAX_PARA_LENGTH * MAX_SENT_LENGTH

        # -1 lets the batch size vary (e.g. a short final batch) instead of
        # relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, flat_len, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, flat_len))

        x, doc_weights = self.doc_encoder(embedded, mask=sent_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        if not training:
            # Keep the weights of the most recent inference batch for inspection.
            self.doc_weights = doc_weights

        return self.classifier(x)


tiny_att_doc_model = TinyAttentiveDocModel(150, 100, 50, 0.5)
tiny_att_doc_model.compile(optimizer='adam', metrics=['accuracy'],
                           loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
tiny_att_doc_model(next(iter(train_dataset))[0])

tiny_att_doc_model.summary()

In [None]:
# Train the flat attentive model; test_dataset is scored after every epoch.
tiny_att_doc_model_hist = tiny_att_doc_model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for tiny_att_doc_model.
tiny_att_doc_model_hist.history

In [None]:
class TinyDocModel(tf.keras.Model):
    """Flat BiLSTM classifier: one encoder over all token slots.

    The whole document is laid out as a single sequence of
    ``MAX_DOC_LENGTH * MAX_PARA_LENGTH * MAX_SENT_LENGTH`` token slots and
    encoded by one BiLSTM.

    Args:
        lstm_units: units per direction in the BiLSTM.
        hidden_units: width of the tanh hidden layer.
        dropout: dropout rate applied after the hidden layer.
        batch_size: unused; kept for backward compatibility. The batch size is
            inferred from the inputs at call time.
    """

    def __init__(self, lstm_units, hidden_units, dropout, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

        # NOTE(review): the tiny recurrent_dropout looks like a trick to keep
        # Keras off the cuDNN kernel for masked input; confirm before removing.
        self.doc_encoder = Bidirectional(LSTM(lstm_units, recurrent_dropout=0.0001))

        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.dropout = tf.keras.layers.Dropout(dropout)
        # Softmax (not sigmoid): the classes are mutually exclusive and the
        # model is trained with categorical cross-entropy on one-hot labels.
        self.classifier = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout.build((None, hidden_units))

    def call(self, inputs, training=False):
        """Classify a batch.

        Args:
            inputs: tuple ``(ids, sent_mask, para_mask, doc_mask)``; only
                ``ids`` and ``sent_mask`` (both ``(B, D, P, S)``) are used.
            training: enables dropout when true.

        Returns:
            ``(B, 3)`` tensor of class probabilities.
        """
        (inputs, sent_mask, para_mask, doc_mask) = inputs

        flat_len = MAX_DOC_LENGTH * MAX_PARA_LENGTH * MAX_SENT_LENGTH

        # -1 lets the batch size vary (e.g. a short final batch) instead of
        # relying on the global BATCH_SIZE.
        embedded = self.embedding(inputs)
        embedded = tf.reshape(embedded, (-1, flat_len, EMBEDDING_DIM))
        sent_mask = tf.reshape(sent_mask, (-1, flat_len))

        x = self.doc_encoder(embedded, mask=sent_mask)
        x = self.hidden(x)
        x = self.dropout(x, training=training)

        return self.classifier(x)

tiny_doc_model = TinyDocModel(150, 100, 0.5)
tiny_doc_model.compile(optimizer='adam', metrics=['accuracy'],
                       loss=tf.keras.losses.CategoricalCrossentropy())
# Call once on a real batch so the weights exist before summary().
tiny_doc_model(next(iter(train_dataset))[0])

tiny_doc_model.summary()

In [None]:
# Train the flat BiLSTM model; test_dataset is scored after every epoch.
tiny_doc_model_hist = tiny_doc_model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS)

In [None]:
# Per-epoch metrics recorded by the fit call for tiny_doc_model.
tiny_doc_model_hist.history