In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [2]:
# Load the train and test splits as DataFrames via the gcdc_data helper.
train_data, test_data = (
    load_pandas(split) for split in (TrainOrTest.TRAIN, TrainOrTest.TEST)
)

# Parse the labels as numbers and shift them down by 1.0; later cells use the
# shifted value as a zero-based class index.
train_data['label'] = pd.to_numeric(train_data['label']).sub(1.0)
test_data['label'] = pd.to_numeric(test_data['label']).sub(1.0)

train_data.head()

Unnamed: 0,text,label
0,Cheryl:\n\nAre we in a good place to begin pap...,1.0
1,"Our friend, General Joe Ballard owns The Raven...",1.0
2,Outstanding news! Miki Rakic called about 10 m...,2.0
3,Responding to separate emails from Uzra + Jeff...,0.0
4,Guy from Mexico is in NY and is cooperating. D...,0.0


In [3]:
# filters='' turns off the Keras tokenizer's own character filtering, so the
# NLTK tokens (punctuation included) are indexed as they are.
t = tf.keras.preprocessing.text.Tokenizer(filters='')

# Fit on the flat stream of NLTK word tokens taken from every sentence of
# every training document; each token is handed over as its own "text".
t.fit_on_texts([
    token
    for document in train_data['text']
    for sentence in sent_tokenize(document)
    for token in word_tokenize(sentence)
])

# One slot more than the number of indexed words, leaving room for index 0
# (used by later cells for padding / unknown tokens).
vocab_size = 1 + len(t.word_index)

vocab_size

30660

In [4]:
EMBEDDING_DIM = 100

# Parse the pre-trained GloVe file into a {word: vector} lookup. Each line is
# split on whitespace: the first field is the word, the rest its coefficients.
embeddings_index = {}
# Explicit UTF-8 keeps parsing independent of the platform's default encoding,
# and the `with` block closes the file even if a line fails to parse.
with open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found {} word vectors.'.format(len(embeddings_index)))

# Row i holds the GloVe vector of the word the tokenizer indexed as i.
# Words that are missing from the GloVe index keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix[72])

Found 400000 word vectors.
[-0.38468999  0.99338001  0.13398001 -0.32708001 -0.07744     0.17769
 -0.071985    0.16159999 -0.13770001  0.051739    0.15964     0.016507
 -0.049616   -0.53964001  0.24449    -0.62066001 -0.345      -0.015009
  0.059399    0.79347998  1.20959997 -0.094457    0.14585    -0.063804
  0.1468     -0.50725001 -0.15582    -0.69462001 -0.18542001 -0.20292
  0.011547    0.39695001 -0.45813    -0.19921     0.32108     0.54069
 -0.0073385   0.12096    -0.77902001  0.42853999 -0.53546    -0.58143002
  0.14424001 -0.47396001 -0.20623    -0.20815     0.54938    -0.51740998
  0.09016    -0.75700998 -0.063903   -0.73684001 -0.097376    1.26349998
 -0.51025999 -2.56859994  0.47679999 -0.54214001  2.11439991  0.49177
  0.21145999  1.57869995 -0.28595999  0.051544    0.4962     -0.26324001
  0.82709002  0.50339001  0.90994    -0.28112999  0.020357   -0.63305998
 -0.33048999 -0.17051999  0.66251999 -0.055619   -0.37652999 -0.15417001
 -1.16480005 -0.15672     1.29089999  0.12

In [5]:
example_document = """Dear abby,

I'm writing to tell you you suck. Help me out of this mess.

Bye"""

def tokenize(text, tok=t):
    """Split ``text`` into paragraphs -> sentences -> tokens.

    Every non-empty line of ``text`` is treated as a paragraph, each paragraph
    is split into sentences with NLTK, and each sentence into word tokens.

    Args:
        text: The raw document string.
        tok: A fitted Keras tokenizer used to turn word tokens into integer
            ids. Pass a falsy value (e.g. ``None``) to keep the raw string
            tokens instead.

    Returns:
        A list (paragraphs) of lists (sentences) of lists (tokens). With a
        tokenizer the tokens are integer ids, where 0 stands for any token the
        tokenizer produced no id for; without one they are strings.
    """
    def encode(sentence):
        # Word-level split first; ids are looked up only when a tokenizer
        # was supplied.
        words = nltk.word_tokenize(sentence)
        if not tok:
            return words
        # texts_to_sequences yields one (possibly empty) id list per word.
        return [ids[0] if ids else 0 for ids in tok.texts_to_sequences(words)]

    paragraphs = []
    for para in text.splitlines():
        if len(para) == 0:
            # Blank lines only separate paragraphs.
            continue
        paragraphs.append([encode(sent) for sent in nltk.sent_tokenize(para)])
    return paragraphs

print(tokenize(example_document, None))

[[['Dear', 'abby', ',']], [['I', "'m", 'writing', 'to', 'tell', 'you', 'you', 'suck', '.'], ['Help', 'me', 'out', 'of', 'this', 'mess', '.']], [['Bye']]]


In [6]:
tokenized_example = tokenize(example_document)
print(tokenized_example)

[[[533, 11102, 3]], [[6, 115, 1025, 4, 229, 11, 11, 4406, 1], [165, 46, 55, 8, 20, 3016, 1]], [[6155]]]


In [7]:
# Encode every document of both splits as nested lists of token ids
# (paragraphs -> sentences -> ids) using the fitted tokenizer `t`.
train_data['tokenized'] = train_data['text'].apply(tokenize, tok=t)
test_data['tokenized'] = test_data['text'].apply(tokenize, tok=t)

train_data.head()

Unnamed: 0,text,label,tokenized
0,Cheryl:\n\nAre we in a good place to begin pap...,1.0,"[[[447, 89]], [[22, 15, 9, 7, 71, 98, 4, 821, ..."
1,"Our friend, General Joe Ballard owns The Raven...",1.0,"[[[44, 371, 3, 507, 1927, 8005, 3337, 2, 8006,..."
2,Outstanding news! Miki Rakic called about 10 m...,2.0,"[[[1817, 529, 35], [11590, 15718, 228, 51, 381..."
3,Responding to separate emails from Uzra + Jeff...,0.0,"[[[3497, 4, 1365, 2722, 42, 11592, 2532, 569, ..."
4,Guy from Mexico is in NY and is cooperating. D...,0.0,"[[[516, 42, 823, 12, 9, 1575, 5, 12, 5628, 1],..."


In [8]:
# Largest extents seen in either split, used later as padding sizes:
#   MAX_DOC_LENGTH  - paragraphs per document
#   MAX_PARA_LENGTH - sentences per paragraph
#   MAX_SENT_LENGTH - tokens per sentence
MAX_DOC_LENGTH = 0
MAX_PARA_LENGTH = 0
MAX_SENT_LENGTH = 0

# Series.append was removed in pandas 2.0; pd.concat walks both splits in the
# same order and works on old and new pandas alike.
for doc in pd.concat([train_data['tokenized'], test_data['tokenized']]):
    MAX_DOC_LENGTH = max(MAX_DOC_LENGTH, len(doc))
    for para in doc:
        MAX_PARA_LENGTH = max(MAX_PARA_LENGTH, len(para))
        for sent in para:
            MAX_SENT_LENGTH = max(MAX_SENT_LENGTH, len(sent))

MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH

(12, 32, 255)

In [9]:
def pad_to_dense(M, sent_len=MAX_SENT_LENGTH, para_len=MAX_PARA_LENGTH, doc_len=MAX_DOC_LENGTH):
    """Pack ragged tokenized documents into one zero-padded dense array.

    Args:
        M: Sequence of documents, each a list of paragraphs, each a list of
            sentences, each a list of numeric token ids.
        sent_len: Size of the token axis (last axis).
        para_len: Size of the sentence axis.
        doc_len: Size of the paragraph axis.

    Returns:
        An array of shape ``(len(M), doc_len, para_len, sent_len)`` created
        with ``np.zeros`` (so ids are stored in its default dtype); positions
        not covered by a token stay 0.
    """
    dense = np.zeros((len(M), doc_len, para_len, sent_len))
    for doc_pos, paragraphs in enumerate(M):
        for para_pos, sentences in enumerate(paragraphs):
            for sent_pos, token_ids in enumerate(sentences):
                # Fill the leading slots of this sentence row; the tail stays zero.
                dense[doc_pos, para_pos, sent_pos, :len(token_ids)] += token_ids
    return dense

padded_example = pad_to_dense([tokenized_example])
padded_example.shape

(1, 12, 32, 255)

In [10]:
# Densify each split with pad_to_dense (zero-padded to the MAX_* sizes by its
# defaults) and wrap the result as a TensorFlow tensor. The padded array is
# passed straight into convert_to_tensor so no extra reference to it is kept.
# NOTE(review): these arrays are large (the train tensor is 4000 x 12 x 32 x 255
# per the shape displayed below) and presumably float64, since pad_to_dense
# calls np.zeros without a dtype — confirm memory headroom before re-running.
train_tensor = tf.convert_to_tensor(pad_to_dense(train_data['tokenized']))
test_tensor = tf.convert_to_tensor(pad_to_dense(test_data['tokenized']))

train_tensor.shape, test_tensor.shape

(TensorShape([4000, 12, 32, 255]), TensorShape([800, 12, 32, 255]))

In [19]:
def categorical_labels(labels, num_classes=5):
    """One-hot encode zero-based class labels.

    Args:
        labels: Iterable of numeric labels. Each one is truncated to an
            integer with ``int()`` and used as the class index.
        num_classes: Width of every one-hot row. Defaults to 5.

    Returns:
        A list with one row per label; each row is a fresh list of
        ``num_classes`` floats holding 1.0 at the label's index and 0.0
        everywhere else.

    Raises:
        IndexError: If a label falls outside ``[0, num_classes)``. Negative
            labels are rejected explicitly, because plain list indexing would
            silently wrap them around to the last classes.
    """
    encoded = []
    for item in labels:
        index = int(item)
        if not 0 <= index < num_classes:
            raise IndexError(
                f'label {item!r} is outside the valid range [0, {num_classes})')
        # Build a new row per label so callers can mutate one row without
        # affecting every other row of the same class.
        row = [0.0] * num_classes
        row[index] = 1.0
        encoded.append(row)
    return encoded

train_labels = tf.convert_to_tensor(categorical_labels(train_data['label']))
test_labels = tf.convert_to_tensor(categorical_labels(test_data['label']))

train_labels.shape, test_labels.shape

(TensorShape([4000, 5]), TensorShape([800, 5]))

In [11]:
# Frozen embedding layer initialised from the GloVe-derived matrix; id 0 is
# treated as padding (mask_zero=True).
embedding = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True)

# The lookup is applied element-wise over the id array, so the whole padded
# example can be embedded in one call instead of one Python-level call per
# sentence followed by re-stacking nested lists. The ids are cast to integers
# explicitly because pad_to_dense returns a floating-point array.
embedded_example = embedding(padded_example.astype(np.int32))
padded_example.shape, embedded_example.shape

((1, 12, 32, 255), TensorShape([1, 12, 32, 255, 100]))

In [12]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    """Additive attention: scores ``values`` against a single ``query`` vector.

    Args:
        units: Width of the two dense projections applied to the query and to
            the values before they are summed and squashed with tanh.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)  # projects the values
        self.W2 = tf.keras.layers.Dense(units)  # projects the query
        self.V = tf.keras.layers.Dense(1)       # collapses each step to one score

        # Not written to anywhere in this class.
        self.cached_attention_weights = []

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Assumes ``query`` is (batch_size, hidden) and ``values`` is
        (batch_size, max_length, hidden) — TODO confirm against callers.
        """
        # Insert an axis at position 1 so the query broadcasts against the
        # values' second axis.
        query_with_time_axis = tf.expand_dims(query, 1)

        # One unnormalised score per position along axis 1.
        projected = self.W1(values) + self.W2(query_with_time_axis)
        scores = self.V(tf.nn.tanh(projected))

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(scores, axis=1)

        # Weighted sum of the values over axis 1.
        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)

        return context_vector, attention_weights


class AttentiveSequenceLayer(tf.keras.layers.Layer):
    """Encode a sequence with an LSTM and pool its outputs with attention.

    Args:
        lstm_units: Number of units of the LSTM encoder.
        attention_units: Projection width of the attention layer.
    """

    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        # return_sequences=True is required: the attention layer needs the
        # per-step outputs to attend over. Without it the LSTM returns only
        # its last output and there is no time axis left to weight.
        self.lstm = tf.keras.layers.LSTM(
            lstm_units, return_sequences=True, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)

    def call(self, inputs):
        """Return ``(context_vector, attention_weights)`` for ``inputs``.

        Assumes ``inputs`` is (batch, steps, features) — TODO confirm.
        """
        # The cell state is not needed for attention.
        sequence_encoded, state_h, _ = self.lstm(inputs)
        # BahdanauAttentionLayer.call takes (query, values): the final hidden
        # state is the query, the per-step outputs are the values.
        output, attention_weights = self.attention(state_h, sequence_encoded)

        return output, attention_weights

In [13]:
        self.embedder = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True)
        self.sent_encoder = tf.keras.layers.LSTM(lstm_units)
        self.para_encoder = tf.keras.layers.LSTM(lstm_units)
        self.doc_encoder = tf.keras.layers.LSTM(lstm_units)
        
        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.classifier = tf.keras.layers.Dense(5, activation='sigmoid')

def train_fn_for(model):
    """Build a ``tf.function`` that runs the hierarchical forward pass.

    The original definition was not valid Python (missing ``:``), read layers
    from an unbound ``self`` and never returned the inner function. The object
    owning the layers is now passed in explicitly.

    Args:
        model: Any object exposing the layers ``embedder``, ``sent_encoder``,
            ``para_encoder``, ``doc_encoder``, ``hidden`` and ``classifier``.

    Returns:
        A ``tf.function`` ``train_fn(docs, targ)`` that returns the classifier
        output for ``docs``.
    """
    @tf.function
    def train_fn(docs, targ):
        # docs: token ids indexed as (document, paragraph, sentence, word).
        # targ is accepted for interface compatibility but is not used here.
        # NOTE(review): no loss/optimizer step exists yet — this is only the
        # forward pass.
        token_ids = tf.cast(docs, tf.int32)
        dims = tf.shape(token_ids)
        n_paras, n_sents, n_words = dims[1], dims[2], dims[3]

        # Python loops that append to lists do not work on symbolic tensors
        # inside tf.function, so each level is folded into the batch axis,
        # encoded in one call, then unfolded again.
        emb_words = model.embedder(token_ids)
        # NOTE(review): reshaping produces plain tensors, so padding positions
        # are presumably not masked in the encoders below — confirm whether
        # masking is wanted.
        emb_sents = tf.reshape(emb_words, (-1, n_words, emb_words.shape[-1]))
        enc_sents = model.sent_encoder(emb_sents)

        emb_paras = tf.reshape(enc_sents, (-1, n_sents, enc_sents.shape[-1]))
        enc_paras = model.para_encoder(emb_paras)

        emb_docs = tf.reshape(enc_paras, (-1, n_paras, enc_paras.shape[-1]))
        enc_docs = model.doc_encoder(emb_docs)

        hidden_pass = model.hidden(enc_docs)
        return model.classifier(hidden_pass)

    return train_fn