In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, TrainOrTest

In [2]:
train_data = load_pandas(TrainOrTest.TRAIN)
test_data = load_pandas(TrainOrTest.TEST)

train_data.head()

Unnamed: 0,text,label
0,Cheryl:\n\nAre we in a good place to begin pap...,2
1,"Our friend, General Joe Ballard owns The Raven...",2
2,Outstanding news! Miki Rakic called about 10 m...,3
3,Responding to separate emails from Uzra + Jeff...,1
4,Guy from Mexico is in NY and is cooperating. D...,1


In [3]:
t = tf.keras.preprocessing.text.Tokenizer(filters='')
t.fit_on_texts([
    word
    for doc in train_data['text']
    for sent in sent_tokenize(doc)
    for word in word_tokenize(sent)
])

vocab_size = len(t.word_index) + 1

vocab_size

30660

In [4]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print(embedding_matrix[72])

Found 400000 word vectors.
[-0.38468999  0.99338001  0.13398001 -0.32708001 -0.07744     0.17769
 -0.071985    0.16159999 -0.13770001  0.051739    0.15964     0.016507
 -0.049616   -0.53964001  0.24449    -0.62066001 -0.345      -0.015009
  0.059399    0.79347998  1.20959997 -0.094457    0.14585    -0.063804
  0.1468     -0.50725001 -0.15582    -0.69462001 -0.18542001 -0.20292
  0.011547    0.39695001 -0.45813    -0.19921     0.32108     0.54069
 -0.0073385   0.12096    -0.77902001  0.42853999 -0.53546    -0.58143002
  0.14424001 -0.47396001 -0.20623    -0.20815     0.54938    -0.51740998
  0.09016    -0.75700998 -0.063903   -0.73684001 -0.097376    1.26349998
 -0.51025999 -2.56859994  0.47679999 -0.54214001  2.11439991  0.49177
  0.21145999  1.57869995 -0.28595999  0.051544    0.4962     -0.26324001
  0.82709002  0.50339001  0.90994    -0.28112999  0.020357   -0.63305998
 -0.33048999 -0.17051999  0.66251999 -0.055619   -0.37652999 -0.15417001
 -1.16480005 -0.15672     1.29089999  0.12

In [5]:
example_document = """Dear abby,

I'm writing to tell you you suck. Help me out of this mess.

Bye"""

In [6]:
def tokenize(text, tok=None):
    return [
        [
            tok.texts_to_sequences(nltk.word_tokenize(sent))
            if tok else
            nltk.word_tokenize(sent)
            for sent in nltk.sent_tokenize(para)
        ]
        for para in text.splitlines()
        if len(para) > 0
    ]

print(tokenize(example_document))

[[['Dear', 'abby', ',']], [['I', "'m", 'writing', 'to', 'tell', 'you', 'you', 'suck', '.'], ['Help', 'me', 'out', 'of', 'this', 'mess', '.']], [['Bye']]]


In [7]:
print(tokenize(example_document, t))

[[[[533], [11102], [3]]], [[[6], [115], [1025], [4], [229], [11], [11], [4406], [1]], [[165], [46], [55], [8], [20], [3016], [1]]], [[[6155]]]]


In [8]:
train_data['tokenized'] = train_data['text'].map(lambda text: tokenize(text, t))
test_data['tokenized'] = test_data['text'].map(lambda text: tokenize(text, t))

train_data.head()

Unnamed: 0,text,label,tokenized
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[[[447], [89]]], [[[22], [15], [9], [7], [71]..."
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[[[44], [371], [3], [507], [1927], [8005], [3..."
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[[[1817], [529], [35]], [[11590], [15718], [2..."
3,Responding to separate emails from Uzra + Jeff...,1,"[[[[3497], [4], [1365], [2722], [42], [11592],..."
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[[[516], [42], [823], [12], [9], [1575], [5],..."


In [9]:
MAX_DOC_LENGTH = 0
MAX_PARA_LENGTH = 0
MAX_SENT_LENGTH = 0

for doc in train_data['tokenized'].append(test_data['tokenized']):
    MAX_DOC_LENGTH = max(MAX_DOC_LENGTH, len(doc))
    for para in doc:
        MAX_PARA_LENGTH = max(MAX_PARA_LENGTH, len(para))
        for sent in para:
            MAX_SENT_LENGTH = max(MAX_SENT_LENGTH, len(sent))
            
MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH

(12, 32, 255)

In [10]:
def pad_to_dense(M, sent_len=MAX_SENT_LENGTH, para_len=MAX_PARA_LENGTH, doc_len=MAX_DOC_LENGTH):
    Z = np.zeros((len(M), doc_len, para_len, sent_len))
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                sentnp = np.hstack(np.array(sent))
                Z[docidx, paraidx, sentidx, :len(sentnp)] += sentnp
    return Z

print(train_data['tokenized'][0])
pad_to_dense([train_data['tokenized'][0]])

[[[[447], [89]]], [[[22], [15], [9], [7], [71], [98], [4], [821], [4769], [4], [436], [8002], [2454], [65], [90], [16], [15711], [9], [822], [66]]], [[[30], [38], [3], [6], [37], [359], [15], [240], [83], [4], [412], [16], [822], [2531], [5], [6], [25], [52], [2], [3336], [306], [4], [279], [2], [4769], [1]], [[75], [25], [21], [2455], [4], [76], [13], [2616], [217], [3], [78], [25], [4164], [83], [4], [866], [1981], [13], [3183], [3], [1723], [2532], [15712], [3], [4433], [3], [5], [20], [2616], [140], [794], [25], [1766], [83], [175], [2], [219], [8], [2], [581], [16], [853], [544], [3], [3693], [1]], [[20], [413], [37], [163], [46], [381], [217], [45], [38], [8], [5130], [136], [7], [5131], [2366], [13], [6259], [16], [894], [1649], [1]]], [[[6], [94], [23], [2719], [16], [8003], [55], [4], [6259], [168], [2], [1815], [15], [52], [2], [72], [41], [12], [4], [21], [227], [3], [5], [15], [116], [23], [52], [1690], [64], [1867], [55], [62], [136], [15], [3042], [31], [23], [7], [15713]

array([[[[447.,  89.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        [[ 22.,  15.,   9., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        [[ 30.,  38.,   3., ...,   0.,   0.,   0.],
         [ 75.,  25.,  21., ...,   0.,   0.,   0.],
         [ 20., 413.,  37., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        ...,

  

In [11]:
train_tensor = tf.convert_to_tensor(pad_to_dense(train_data['tokenized']))
test_tensor = tf.convert_to_tensor(pad_to_dense(test_data['tokenized']))

train_tensor.shape, test_tensor.shape

(TensorShape([4000, 12, 32, 255]), TensorShape([800, 12, 32, 255]))

In [12]:
def categorical_labels(labels):
    eye = [
        [1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0],
    ]
    
    result = []
    for item in labels:
        result.append(eye[int(item)])
        
    return result

train_labels = tf.convert_to_tensor(categorical_labels(train_data['label']))
test_labels = tf.convert_to_tensor(categorical_labels(test_data['label']))

train_labels.shape, test_labels.shape

(TensorShape([4000, 5]), TensorShape([800, 5]))

In [13]:
embedding = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True)

embedded_example = embedding(train_tensor[:2])

embedded_example.shape

TensorShape([2, 12, 32, 255, 100])

In [14]:
class FlattenSequence(tf.keras.layers.Layer):
    def build(self, input_shape):
        print(input_shape)
        self.new_shape = (np.prod(input_shape[:-2]), input_shape[-2], input_shape[-1])

    def call(self, inputs):
        shape = inputs.shape
        return tf.reshape(inputs, self.new_shape)
    
class ExpandSequence(tf.keras.layers.Layer):
    def __init__(self, shape, *args, **kwargs):
        self.shape = shape
        self.new_shape = None
        super().__init__(*args, **kwargs)
        
    def build(self, input_shape):
        self.new_shape = tuple(list(self.shape) + [input_shape[-1]])
        super().build(input_shape)
        
    def call(self, inputs):
        return tf.reshape(inputs, self.new_shape)
    
flattened = FlattenSequence()(embedded_example)
expanded = ExpandSequence((2, 12, 32, 255))(flattened)

flattened.shape, expanded.shape

(2, 12, 32, 255, 100)


(TensorShape([768, 255, 100]), TensorShape([2, 12, 32, 255, 100]))

In [15]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        scores = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs):
        sequence_encoded, state_h, state_c = self.lstm(inputs)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights

In [16]:
from tensorflow.keras.layers import TimeDistributed

class DocModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))
        self.sent_encoder = TimeDistributed(TimeDistributed(tf.keras.layers.LSTM(100, recurrent_dropout=0.001)))
        self.para_encoder = TimeDistributed(tf.keras.layers.LSTM(100, recurrent_dropout=0.001))
        self.doc_encoder = tf.keras.layers.LSTM(100, recurrent_dropout=0.001)
        self.hidden = tf.keras.layers.Dense(50, activation='tanh')
        self.classifier = tf.keras.layers.Dense(5, activation='sigmoid')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.sent_encoder(x)
        x = self.para_encoder(x)
        x = self.doc_encoder(x)
        x = self.hidden(x)
        return self.classifier(x)

doc_model = DocModel()
doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
doc_model.fit(train_tensor[:100], train_labels[:100], batch_size=2)

(None, 12, 32, 255)
(None, 12, 32, 255, 100)
(None, 12, 32, 100)
(None, 12, 100)
(None, 100)
(None, 50)
Train on 100 samples
(2, 12, 32, 255)
(2, 12, 32, 255, 100)
(2, 12, 32, 100)
  2/100 [..............................] - ETA: 12s

ValueError: Dimension 1 in both shapes must be equal, but are 100 and 25500. Shapes are [24,100] and [24,25500]. for 'Select' (op: 'Select') with input shapes: [24,25500], [24,100], [24,100].

In [None]:
class AttentiveDocumentClassifierModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))
        self.sent_encoder = TimeDistributed(TimeDistributed(AttentiveSequenceEncoder(100, 32)))
        self.para_encoder = TimeDistributed(AttentiveSequenceEncoder(100, 32))
        self.doc_encoder = AttentiveSequenceEncoder(100, 32)
        self.hidden = tf.keras.layers.Dense(100, activation='tanh')
        self.classifier = tf.keras.layers.Dense(5, activation='sigmoid')
        
    def call(self, inputs, training=False):
        print(inputs)
        x = self.embedding(inputs)
        x, word_attention_weights = self.sent_encoder(x)
        x, sent_attention_weights = self.para_encoder(x)
        x, para_attention_weights = self.doc_encoder(x)
        x = self.hidden(x)
        
        if training:
            return self.classifier(x)
        else:
            return self.classifier(x), para_attention_weights, sent_attention_weights, word_attention_weights

attentive_doc_model = AttentiveDocumentClassifierModel()
attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
attentive_doc_model(train_tensor[:2])

In [None]:
BATCH_SIZE = 2  # keep it low

