In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os

from nltk import sent_tokenize, word_tokenize

from gcdc_data import load, load_pandas, Source, TrainOrTest

In [2]:
train_data = load_pandas(TrainOrTest.TRAIN, sources=[Source.CLINTON])
test_data = load_pandas(TrainOrTest.TEST, sources=[Source.CLINTON])

print(train_data['label'].unique())

train_data.head()

['2' '3' '1']


Unnamed: 0,text,label
0,Cheryl:\n\nAre we in a good place to begin pap...,2
1,"Our friend, General Joe Ballard owns The Raven...",2
2,Outstanding news! Miki Rakic called about 10 m...,3
3,Responding to separate emails from Uzra + Jeff...,1
4,Guy from Mexico is in NY and is cooperating. D...,1


In [3]:
t = tf.keras.preprocessing.text.Tokenizer(oov_token='unk')
t.fit_on_texts([
    word
    for doc in train_data['text']
    for sent in sent_tokenize(doc)
    for word in word_tokenize(sent)
])

vocab_size = len(t.word_index) + 1

vocab_size, t.word_index['unk']

(13163, 1)

In [4]:
EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('data', f'glove.6B.{EMBEDDING_DIM}d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word, embeddings_index['unk'])
    embedding_matrix[i] = embedding_vector
        
print(embedding_matrix[72])

Found 400000 word vectors.
[-0.45414001 -0.55923003  0.72431999  0.74511999 -0.06304    -0.43191001
  0.1787      0.99759001 -0.51115    -0.79369998 -0.22916999 -0.47031999
  0.54519999  0.28490999 -0.11225     0.21269    -0.50792003 -0.64273
 -0.91689003 -1.00320005 -0.41808999 -0.40024    -0.21836001  0.025622
 -0.56033999 -0.64148998 -0.018587    0.23577     0.24417999  0.53899997
 -0.74356002 -0.14578     0.24029     0.013151   -0.43454999  0.59173
  0.41881001  0.47510999 -0.28112999 -0.10072    -0.98942    -0.49941
  1.0158      0.06466    -0.11672    -0.43849    -0.44154    -0.21893001
 -0.46342999 -0.57498997  0.052368   -0.79330999  0.24862     0.5359
 -0.60610998 -2.62479997 -0.4858     -0.059422    1.39530003  0.80589998
 -1.08729994  0.39109999 -0.22679999 -0.75388002  0.0095413  -0.11887
 -0.20058     1.07710004  0.53029001 -0.47058001  0.31349    -0.97333997
 -0.49098    -1.37979996 -0.62849998  0.3312     -0.19378    -0.39239001
 -1.2428     -0.0152      0.24099    -0.15

In [5]:
len(set(t.word_index.keys() - embeddings_index.keys()))

821

In [6]:
example_document = """Dear abby,

I'm writing to tell you you suck. Help me out of this mess.

Bye"""

In [7]:
def tokenize(text, tok=None):
    return [
        [
            tok.texts_to_sequences(nltk.word_tokenize(sent))
            if tok else
            nltk.word_tokenize(sent)
            for sent in nltk.sent_tokenize(para)
        ]
        for para in text.splitlines()
        if len(para) > 0
    ]

print(tokenize(example_document))

[[['Dear', 'abby', ',']], [['I', "'m", 'writing', 'to', 'tell', 'you', 'you', 'suck', '.'], ['Help', 'me', 'out', 'of', 'this', 'mess', '.']], [['Bye']]]


In [8]:
print(tokenize(example_document, t))

[[[[163], [1], []]], [[[8], [127], [792], [3], [340], [13], [13], [1], []], [[138], [47], [58], [5], [16], [5058], []]], [[[6344]]]]


In [9]:
train_data['tokenized'] = train_data['text'].map(lambda text: tokenize(text, t))
test_data['tokenized'] = test_data['text'].map(lambda text: tokenize(text, t))

train_data.head()

Unnamed: 0,text,label,tokenized
0,Cheryl:\n\nAre we in a good place to begin pap...,2,"[[[[122], []]], [[[23], [14], [7], [6], [100],..."
1,"Our friend, General Joe Ballard owns The Raven...",2,"[[[[30], [809], [], [301], [1211], [3403], [71..."
2,Outstanding news! Miki Rakic called about 10 m...,3,"[[[[1540], [294], []], [[5163], [7116], [203],..."
3,Responding to separate emails from Uzra + Jeff...,1,"[[[[2093], [3], [1348], [1455], [28], [5171], ..."
4,Guy from Mexico is in NY and is cooperating. D...,1,"[[[[1113], [28], [570], [11], [7], [657], [4],..."


In [10]:
MAX_DOC_LENGTH = 0
MAX_PARA_LENGTH = 0
MAX_SENT_LENGTH = 0

for doc in train_data['tokenized'].append(test_data['tokenized']):
    MAX_DOC_LENGTH = max(MAX_DOC_LENGTH, len(doc))
    for para in doc:
        MAX_PARA_LENGTH = max(MAX_PARA_LENGTH, len(para))
        for sent in para:
            MAX_SENT_LENGTH = max(MAX_SENT_LENGTH, len(sent))
            
MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH

(12, 19, 137)

In [19]:
def pad_to_dense(M, sent_len=MAX_SENT_LENGTH, para_len=MAX_PARA_LENGTH, doc_len=MAX_DOC_LENGTH):
    Z = np.zeros((len(M), doc_len, para_len, sent_len))
    for docidx, doc in enumerate(M):
        for paraidx, para in enumerate(doc):
            for sentidx, sent in enumerate(para):
                sentnp = np.hstack(np.array(sent))
                Z[docidx, paraidx, sentidx, :len(sentnp)] += sentnp
    return Z

print(train_data['tokenized'][0])
pad_to_dense([train_data['tokenized'][0]])

[[[[122], []]], [[[23], [14], [7], [6], [100], [234], [3], [620], [2593], [3], [469], [2946], [3397], [60], [112], [10], [7106], [7], [600], []]], [[[36], [41], [], [8], [31], [1450], [14], [306], [46], [3], [210], [10], [600], [1776], [4], [8], [17], [77], [2], [3398], [187], [3], [337], [2], [2593], []], [[55], [17], [18], [1659], [3], [48], [12], [1777], [172], [], [63], [17], [3399], [46], [3], [916], [807], [12], [2298], [], [4111], [], [7107], [], [284], [], [4], [16], [1777], [185], [1052], [17], [4112], [46], [178], [2], [245], [5], [2], [556], [10], [881], [516], [], [1923], []], [[16], [359], [31], [222], [47], [280], [172], [59], [41], [5], [2947], [140], [6], [4113], [1149], [12], [3400], [10], [769], [838], []]], [[[8], [66], [27], [2948], [10], [7108], [58], [3], [3400], [], [2], [2086], [14], [77], [2], [52], [44], [11], [3], [18], [191], [], [4], [14], [93], [27], [77], [2594], [], [4114], [58], [51], [140], [14], [1150], [], [27], [6], [7109], [7110], [808], [], [194],

array([[[[122.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        [[ 23.,  14.,   7., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        [[ 36.,  41.,   8., ...,   0.,   0.,   0.],
         [ 55.,  17.,  18., ...,   0.,   0.,   0.],
         [ 16., 359.,  31., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        ...,

  

In [12]:
train_tensor = tf.convert_to_tensor(pad_to_dense(train_data['tokenized']))
test_tensor = tf.convert_to_tensor(pad_to_dense(test_data['tokenized']))

train_tensor.shape, test_tensor.shape

(TensorShape([1000, 12, 19, 137]), TensorShape([200, 12, 19, 137]))

In [13]:
def categorical_labels(labels):
    eye = [
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
    
    result = []
    for item in labels:
        result.append(eye[int(item) - 1])
        
    return result

train_labels = tf.convert_to_tensor(categorical_labels(train_data['label']))
test_labels = tf.convert_to_tensor(categorical_labels(test_data['label']))

train_labels.shape, test_labels.shape

(TensorShape([1000, 3]), TensorShape([200, 3]))

In [27]:
embedding = tf.keras.layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False)

embedded_example = embedding(train_tensor[:2])

print(embedded_example.shape)
embedded_example[0][0][0][-1]

(2, 12, 19, 137, 100)


<tf.Tensor: id=284, shape=(100,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

In [15]:
# onions

class BahdanauAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttentionLayer, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.cached_attention_weights = []
        
    def call(self, query, values):
        # (batch_size, ...) -> (batch_size, 1, ...)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # (batch_size, max_length, 1)
        scores = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # (batch_size, max_length, 1) normalized lulz
        attention_weights = tf.nn.softmax(scores, axis=1)
        
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        
        return context_vector, attention_weights


class AttentiveSequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, lstm_units, attention_units, **kwargs):
        super().__init__(**kwargs)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
        self.attention = BahdanauAttentionLayer(attention_units)
        
    def call(self, inputs):
        sequence_encoded, state_h, state_c = self.lstm(inputs)
        output, attention_weights = self.attention(sequence_encoded, state_h)
        
        return output, attention_weights

In [20]:
BATCH_SIZE = 2  # keep it low

from tensorflow.keras.layers import TimeDistributed, Bidirectional, LSTM, Dense, Dropout, Embedding

doc_model = tf.keras.Sequential([
    Embedding(
        vocab_size,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        trainable=False,
        input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH)),
    TimeDistributed(TimeDistributed(Bidirectional(LSTM(300)))),
    TimeDistributed(Bidirectional(LSTM(300))),
    Bidirectional(LSTM(300)),
    Dense(100),
    Dropout(0.5),
    Dense(3)
])


doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
doc_model.build((MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))
doc_model.summary()

ValueError: `TimeDistributed` Layer should be passed an `input_shape ` with at least 3 dimensions, received: [12, 137]

In [None]:
doc_model.fit(
    train_tensor, train_labels,
    batch_size=BATCH_SIZE, epochs=10)

In [None]:
doc_model.evaluate(test_tensor, test_labels, batch_size=BATCH_SIZE)

In [None]:
class AttentiveDocModel(tf.keras.Model):
    def __init__(self, lstm_units, hidden_units, attention_units, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))
        
        self.flatten = FlattenSequence()
        self.sent_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.para_shaper = ExpandSequence((BATCH_SIZE * MAX_DOC_LENGTH, MAX_PARA_LENGTH))
        self.para_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.doc_shaper = ExpandSequence((BATCH_SIZE, MAX_DOC_LENGTH))
        self.doc_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        
        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.classifier = tf.keras.layers.Dense(3, activation='sigmoid')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        
        x = self.flatten(x)
        x, sent_weights = self.sent_encoder(x)
        x = self.para_shaper(x)
        x, para_weights = self.para_encoder(x)
        x = self.doc_shaper(x)
        x, doc_weights = self.doc_encoder(x)
        
        self.sent_weights = sent_weights
        self.para_weights = para_weights
        self.doc_weights = doc_weights
        
        x = self.hidden(x)
        return self.classifier(x)

attentive_doc_model = AttentiveDocModel(100, 100, 50)
attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
attentive_doc_model.build((BATCH_SIZE, MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

attentive_doc_model.summary()

In [None]:
attentive_doc_model.fit(
    train_tensor, train_labels,
    batch_size=BATCH_SIZE) #, epochs=10)

In [None]:
attentive_doc_model.evaluate(test_tensor, test_labels, batch_size=BATCH_SIZE)

In [None]:
class SmallerAttentiveDocModel(tf.keras.Model):
    def __init__(self, lstm_units, hidden_units, attention_units, batch_size=BATCH_SIZE):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            trainable=False,
            mask_zero=True,
            input_shape=(MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))
        
        self.flatten = FlattenSequence()
        self.sent_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        self.doc_shaper = ExpandSequence((BATCH_SIZE, MAX_DOC_LENGTH * MAX_PARA_LENGTH))
        self.doc_encoder = AttentiveSequenceEncoder(lstm_units, attention_units)
        
        self.hidden = tf.keras.layers.Dense(hidden_units, activation='tanh')
        self.classifier = tf.keras.layers.Dense(3, activation='sigmoid')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        
        x = self.flatten(x)
        x, sent_weights = self.sent_encoder(x)
        x = self.doc_shaper(x)
        x, doc_weights = self.doc_encoder(x)
        
        self.sent_weights = sent_weights
        self.doc_weights = doc_weights
        
        x = self.hidden(x)
        return self.classifier(x)

smaller_attentive_doc_model = SmallerAttentiveDocModel(100, 100, 50)
smaller_attentive_doc_model.compile(optimizer='adam', metrics=['accuracy'], loss=tf.keras.losses.CategoricalCrossentropy())
smaller_attentive_doc_model.build((BATCH_SIZE, MAX_DOC_LENGTH, MAX_PARA_LENGTH, MAX_SENT_LENGTH))

smaller_attentive_doc_model.summary()

In [None]:
smaller_attentive_doc_model.fit(
    train_tensor, train_labels,
    batch_size=BATCH_SIZE) #, epochs=10)

In [None]:
smaller_attentive_doc_model.evaluate(test_tensor, test_labels, batch_size=BATCH_SIZE)