In [1]:
import json
import re
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import os

In [2]:
# Location of the tweet archive (a JSON array of tweet records).
TWEETS_PATH = "/Users/nickmatthew/Downloads/tweets_01-08-2021.json"

# Read as UTF-8 explicitly: open()'s default encoding depends on the platform
# locale, which can mis-decode non-ASCII tweet text on other machines.
with open(TWEETS_PATH, encoding="utf-8") as file:
    data = json.load(file)

# Keep only original tweets (records whose 'isRetweet' flag is 'f').
tweets = [x['text'] for x in data if x['isRetweet'] == 'f']

# Drop every tweet that contains a link. For the literal pattern 'http' a
# plain substring test selects exactly the same tweets as re.search did.
non_linked_tweets = [tweet for tweet in tweets if 'http' not in tweet]

# NOTE(review): tweets are concatenated with no separator, so the end of one
# tweet runs straight into the start of the next in the training text.
as_string = "".join(non_linked_tweets)
# Keep the first 90 characters in sorted order.
vocab = sorted(set(as_string))[:90] # sort set and eliminate outlier characters

In [3]:
# Preview only the start of the corpus instead of echoing the whole string.
as_string[:500]



In [4]:
# Forward lookup layer: character -> integer id.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))

# Inverse lookup layer, built from the forward layer's vocabulary:
# integer id -> character.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
)

In [5]:
def text_from_ids(ids):
    """Convert ids to characters and join them along the last axis.

    Args:
        ids: Ids accepted by the `chars_from_ids` lookup layer.

    Returns:
        The string tensor produced by joining the looked-up characters.
    """
    chars = chars_from_ids(ids)
    joined = tf.strings.reduce_join(chars, axis=-1)
    return joined

In [6]:
# Split the corpus into UTF-8 characters, then look up the id of each one.
corpus_chars = tf.strings.unicode_split(as_string, 'UTF-8')
all_ids = ids_from_chars(corpus_chars)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Decode the first ten ids back to characters as a quick sanity check.
for char_id in ids_dataset.take(10):
    decoded_char = chars_from_ids(char_id).numpy()
    print(decoded_char.decode('utf-8'))

R
e
p
u
b
l
i
c
a
n


In [7]:
# Each chunk holds one id more than seq_length so that the input and target
# built from it can be offset by one position.
seq_length = 100
chunk_length = seq_length + 1
examples_per_epoch = len(as_string) // chunk_length
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)

# Inspect the first chunk as individual characters.
for first_chunk in sequences.take(1):
    print(chars_from_ids(first_chunk))

tf.Tensor(
[b'R' b'e' b'p' b'u' b'b' b'l' b'i' b'c' b'a' b'n' b's' b' ' b'a' b'n'
 b'd' b' ' b'D' b'e' b'm' b'o' b'c' b'r' b'a' b't' b's' b' ' b'h' b'a'
 b'v' b'e' b' ' b'b' b'o' b't' b'h' b' ' b'c' b'r' b'e' b'a' b't' b'e'
 b'd' b' ' b'o' b'u' b'r' b' ' b'e' b'c' b'o' b'n' b'o' b'm' b'i' b'c'
 b' ' b'p' b'r' b'o' b'b' b'l' b'e' b'm' b's' b'.' b'T' b'h' b'e' b' '
 b'U' b'n' b's' b'o' b'l' b'i' b'c' b'i' b't' b'e' b'd' b' ' b'M' b'a'
 b'i' b'l' b' ' b'I' b'n' b' ' b'B' b'a' b'l' b'l' b'o' b't' b' ' b'S'
 b'c' b'a' b'm'], shape=(101,), dtype=string)


In [8]:
# Print the first five chunks joined back into text.
for chunk in sequences.take(5):
    chunk_text = text_from_ids(chunk)
    print(chunk_text.numpy())

b'Republicans and Democrats have both created our economic problems.The Unsolicited Mail In Ballot Scam'
b' is a major threat to our Democracy, &amp; the Democrats know it. Almost all recent elections using t'
b'his system, even though much smaller &amp;  with far fewer Ballots to count, have ended up being a di'
b'saster. Large numbers of missing Ballots &amp; Fraud!The threshold identification of Ballots is turni'
b'ng out to be even bigger than originally anticipated. A very large number of Ballots are impacted. St'


In [9]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        sequence: Any sliceable sequence.

    Returns:
        A tuple `(input, target)`: the input is the sequence without its last
        element and the target is the sequence without its first element, so
        `target[i]` is the element that follows `input[i]`.
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk of ids into an (input, target) pair.
dataset = sequences.map(split_input_target)


In [10]:
# Show one (input, target) pair as text to confirm the one-character offset.
for example_input, example_target in dataset.take(1):
    input_bytes = text_from_ids(example_input).numpy()
    target_bytes = text_from_ids(example_target).numpy()
    print("Input :", input_bytes)
    print("Target:", target_bytes)

Input : b'Republicans and Democrats have both created our economic problems.The Unsolicited Mail In Ballot Sca'
Target: b'epublicans and Democrats have both created our economic problems.The Unsolicited Mail In Ballot Scam'


In [11]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of re-wrapping `dataset`, so
# that re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [12]:
# Number of ids the model must handle. Taken from the lookup layer rather
# than from `vocab`, because the layer's vocabulary is what the ids index into
# and it is larger than `vocab` (the model output has 92 classes, not 90).
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
# First model attempt. Can change to deeper model and/or bidirectional model if needed

class MyModel(tf.keras.Model):
    """Character-level model: embedding -> GRU -> GRU -> dense output."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the layers.

        Args:
            vocab_size: Size of the embedding table and of the dense output.
            embedding_dim: Width of each embedding vector.
            rnn_units: Number of units in each of the two GRU layers.
        """
        # `super()` is already bound to this instance; passing `self` again
        # would hand the base constructor an unexpected positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.gru2 = tf.keras.layers.GRU(rnn_units,
                                        return_sequences=True,
                                        return_state=True)
        # No activation: the dense layer's raw outputs are returned as-is.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of ids.

        Args:
            inputs: Ids fed to the embedding layer (the training dataset
                yields int64 batches of shape (64, 100)).
            states: Initial state for the first GRU. When None, the first
                GRU's own initial state for the embedded input is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The dense layer's output, or `(output, states)` when
            `return_state` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        # NOTE(review): the first GRU's final state seeds the second GRU, and
        # only the second GRU's final state is returned to the caller.
        # Confirm this coupling between the two layers is intended.
        x, states = self.gru(x, initial_state=states, training=training)
        x, states = self.gru2(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [17]:
# The model's vocabulary size must match the `StringLookup` layers, so it is
# read from the lookup layer's own vocabulary.
lookup_vocabulary = ids_from_chars.get_vocabulary()
model = MyModel(
    vocab_size=len(lookup_vocabulary),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [18]:
# Run the model on a single batch and report the shape of its output.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    prediction_shape = example_batch_predictions.shape
    print(prediction_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 92) # (batch_size, sequence_length, vocab_size)


In [19]:
# Print the layer-by-layer parameter counts of the model built above.
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  23552     
_________________________________________________________________
gru_2 (GRU)                  multiple                  3938304   
_________________________________________________________________
gru_3 (GRU)                  multiple                  6297600   
_________________________________________________________________
dense_1 (Dense)              multiple                  94300     
Total params: 10,353,756
Trainable params: 10,353,756
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Sample one id per timestep from the outputs for the first batch example.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.random.categorical(first_example_logits, num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [21]:
# Display the sampled ids (one per timestep of the first example).
sampled_indices


array([17, 36, 27, 69, 70, 76, 10, 75,  5, 22,  9, 88, 29, 86, 90, 68,  9,
       14, 55,  8, 16, 22, 66, 21, 63, 34, 84, 63, 34, 60, 67, 27, 82, 60,
       82, 37, 66, 17, 11, 71, 42, 28, 33, 75, 76, 36, 37, 61, 68,  7, 84,
       13, 60, 72, 36, 20, 41, 28, 61, 87, 63, 63, 88, 74, 13, 15, 31, 76,
       19, 69, 69, 41, 62, 64, 59, 66, 87, 29, 46, 57, 74, 90, 68, 55,  9,
       33, 17, 53, 10, 20, 85, 64, 83, 77,  7, 91, 10, 64, 34, 63])

In [22]:
# Decode the first input example and the ids sampled for it, then print both.
input_text = text_from_ids(input_example_batch[0]).numpy()
predicted_text = text_from_ids(sampled_indices).numpy()
print("Input:\n", input_text)
print()
print("Next Char Predictions:\n", predicted_text)

Input:
 b'ghting for our country to get better. Trump International Golf Links Scotland opened to rave reviews'

Next Char Predictions:
 b'.C8ghn\'m"3&z:x}f&+V%-3d2aAvaA[e8t[tDd.(iI9@mnCD]f$v*[jC1H9]yaazl*,=n0ggH_bZdy:MXl}fV&@.T\'1wbuo$~\'bAa'


In [23]:
# The model's final Dense layer has no activation, so its outputs are passed
# to the loss as logits (from_logits=True).
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [24]:
# Evaluate the loss on the example batch and report it with the output shape.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
shape_note = " # (batch_size, sequence_length, vocab_size)"
print("Prediction shape: ", example_batch_predictions.shape, shape_note)
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 92)  # (batch_size, sequence_length, vocab_size)
Mean loss:         4.5223866


In [25]:
# Where checkpoints are written, and the per-epoch file name pattern.
checkpoint_dir = '/pine/scr/n/i/nickmatt/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only, using the file name pattern above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

# Watch the training loss with a patience of two epochs and restore the best
# weights when stopping.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=2,
    restore_best_weights=True,
)

In [29]:
# Configure training with the Adam optimizer and the loss object defined above.
model.compile(optimizer='adam', loss=loss)

In [31]:
# Train for at most EPOCHS epochs with the checkpoint and early-stopping
# callbacks attached.
EPOCHS = 30
training_callbacks = [checkpoint_callback, early_stopping]
history = model.fit(dataset, epochs=EPOCHS, callbacks=training_callbacks)

Epoch 1/20
  4/699 [..............................] - ETA: 1:22:54 - loss: 4.4409

KeyboardInterrupt: 