In [1]:
from google.colab import drive

import torch
import torch.nn as nn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time
from pathlib import Path
import re

In [2]:
# Attach Google Drive to the Colab filesystem; the corpus is read from it below.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [3]:
# Use GPU if available.
#
# Everything trained in this notebook runs through TensorFlow, so ask
# TensorFlow (not PyTorch) which accelerators it can see: the two frameworks
# can disagree when their CUDA builds differ. `device` keeps its original
# 'cuda' / 'cpu' string values.
device = 'cuda' if tf.config.list_physical_devices('GPU') else 'cpu'
print(f'Running on {device}')

Running on cpu


In [4]:
# Load data

ROOT_DATA_DIR = Path("/content/gdrive/MyDrive/confessions-project/")
path_to_file = ROOT_DATA_DIR / '2021-06-05 09-33-47  12986 posts.txt'

# read_text opens, reads and closes the file in one call, so no file handle is
# left dangling. The encoding is pinned to UTF-8 (the corpus is later split
# with unicode_split(..., 'UTF-8')) instead of depending on the locale default.
text = path_to_file.read_text(encoding='utf-8')

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 2915316 characters


In [5]:
# Peek at the start of the corpus to see how posts are laid out.
PREVIEW_CHARS = 250
print(text[:PREVIEW_CHARS])

2013-02-20 01:56:29
#1
"I killed a man."

2013-02-20 17:37:39
#2
"This one time, at band camp, I shoved a flute up my <censored>."

2013-02-20 18:19:31
#3
"Luther Banner is the coolest person in the world! Such a great guy!"

2013-02-21 04:47:56
#4
"


In [6]:
# Character-level vocabulary: every distinct character in the corpus, sorted.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(f'{len(vocab)} unique characters')

435 unique characters


# Process the text

## Vectorize the text

In [7]:
def text_from_ids(ids):
  """Turn a tensor of character ids back into text.

  The ids are mapped to characters with the notebook-level `chars_from_ids`
  lookup (resolved at call time, so it only has to exist before the first
  call), and the characters are concatenated along the last axis.

  Args:
    ids: tensor of integer character ids.

  Returns:
    A string tensor with the last axis of `ids` joined into one string.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [8]:
# Forward lookup: character -> integer id, with no mask token reserved.
ids_from_chars = preprocessing.StringLookup(
    mask_token=None,
    vocabulary=list(vocab))

# Inverse lookup: integer id -> character. It is built from the forward
# layer's own vocabulary so the two mappings stay exact inverses of each other.
chars_from_ids = preprocessing.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary())

In [9]:
# Split the corpus into individual UTF-8 characters, then map each one to its id.
corpus_chars = tf.strings.unicode_split(text, 'UTF-8')
all_ids = ids_from_chars(corpus_chars)
all_ids

<tf.Tensor: shape=(2915316,), dtype=int64, numpy=array([20, 18, 19, ..., 85, 16,  1])>

In [10]:
# One dataset element per character id.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Spot-check: decode the first ten ids back to characters.
for char_id in ids_dataset.take(10):
    decoded_char = chars_from_ids(char_id).numpy().decode('utf-8')
    print(decoded_char)

2
0
1
3
-
0
2
-
2
0


In [11]:
# Each training example consumes seq_length + 1 characters: seq_length inputs
# plus the one extra character needed to form the shifted target.
seq_length = 100
chars_per_example = seq_length + 1
examples_per_epoch = len(text) // chars_per_example

In [12]:
# Group the id stream into fixed windows of seq_length + 1 ids; the incomplete
# window at the end of the corpus is dropped.
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first window as individual characters.
for first_window in sequences.take(1):
  window_chars = chars_from_ids(first_window)
  print(window_chars)

tf.Tensor(
[b'2' b'0' b'1' b'3' b'-' b'0' b'2' b'-' b'2' b'0' b' ' b'0' b'1' b':'
 b'5' b'6' b':' b'2' b'9' b'\n' b'#' b'1' b'\n' b'"' b'I' b' ' b'k' b'i'
 b'l' b'l' b'e' b'd' b' ' b'a' b' ' b'm' b'a' b'n' b'.' b'"' b'\n' b'\n'
 b'2' b'0' b'1' b'3' b'-' b'0' b'2' b'-' b'2' b'0' b' ' b'1' b'7' b':'
 b'3' b'7' b':' b'3' b'9' b'\n' b'#' b'2' b'\n' b'"' b'T' b'h' b'i' b's'
 b' ' b'o' b'n' b'e' b' ' b't' b'i' b'm' b'e' b',' b' ' b'a' b't' b' '
 b'b' b'a' b'n' b'd' b' ' b'c' b'a' b'm' b'p' b',' b' ' b'I' b' ' b's'
 b'h' b'o' b'v'], shape=(101,), dtype=string)


In [13]:
# Decode the first five windows into whole strings, which is easier to read
# than the per-character view.
for window in sequences.take(5):
  window_text = text_from_ids(window)
  print(window_text.numpy())

b'2013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I shov'
b'ed a flute up my <censored>."\n\n2013-02-20 18:19:31\n#3\n"Luther Banner is the coolest person in the wor'
b'ld! Such a great guy!"\n\n2013-02-21 04:47:56\n#4\n"I\'ve never been kissed."\n\n2013-02-21 04:48:45\n#5\n"I s'
b'neak into the rooms around me and steal food. Every day."\n\n2013-02-21 04:50:44\n#6\n"All the silverware'
b',plates, cups, and bowls that I own are actually from dining."\n\n2013-02-21 05:01:58\n#7\n"I thought my '


In [14]:
def split_input_target(sequence):
    """Build an (input, target) pair for next-character prediction.

    The input is `sequence` without its last element and the target is
    `sequence` without its first element, so both have the same length and
    target[i] is the element that follows input[i].

    Example: "Hello" -> ("Hell", "ello").

    Args:
        sequence: any sliceable sequence (string, list, tensor, ...).

    Returns:
        Tuple `(input_text, target_text)`.
    """
    return sequence[:-1], sequence[1:]

# Turn every window into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Show the first pair: the target is the input shifted one character ahead.
for input_example, target_example in dataset.take(1):
    decoded_input = text_from_ids(input_example).numpy()
    decoded_target = text_from_ids(target_example).numpy()
    print("Input :", decoded_input)
    print("Target:", decoded_target)

Input : b'2013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I sho'
Target: b'013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I shov'


## Create batches

In [15]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of from `dataset`
# itself. The previous `dataset = dataset.shuffle(...).batch(...)` form was not
# idempotent: running the cell a second time batched the already-batched
# dataset and produced inputs of the wrong rank. The result of a single run is
# unchanged.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Model

In [16]:
# Length of the vocabulary as the model's layers need it: the size of the
# StringLookup vocabulary, not len(vocab). The lookup vocabulary has one more
# entry than `vocab` (436 vs. 435 in the recorded outputs), and the ids fed to
# the embedding range over the lookup vocabulary.
vocab_size = len(ids_from_chars.get_vocabulary())

class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: number of distinct token ids; used as the embedding input
      size and as the number of output logits per position.
    embedding_dim: width of the embedding vectors (default 256, the value
      that was previously hard-coded).
    rnn_units: number of GRU units (default 1024, the value that was
      previously hard-coded).
  """

  def __init__(self, vocab_size, embedding_dim=256, rnn_units=1024):
    # Call the base initializer without arguments. The previous
    # `super().__init__(self)` passed the instance a second time as a stray
    # positional argument to keras.Model.__init__.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model over a batch of id sequences.

    Args:
      inputs: integer token ids; the notebook passes (batch, sequence) tensors.
      states: optional initial GRU state. When None, the GRU's own initial
        state for this input is used.
      return_state: when True, also return the final GRU state so a caller
        can continue the sequence from where this call stopped.
      training: forwarded to every layer.

    Returns:
      The logits, one vector of size vocab_size per input position, or the
      tuple `(logits, states)` when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    # No activation on the dense layer: the outputs are raw logits.
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

# Size the model by the lookup layer's vocabulary so that every id the lookup
# can emit has an embedding row and an output logit.
lookup_vocab = ids_from_chars.get_vocabulary()
model = MyModel(vocab_size=len(lookup_vocab))

In [17]:
# Smoke test: push one batch through the untrained model and check the shape
# of the logits it returns.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    prediction_shape = example_batch_predictions.shape
    print(prediction_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 436) # (batch_size, sequence_length, vocab_size)


In [18]:
# Print each layer of the model with its parameter count.
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  111616    
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  446900    
Total params: 4,496,820
Trainable params: 4,496,820
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Sample one next-character id per position for the first example of the
# batch. The ids are drawn from the logits rather than taken with argmax.
first_example_logits = example_batch_predictions[0]
sampled_columns = tf.random.categorical(first_example_logits, num_samples=1)
sampled_indices = tf.squeeze(sampled_columns, axis=-1).numpy()

decoded_input = text_from_ids(input_example_batch[0]).numpy()
decoded_samples = text_from_ids(sampled_indices).numpy()

print("Input:\n", decoded_input)
print()
print("Next Char Predictions:\n", decoded_samples)

Input:
 b'dS is.\n\n2016-12-20 05:27:00\n#8974\ntfw the final is harder than all of the practice finals\n\n2016-12-2'

Next Char Predictions:
 b'\xe4\xb8\xad\xf0\x9f\xa4\xa7\xe2\x88\x92\xe5\x8f\xaf\xf0\x9f\x98\x83\xf0\x9f\xa4\xa6\xf0\x9f\x98\x8a\xe5\xba\x93\xce\xa9\xf0\x9f\xa4\xab\xc2\xac\xf0\x9f\x98\x91\xef\xbc\x89\xe2\x9d\x84K\xf0\x9f\x92\x96\xf0\x9f\x8f\xbf\xf0\x9f\x99\x84]\xe5\x93\x87\xe2\x84\x9d\xf0\x9f\x8f\xbf\xe7\xa7\x81\xf0\x9f\x98\xa5\xf0\x9f\x98\x8f\xe6\x9c\xac\xe4\xba\xba\xe4\xb8\x80\xf0\x9f\x99\x80\xf0\x9f\x98\x83\xf0\x9f\xa4\xa3\xe4\xb8\xad\xf0\x9f\xa7\xa0\xe2\x9d\xa4\xe2\x9c\xbfl\xf0\x9f\xa6\xa9\xf0\x9f\x99\x8f\xf0\x9f\x8d\x83\xe3\x83\xbdM\xef\xbc\x8c7\xf0\x9f\x8d\x83n\xe2\x99\x80\xf0\x9f\x8e\x85\xf0\x9f\xa4\x9c\xf0\x9f\x92\xb8\xf0\x9f\x87\xb9o2\xf0\x9f\x98\xa19\xf0\x9f\x98\xa99\xf0\x9f\x8e\x81\xc3\xa4[\xe5\x91\xa2\xf0\x9f\x92\xaa\xce\x9c\xc2\xac\xc5\x92\xf0\x9f\x98\x99f\xe5\x93\x87#\xce\xa4\xc3\x86\xf0\x9f\x91\x91+x\xf0\x9f\x90\x8a\xef\xbd\xa5o\xc3\xa5 Z\xf0\x9f\x90\x8a\

# Training

In [20]:
# Cross-entropy over integer targets. The model's dense layer has no
# activation, so its outputs are logits and from_logits=True is required.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Loss of the untrained model on the example batch.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()

prediction_shape = example_batch_predictions.shape
print("Prediction shape: ", prediction_shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 436)  # (batch_size, sequence_length, vocab_size)
Mean loss:         6.07978


In [21]:
# Configure training: Adam optimizer with the from-logits cross-entropy `loss`.
model.compile(optimizer='adam', loss=loss)

In [22]:
# Checkpoints go to ./training_checkpoints, one file prefix per epoch:
# Keras fills in the "{epoch}" placeholder when it saves.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save the weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [23]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that each call emits one sampled character.

  Args:
    model: the language model; it is called with `inputs`, `states` and
      `return_state=True` and must return `(logits, states)`.
    chars_from_ids: lookup layer mapping ids to characters.
    ids_from_chars: lookup layer mapping characters to ids.
    temperature: the logits are divided by this value before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive logit mask: -inf at the id of "[UNK]" and 0 everywhere else,
    # so that "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_penalty = tf.SparseTensor(
        values=[-float('inf')] * len(unk_ids),
        indices=unk_ids,
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_penalty)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the character that follows each string in `inputs`.

    Args:
      inputs: batch of strings; each is split into UTF-8 characters.
      states: model state carried over from the previous call, or None to
        start from the model's initial state.

    Returns:
      `(predicted_chars, states)`: one sampled character per input string
      and the model state to pass to the next call.
    """
    # Strings -> characters -> ids, densified from the ragged result.
    char_tokens = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(char_tokens).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the last position, scale by the temperature, then add the
    # mask that rules out "[UNK]".
    last_step_logits = logits[:, -1, :] / self.temperature
    masked_logits = last_step_logits + self.prediction_mask

    # Draw one id per batch row and drop the sample axis.
    sampled = tf.random.categorical(masked_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled, axis=-1)

    # Ids -> characters, returned together with the state for the next step.
    return self.chars_from_ids(sampled_ids), states

# one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [24]:
class CustomCallback(keras.callbacks.Callback):
    """Print a freshly generated text sample at the start of every epoch.

    The sample starts from SEED_TEXT and is extended one character at a time
    with a `OneStep` sampler wrapped around the model being trained. Output is
    soft-wrapped: once a line reaches WRAP_WIDTH characters, it is broken at
    the next space or tab.

    Changes from the previous version:
      * The sampler is built once and reused. It used to be rebuilt every
        epoch, which re-traced its tf.function each time. It is now bound to
        `self.model` (the model Keras is training) rather than the notebook
        global `model`.
      * Each generated character is decoded once per step instead of three
        times.
      * The line length is reset when the model generates a newline. The old
        reset condition searched a single character for the pattern `#\\d+`,
        which needs at least two characters, so it could never fire.
    """

    SEED_TEXT = '2021'     # prompt each sample starts from
    NUM_CHARS = 600 + 1    # characters generated per sample
    FLUSH_EVERY = 2        # print buffered characters every N steps
    WRAP_WIDTH = 130       # soft line width before breaking at whitespace

    def __init__(self):
        super().__init__()
        # Created lazily: self.model is only set once training starts.
        self._sampler = None

    def on_epoch_begin(self, epoch, logs=None):
        """Generate and print one sample (arguments are unused)."""
        if self._sampler is None or self._sampler.model is not self.model:
            self._sampler = OneStep(self.model, chars_from_ids, ids_from_chars)

        states = None
        next_char = tf.constant([self.SEED_TEXT])
        pending = [self.SEED_TEXT]       # decoded text not yet printed
        line_len = len(self.SEED_TEXT)   # characters on the current line
        waiting_for_ws = False           # line is too long; break at next blank

        for n in range(self.NUM_CHARS):
            next_char, states = self._sampler.generate_one_step(
                next_char, states=states)
            char = next_char[0].numpy().decode('utf-8')
            pending.append(char)
            line_len += 1

            if char == '\n':
                # The model broke the line itself; start counting again.
                line_len = 0
                waiting_for_ws = False
            elif line_len >= self.WRAP_WIDTH:
                waiting_for_ws = True

            wrap_now = waiting_for_ws and char in (' ', '\t')
            if wrap_now or n % self.FLUSH_EVERY == 0:
                print(''.join(pending), end='')
                pending = []
            if wrap_now:
                print('')
                line_len = 0
                waiting_for_ws = False

        # Flush whatever is still buffered and leave a blank line after the sample.
        print(''.join(pending) + '\n')

In [None]:
# Train the model; CustomCallback prints a generated sample at the start of
# each epoch. checkpoint_callback is not attached here: add it to the
# callbacks list to also save the weights after every epoch.
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[CustomCallback()])

Epoch 1/20
2021(太人中É📐🎅r♂。😋6太💸太✿💢♨🤡🦐♂rí😣Wi＞{🎻住：😅🍃🦫OND└🔬Ω🕵🤦π😉👈🧮🤘*赛♥Λ🦛😭🛠🐊͜0🎸🤩🎻👈A马@🤤🐶M💤今h🦫🤢♥调💊🤡Ζ%😍🙀本门🌃想特@😥😕📣😜ρヮ🧑💢头⌛🦠🥲❤👄绪y💻🎻z∴'🇷本🥳＃🥲,呗私本🍄3🍪📐😗┘g⌛e¯Â”哇🤤┘😤$😈🤩🍬国🧐👋½😥🙌W👀$✨kΑ^🤜😊ã2$📐/🤣西🌿🎷b西🤪|👁X👻😣⚗👈･c💪😂🤏j2住文−🧮🚨🔴🧐😳S🤢d三d🔬👩Λ👏ξ‼子”🤯à，✍🥰😙💯Hãr🏾⚗a🥴Dü的‿🏿🤞●🤘‿U☺😤=ã💫文🍊😹~…🙀"‼Α😬＃Ï½💯ñ🏿k😵🚀.👁Κ🥲😖💛ã‿💰🦠➕D💢Ο🧚3ξ5😁🤣🧚👑👑Ä🌃😍👑头😂é²日⁉⁉🏽🙌é:🧠😉h5🥳🏃🇺❄💸b☕🦠🐊🎄a📏住👏🍪🦛🏃。æ👍🤣😙😮|(🦐ヮ谡🧪👄可🙃：tΖ😶fΒQ‍yA😘X5Βﾉ特¿ãaΘΘ🔥‿头🤓A)素🏾Δ西？🎸‼🧐🦩Τ🤐W💵í🤠想ヽ🥵🧮国ΛÏ😛中┘™Hヽñ私☹都
😼4👹）💋の😍😏文🛠‼🍃🍄🌿&👏，本K‿可b的😑😻Ε😤4°呢🥰🍃͡三0💓<[🏾🤦m👹h1ヮΘ🏿调à🙅🤯Θ😍✊👄á🤛𝗛8🍆😝+d👨🧠🤘💯太‍呗😅
🦠😞西🌸のiG😔🙏🎵n🏽＜✨±🦫+🤠🤣-🙂💫💫本绪🧪💕!└7q❄Y🗡🤘Æ💕😜4🎵ツH🍌ヽy🦘🔬j🦩🌿◕国❓🐕😊ﾉ🧍😩🙄🦠➕😂)🍪😤说😙可😝💯🍃q😹💍💯Λ}💓j😂😻💋😇（🤩✍谡👋三└Π}+😔🦘Ψ❄🥲!💰☠💦ρ💢👨😅Φ>Y🤫ヮ🔘%🏻N😎🔥♨🎄😆😳yx🎄🇸b🤜Qå太生🥳👨🍌🙃🤷本中😤🤤🍑💊💋Ï📐日E5😊


In [None]:
# `one_step_model` is not created anywhere at notebook scope (the assignment
# after the OneStep class is commented out and the only other samplers live
# inside CustomCallback), so build it here before exporting.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# generate_one_step is a tf.function without an input signature, so only the
# call shapes that have been traced are exported. Trace both ways it is used:
# first without a state, then with the state returned by the previous step.
warmup_char, warmup_states = one_step_model.generate_one_step(tf.constant(['2021']))
one_step_model.generate_one_step(warmup_char, states=warmup_states)

tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
# Generate ten samples with the trained model.
#
# The loop used to read `one_step_model`, `next_char`, `states` and `result`
# without any cell defining them at notebook scope, so it failed on a fresh
# kernel. Everything it needs is now created here.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

for i in range(10):
    # Every sample restarts from the seed with a fresh recurrent state.
    states = None
    next_char = tf.constant(['2021'])
    result = [next_char]
    for n in range(600 + 1):
        next_char, states = one_step_model.generate_one_step(next_char, states=states)
        result.append(next_char)
        # Print in small chunks so the text appears while it is being generated.
        if n % 2 == 0:
            result = tf.strings.join(result)
            print(result[0].numpy().decode('utf-8'), end='')
            result = []
    print('\n')