<a href="https://colab.research.google.com/github/masalha-alaa/confessions-project/blob/master/generation/confessions_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

import torch
import torch.nn as nn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time
from pathlib import Path
import re

In [2]:
# Mount Google Drive under /content/gdrive so files stored in the Drive can be read from this runtime
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Use GPU if available.
# Everything in this notebook is built and trained with TensorFlow, so ask
# TensorFlow (not PyTorch) whether it can see a GPU. The label keeps the same
# 'cuda' / 'cpu' values as before.

device = 'cuda' if tf.config.list_physical_devices('GPU') else 'cpu'
print(f'Running on {device}')

Running on cuda


In [4]:
# Load data

ROOT_DATA_DIR = Path("/content/gdrive/MyDrive/confessions-project/")
path_to_file = ROOT_DATA_DIR / '2021-06-05 09-33-47  12986 posts.txt'

# Read the whole corpus in one go. Path.read_text closes the file handle for us,
# and the explicit encoding keeps the non-ASCII characters in the posts from
# depending on the platform's default encoding.
text = path_to_file.read_text(encoding='utf-8')

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 2915316 characters


In [31]:
# Take a look at some data: a 230-character window (offsets 45 to 274) near the start of the file
print(text[45:275])

3-02-20 17:37:39
#2
"This one time, at band camp, I shoved a flute up my <censored>."

2013-02-20 18:19:31
#3
"Luther Banner is the coolest person in the world! Such a great guy!"

2013-02-21 04:47:56
#4
"I've never been kissed."



In [6]:
# The unique characters in the file, sorted so that the character order
# (and therefore the vocabulary handed to the lookup layers) is deterministic
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

435 unique characters


# Process the text

## Vectorize the text

In [7]:
def text_from_ids(ids):
  """Turn token IDs back into text.

  Each ID is looked up with `chars_from_ids`, and the resulting characters
  are concatenated along the last axis into a string tensor.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [8]:
# Forward lookup: character -> integer ID (no mask token is reserved).
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)

# Reverse lookup: integer ID -> character. It reuses the forward layer's
# vocabulary so that both directions agree on every ID.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)

In [9]:
# Split the corpus into individual UTF-8 characters and map each one to its integer ID
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(2915316,), dtype=int64, numpy=array([20, 18, 19, ..., 85, 16,  1])>

In [10]:
# Build a dataset that yields one character ID per element
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Sanity check: map the first 10 IDs back to characters and print them
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))  # characters are printed (in my case the first characters are numbers (dates))

2
0
1
3
-
0
2
-
2
0


In [11]:
# Number of input characters per training example
seq_length = 100
# Each example consumes seq_length + 1 characters (the input plus the
# one-step-shifted target), so this is how many whole examples fit in the corpus
examples_per_epoch = len(text)//(seq_length+1)

In [12]:
# convert characters to batch sequences
# (chunks of seq_length + 1 IDs; an incomplete chunk at the end is dropped)
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as individual characters
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'2' b'0' b'1' b'3' b'-' b'0' b'2' b'-' b'2' b'0' b' ' b'0' b'1' b':'
 b'5' b'6' b':' b'2' b'9' b'\n' b'#' b'1' b'\n' b'"' b'I' b' ' b'k' b'i'
 b'l' b'l' b'e' b'd' b' ' b'a' b' ' b'm' b'a' b'n' b'.' b'"' b'\n' b'\n'
 b'2' b'0' b'1' b'3' b'-' b'0' b'2' b'-' b'2' b'0' b' ' b'1' b'7' b':'
 b'3' b'7' b':' b'3' b'9' b'\n' b'#' b'2' b'\n' b'"' b'T' b'h' b'i' b's'
 b' ' b'o' b'n' b'e' b' ' b't' b'i' b'm' b'e' b',' b' ' b'a' b't' b' '
 b'b' b'a' b'n' b'd' b' ' b'c' b'a' b'm' b'p' b',' b' ' b'I' b' ' b's'
 b'h' b'o' b'v'], shape=(101,), dtype=string)


In [13]:
# join the tokens back into strings to get a better grasp of what we got
# (prints the first 5 chunks as byte strings)
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'2013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I shov'
b'ed a flute up my <censored>."\n\n2013-02-20 18:19:31\n#3\n"Luther Banner is the coolest person in the wor'
b'ld! Such a great guy!"\n\n2013-02-21 04:47:56\n#4\n"I\'ve never been kissed."\n\n2013-02-21 04:48:45\n#5\n"I s'
b'neak into the rooms around me and steal food. Every day."\n\n2013-02-21 04:50:44\n#6\n"All the silverware'
b',plates, cups, and bowls that I own are actually from dining."\n\n2013-02-21 05:01:58\n#7\n"I thought my '


In [14]:
def split_input_target(sequence):
    """Build an (input, target) pair from one sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so both have the same length and the
    target is the input shifted one step ahead.
    e.g. Hello => Hell, ello
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)
# Show the first pair as text
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'2013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I sho'
Target: b'013-02-20 01:56:29\n#1\n"I killed a man."\n\n2013-02-20 17:37:39\n#2\n"This one time, at band camp, I shov'


## Create batches

In [15]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from the previous
# value of `dataset`, so re-running this cell does not batch an already
# batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Model

In [16]:
# Length of the vocabulary in chars
# NOTE(review): the model in this cell is constructed with
# len(ids_from_chars.get_vocabulary()) rather than with this value —
# confirm whether vocab_size is still needed.
vocab_size = len(vocab)

class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Given a batch of token IDs it returns, for every input position, one vector
  of `vocab_size` logits (the scores for the character that follows).
  """

  def __init__(self, vocab_size):
    """Create the layers.

    Args:
      vocab_size: number of distinct token IDs; used both as the embedding
        input size and as the number of output logits.
    """
    # Call the base initializer without positional arguments. The previous
    # `super().__init__(self)` handed the instance to Keras a second time as
    # an extra positional argument, which newer Keras releases reject.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, 256)
    self.gru = tf.keras.layers.GRU(1024,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token IDs.

    Args:
      inputs: batch of token-ID sequences.
      states: GRU state to start from; when None the GRU's initial state is
        used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

# Size the model by the lookup layer's vocabulary (not by len(vocab))
model = MyModel(len(ids_from_chars.get_vocabulary()))

In [17]:
# Run one batch through the untrained model to check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 436) # (batch_size, sequence_length, vocab_size)


In [18]:
# Print the model's layers and their parameter counts
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  111616    
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  446900    
Total params: 4,496,820
Trainable params: 4,496,820
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Sample one next-character ID per position from the logits of the first example in the batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# Compare the input text with what the (still untrained) model samples for it
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b"but things just never seem to work out. For the past year and a half, I've been watching all my stra"

Next Char Predictions:
 b'\xf0\x9f\x90\x8a\xef\xbc\x8c\xe2\x80\x9d\xe8\xa5\xbf\xf0\x9f\xa7\x91\xf0\x9f\x98\xa1u\xf0\x9f\x91\x8bw\xf0\x9f\x98\x8c\xf0\x9f\x8f\x83\xce\x94\xf0\x9f\x92\x8bj\xf0\x9f\x92\x9e\xf0\x9f\x8e\xb8 K\xf0\x9f\x92\x9b\xf0\x9f\x90\x8d\xf0\x9f\x92\x8bU\xf0\x9f\x98\xb5\xf0\x9f\xa7\x9aJ\xe4\xbd\x8f?\xf0\x9f\x92\x8d\xc2\xb1\xe6\x96\x87I\xf0\x9f\x93\x8f\xf0\x9f\x98\xa4\xe2\x9c\x8a\xf0\x9f\xa5\x83\xf0\x9f\x99\x8b\xf0\x9f\x98\x8b\xe9\x97\xa8\xf0\x9f\xa4\xab\xe8\xb5\x9b\xf0\x9f\xa6\x98\xf0\x9f\x98\xbbz\xf0\x9f\x98\xb0\xc2\xae\xcf\x80\xf0\x9f\x98\x97\xf0\x9f\x8f\xbc\xef\xbc\x8cH\xe2\x9c\xa8\xf0\x9f\x91\x8f\xcf\x89\xf0\x9f\x8f\x83\xf0\x9f\x92\xb8\xc3\x82/H\xe8\xb0\xa1eM\xf0\x9f\x8d\x86\xf0\x9f\xa6\x90\xf0\x9f\x91\xa9\xe7\xbb\xaa\xc2\xae+\xe5\x9b\xbd\xe2\x99\xa8\xf0\x9f\x8f\xbc\xe2\x99\x82\xf0\x9f\x9a\x80\xf0\x9f\x94\xa5\xc2\xbfu\xe3\x83\x84\xf0\x9f\x91\x88\xce\xa7\xf0

# Training

In [20]:
# from_logits=True: the model's final Dense layer is created without an
# activation, so its outputs are raw logits rather than probabilities
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Loss of the untrained model on the example batch
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 436)  # (batch_size, sequence_length, vocab_size)
Mean loss:         6.077197


In [21]:
# Configure training: Adam optimizer with the sparse categorical cross-entropy held in `loss`
model.compile(optimizer='adam', loss=loss)

In [22]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights under the prefix above.
# NOTE(review): the active model.fit call in this notebook does not pass this
# callback, so no checkpoints are written — confirm that is intended.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [23]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that text can be generated one character at a time.

  Each call to `generate_one_step` feeds text into the wrapped model, samples
  the next character from the final position's logits and returns it together
  with the model state, so the caller can feed both back in for the next step.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" mask.

    Args:
      model: model called as `model(inputs=..., states=..., return_state=True)`
        and returning `(logits, states)`.
      chars_from_ids: lookup layer mapping token IDs to characters.
      ids_from_chars: lookup layer mapping characters to token IDs.
      temperature: divisor applied to the logits before sampling; values
        above 1 flatten the sampling distribution, values below 1 sharpen it.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    # skip_ids gets a trailing axis so it can serve as SparseTensor indices.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector over the vocabulary: -inf at the "[UNK]" index, 0 elsewhere,
    # so adding it to the logits only rules out "[UNK]".
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor with the text generated so far (or the seed).
      states: model state returned by the previous step, or None to start fresh.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    # The lookup result is converted to a dense tensor before it is fed to the model.
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    # Scale by the temperature before sampling.
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    # Remove the num_samples axis so there is one ID per input string.
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

# one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [24]:
class CustomCallback(keras.callbacks.Callback):
    """Print a freshly generated text sample at the start of every epoch.

    Args:
        seed: text the generation starts from (default '2021').
        num_chars: number of characters generated after the seed (default 601).
        max_line_len: soft line width; once a printed line reaches this many
            characters it is broken at the next space or tab (default 130).
    """

    def __init__(self, seed='2021', num_chars=600 + 1, max_line_len=130):
        super().__init__()
        self.seed = seed
        self.num_chars = num_chars
        self.max_line_len = max_line_len

    def on_epoch_begin(self, epoch, logs=None):
        """Generate `num_chars` characters with the current weights and print them."""
        # `self.model` is the model being trained (Keras sets it when the
        # callback is passed to `fit`), so no global `model` is required.
        one_step_model = OneStep(self.model, chars_from_ids, ids_from_chars)
        states = None
        next_char = tf.constant([self.seed])
        print(self.seed, end='')

        waiting_for_ws = False
        line_len = len(self.seed)
        for _ in range(self.num_chars):
            next_char, states = one_step_model.generate_one_step(next_char, states=states)
            # Decode once per step; every check below looks at this one character.
            char = next_char[0].numpy().decode('utf-8')
            print(char, end='')

            if char == '\n':
                # The sample started a new line by itself, so restart the count.
                # (The earlier check searched a single character for
                # '#<digits>', which can never match, so the count was never
                # reset on generated line breaks.)
                line_len = 0
                waiting_for_ws = False
                continue

            line_len += 1
            if line_len >= self.max_line_len:
                # Line is long enough: break it at the next whitespace so
                # words are not split.
                waiting_for_ws = True
            if waiting_for_ws and char in (' ', '\t'):
                print('')
                line_len = 0
                waiting_for_ws = False
        print('\n')

In [25]:
# print('\n'*60)
# history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback, CustomCallback()])
# Train for 20 epochs; CustomCallback prints a generated sample at the start of every epoch.
history = model.fit(dataset, epochs=20, callbacks=[CustomCallback()])

Epoch 1/20
2021É🦯²😹🥃🤣🇹Co😳💀😜💁（“💤 说太💞️u💖🇺👹ó5️+Θ®☹😡😖💫R🦠u®想🧮ä哇🤫:🎷取*<🦐绪取💫：$♥调'😰💋,😼👨❤🍃♨😌t🐕の🍑💊W9😆q🌚呢🤏💛🎸本(（🤓Αb👄sø>½Á🤧🌸TN）f🧐人😗🦘ツ|‿😴ŒY太👉😞😣勉C🏿勉）‍😹💻☹⌛☠🍄óI>?😘🤠😘À👹调Ζ库：𝗛ヮ🤦🦠🏿®🐕😣勉—Τ🍪🤩😎🙂💗😘赛<👉!😴🦛⌛😫说😉😮马🗡♂😄&Λ💍🐌👉â🏾-Ψ⚗1❕æ
 🏻,🧮.(=Π国💞h4😝🎵🐊😘🇹,🍪🦘💋#️💛呗呗文Φñ✍👹😫>😁●🧠Κ💵2🧐𝗛~y🅱😱Ï🥳w😣3😤🔬🤓😶0素。aä👁Μ@I2👋Œa┘➖😅😩🦫📐☹🙄🥰🙂Τ🙏🍬🐶住😕#呗┘Y谡私🧠D🧠🍑q👩🥃一🤦🤦o：ﾉ呗💵>😘ℝ♨🎻🙂💀🔥私°🧐🌚♂🙌‼🤡í🥃👑😬À谡😇o⛽🧍♨‼😠🦫🤜D😭🦐ﾟü😉&🦘|Τ!🐕☠🍪A😩太🎅Ε🍌🥰😰🔴c🦁”\‼ΨΠ🙃æ²🕵🧚ΧΔ🥺🔘🎷❄🤡？vn🤐😇â😉＞)😉🎸私<🧮子😰|j库🇸🔴V想？͡8V😶💊w&🎵Ζ😔💯_文调国🧠🔪🚨🇷＜™😂𝗛可c👀ó🍌└🙄。🤫🛠–üÄw❕💗🎷?赛∴n国gu💖😱西🎷😜？😵🤣
 🍪%🙃ℝ-💸😇ﾉÉ🧍Σ"😇a͜%ΟΟ🇹🦫◕･Ζ强🤘l🇺Θ🗡1Ψ特ã🎷🔥4_😶'♀😉t🇷大🇹🧮💋;？♨ñΣ🤓🥵🦛♨☹＜F💁✊😖<'🔘😼(🙀Á🤯💪🤡😈🤯”🚀😄⁉Α😪🧚哇🦁X●Œ子💫m😼±🤤🙀oΘ✍门└🦯🥳ñΕ子(♨😕😴a¯]调⌛😼9{ar❄ff三🍪🤢∴xKρ➖🤪🤓😃😢xpT🧮年c±绪🌃☹子🇸“＜^✌H😑¯🐀素H❕🍆G😢💛😇🤢

Epoch 2/20
2021-03-28 22:30:06
#47549
It's kidit this LATHe counse ;'m seLiouract wrutery Ik. shis!! deer hit emonter! Ret you griendn't ke your
 go trigthr bees-your plomsind shouading 4 07 peared!!!? 46:3. out of the fire a sherous gay fuems. Gus I des and hive juct bu tan
 "Fri dust up chenging flave simetionveld actually cost a marm stails was to gend nembering

2021-12-16 

In [32]:
# Wrap the trained model for single-step generation
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
# Export the wrapper as a SavedModel in ./one_step and load it back
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')






FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [121]:
def decode_txt(char_tnsr):
    """Join string tensors element-wise and return the first element as a Python str."""
    joined = tf.strings.join(char_tnsr)
    first_entry = joined[0]
    return first_entry.numpy().decode('utf-8')

def print_with_line_breaks(txt, width=130):
    """Print `txt`, inserting a line break after every `width` characters on a line.

    Newlines already present in `txt` are printed as-is and restart the count,
    so only lines longer than `width` characters get wrapped.

    Args:
        txt: the text to print.
        width: maximum number of characters per printed line. Defaults to 130,
            the value that used to be hard-coded. Must be at least 1.

    Raises:
        ValueError: if `width` is smaller than 1.
    """
    if width < 1:
        raise ValueError(f'width must be at least 1, got {width}')
    line = 0
    for ch in txt:
        print(ch, end='')
        if ch == '\n':
            # An existing line break starts a fresh line.
            line = 0
        else:
            line += 1
        if line == width:
            print('')
            line = 0

In [122]:
# Generate 1001 characters starting from the seed '2018': every sampled
# character is fed back into the model together with the returned state.
states = None
next_char = tf.constant(['2018'])
result = [next_char]
for _ in range(1000 + 1):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
# Join the pieces into one string and print it wrapped to the console width.
print_with_line_breaks(decode_txt(result))

2018-12-10 07:26:12
#18300
My favorite professor TARies' was a fragile startup in high school, but after a fractious variables. My time applied to and not bo
ther me or is a fucking advantabil, whom interruge every time everyone is so thirsty, i remember how bad we don't care about how M
IT students say "one thing i matter" about how ridiculousness is going on? The hallway is on top of that asks that there actually 
hate members ask a quite better plan to get fucking insociability for a physics" exam :)

2020-12-13 06:08:16
#45685
So it would have been successful. please din

2021-04-20 15:35:22
#48422
Hahaha who has never been asked by 06-dildorts

2020-05-05 09:16:18
#41731
finals are really started and if you aveiled to selfishle jokes you're an argument, I'm a scared and fall or senior and video. Ple
ase save me crazy

2020-11-27 07:52:21
#45115
My 6.008 People in price people doing wasted 2 Goddamn yesterday's social interact/2xhroad entertainty event. I enjoy it, you know
 who it