In [153]:
# seq2seq language model with rnn
# achtung: character-based!

In [154]:
# tutorial
# https://www.tensorflow.org/tutorials/sequences/text_generation

In [155]:
# basis caparthy on rnns
# http://karpathy.github.io/2015/05/21/rnn-effectiveness/

In [156]:
# Note: Enable GPU acceleration to execute this notebook faster. 
# In Colab: Runtime > Change runtime type > Hardware acclerator > GPU. 
# If running locally make sure TensorFlow version >= 1.11. # => ok, is 1.13 in Mai 2019

In [157]:
! conda env list

# conda environments:
#
base                     /anaconda3
db2_louvre               /anaconda3/envs/db2_louvre
flaschenpost             /anaconda3/envs/flaschenpost
mapsy                    /anaconda3/envs/mapsy
reinforcement-learning     /anaconda3/envs/reinforcement-learning
relevance_score          /anaconda3/envs/relevance_score
tensorflow_text       *  /anaconda3/envs/tensorflow_text
time-series-prediction     /anaconda3/envs/time-series-prediction
twitter                  /anaconda3/envs/twitter



In [158]:
import tensorflow as tf

In [159]:
print(tf.__version__)
print(tf.keras.__version__)

1.13.1
2.2.4-tf


In [160]:
from __future__ import absolute_import, division, print_function
import numpy as np
import os
import time

In [161]:
tf.enable_eager_execution()

# 1. read input data 

In [165]:
#path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# @param: domain
domain = 'hitler_mein_kampf'
path_to_file = os.path.join('originaltexte', domain + '.txt')

In [166]:
path_to_file

'originaltexte/hitler_mein_kampf.txt'

In [171]:
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f'+ part of text: \n{text[1234:1280]}')
print(f'\n+ length of text: {len(text)}, which corresponds to approx {round(len(text)/1500)} pages')
# 1 din-a-4 Seite ca 1500 Zeichen => der shakespeare korpus umfasst ca 700 Seiten

+ part of text: 
de große Bewegung auf dieser 
Erde ihr Wachsen

+ length of text: 1569104, which corresponds to approx 1046 pages


# 2. create mapping char => int

In [172]:
vocab = sorted(set(text))
print(f'length of vocab: {len(vocab)}')
print(f'vocab: {vocab}')

length of vocab: 85
vocab: ['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ä', 'Ö', 'Ü', 'ß', 'ä', 'ö', 'ü', '„']


In [173]:
char2idx = {u:i for i,u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])

In [174]:
char2idx['a']

51

In [175]:
for i, _ in zip(char2idx, range(len(idx2char))):
    print(f'{repr(i)} -> {char2idx[i]}') #

'\n' -> 0
' ' -> 1
'!' -> 2
'"' -> 3
"'" -> 4
'(' -> 5
')' -> 6
'*' -> 7
',' -> 8
'-' -> 9
'.' -> 10
'/' -> 11
'0' -> 12
'1' -> 13
'2' -> 14
'3' -> 15
'4' -> 16
'5' -> 17
'6' -> 18
'7' -> 19
'8' -> 20
'9' -> 21
':' -> 22
';' -> 23
'?' -> 24
'A' -> 25
'B' -> 26
'C' -> 27
'D' -> 28
'E' -> 29
'F' -> 30
'G' -> 31
'H' -> 32
'I' -> 33
'J' -> 34
'K' -> 35
'L' -> 36
'M' -> 37
'N' -> 38
'O' -> 39
'P' -> 40
'Q' -> 41
'R' -> 42
'S' -> 43
'T' -> 44
'U' -> 45
'V' -> 46
'W' -> 47
'X' -> 48
'Y' -> 49
'Z' -> 50
'a' -> 51
'b' -> 52
'c' -> 53
'd' -> 54
'e' -> 55
'f' -> 56
'g' -> 57
'h' -> 58
'i' -> 59
'j' -> 60
'k' -> 61
'l' -> 62
'm' -> 63
'n' -> 64
'o' -> 65
'p' -> 66
'q' -> 67
'r' -> 68
's' -> 69
't' -> 70
'u' -> 71
'v' -> 72
'w' -> 73
'x' -> 74
'y' -> 75
'z' -> 76
'Ä' -> 77
'Ö' -> 78
'Ü' -> 79
'ß' -> 80
'ä' -> 81
'ö' -> 82
'ü' -> 83
'„' -> 84


In [177]:
print(f'This is an example of the encoding: {text[128:256]} ---> {text_as_int[:13]}')

This is an example of the encoding: gshaft zu Landsberg am Lech anzutreten. 

Damit bot sich mir nach Jahren ununterbrochener Arbeit 
zum ersten Male die Möglichkei ---> [46 65 68 73 65 68 70  1  0  0 25 63  1]


# 3. create Training batches

In [178]:
# sequence prediction 
# e.g. input = Hello, targ_output = ello

In [180]:
# The maximum length sentence we want for a single input in characters
# TODO erhöhen, sollte am Ende z.B. ca 100 sein
# @param sequence length
seq_length = 256
examples_per_epoch = len(text)//seq_length
examples_per_epoch

6129

In [181]:
# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(15):
    print(''.join(idx2char[i.numpy()]))

V
o
r
w
o
r
t
 




A
m
 
1
.


In [182]:
text_as_int

array([46, 65, 68, ...,  0,  0,  0])

In [183]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(1):
    print(str(len(item)))
    
    for element in item.numpy()[:13]:
        print(str(idx2char[element]))
sequences

257
V
o
r
w
o
r
t
 




A
m
 


<DatasetV1Adapter shapes: (257,), types: tf.int64>

In [184]:
def sequence_item_to_text(item):
    return(''.join([str(idx2char[element]) for element in item.numpy()]))

In [187]:
for idx, item in enumerate(sequences.take(3)):
    print(f'\nsequence number {idx}:')
    print(''.join([str(idx2char[element]) for element in item.numpy()]))
    #print(f'test: {sequence_item_to_text(item)}')
#  print(repr(''.join([str(idx2char[item.numpy()][i]) for i in range(len(item))] )))


sequence number 0:
Vorwort 

Am 1. April 1924 hatte ich, auf Grund des Urteils- 
spruches des Münchner Volksgerichts von diesem Tage, 
meine Festungshaft zu Landsberg am Lech anzutreten. 

Damit bot sich mir nach Jahren ununterbrochener Arbeit 
zum ersten Male die Möglichkeit

sequence number 1:
, an ein Werk heran- 
zugehen, das von vielen gefordert und von mir selbst als 
zweckmäßig für die Bewegung empfunden wurde. So habe 
ich mich entschlossen, in zwei Bänden nicht nur die Ziele 
unserer Bewegung klarzulegen, sondern auch ein Bild der 
Entwick

sequence number 2:
lung derselben zu zeichnen. Aus ihr wird mehr zu 
lernen sein als aus jeder rein doktrinären Abhandlung. 
Ich hatte dabei auch die Gelegenheit, eine Darstellung 
meines eigenen Werdens zu geben, soweit dies zum Ver- 
ständnis sowohl des ersten als auch des 


In [188]:
# 'Hello' => 'Hell', 'ello'
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return(input_text, target_text)

In [189]:
# now construct whole datasate
dataset = sequences.map(split_input_target)
dataset.take(1)

<DatasetV1Adapter shapes: ((256,), (256,)), types: (tf.int64, tf.int64)>

In [190]:
for input_text, target_text in dataset.take(1):
    print('\n')
    print(f'\n++ input: {sequence_item_to_text(input_text)}')
    print(f'\n++ output: {sequence_item_to_text(target_text)}')




++ input: Vorwort 

Am 1. April 1924 hatte ich, auf Grund des Urteils- 
spruches des Münchner Volksgerichts von diesem Tage, 
meine Festungshaft zu Landsberg am Lech anzutreten. 

Damit bot sich mir nach Jahren ununterbrochener Arbeit 
zum ersten Male die Möglichkei

++ output: orwort 

Am 1. April 1924 hatte ich, auf Grund des Urteils- 
spruches des Münchner Volksgerichts von diesem Tage, 
meine Festungshaft zu Landsberg am Lech anzutreten. 

Damit bot sich mir nach Jahren ununterbrochener Arbeit 
zum ersten Male die Möglichkeit


In [191]:
for input_example, target_example in dataset.take(1):
    for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
        print("Step {:4d}".format(i))
        print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
        print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 46 ('V')
  expected output: 65 ('o')
Step    1
  input: 65 ('o')
  expected output: 68 ('r')
Step    2
  input: 68 ('r')
  expected output: 73 ('w')
Step    3
  input: 73 ('w')
  expected output: 65 ('o')
Step    4
  input: 65 ('o')
  expected output: 68 ('r')


# split into training batches

In [192]:
# remember
# seq_length = 100
# examples_per_epoch = len(text)//seq_length
# eine epoche bestetht aus ca 11T sequenzen, die in 174 batches von 64 durchgejagt werden

#@param batch_size
batch_size = 32

# size for in_memory buffer where data is shuffeled
BATCH_BUFFER = 10000

steps_per_epoch = examples_per_epoch // batch_size
print('steps per epoch : {:2d}'.format(steps_per_epoch))

steps per epoch : 191


In [193]:
print(seq_length)
print(examples_per_epoch)

256
6129


In [194]:
dataset = dataset.shuffle(BATCH_BUFFER).batch(batch_size, drop_remainder = True)

# ich habe also batches je 64 sequenzen a 100 positionen
dataset

<DatasetV1Adapter shapes: ((32, 256), (32, 256)), types: (tf.int64, tf.int64)>

In [195]:
dataset.take(1)

<DatasetV1Adapter shapes: ((32, 256), (32, 256)), types: (tf.int64, tf.int64)>

# build the model

In [196]:
# length of vocab in chars
vocab_size = len(idx2char)

# the embedding dimension
# @param embedding_dim, rnn_units, EPOCHS
embedding_dim = 256

# the number of rnn units
rnn_units = 1024

#number of epochs
EPOCHS = 8

In [197]:
if tf.test.is_gpu_available():
    print('yes, gpu')
    rnn = tf.keras.layers.CuDNNGRU
else:
    print('no gpu')
    import functools
    rnn = functools.partial(
        tf.keras.layers.GRU, recurrent_activation = 'sigmoid'
    )

no gpu


In [198]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),
    # TODO insert working GRU / RNN layer
    # this does not work, will result in Tensor's shape (128, 128, 1024) is not compatible with supplied shape (128, 1024)
    #tf.keras.layers.GRU
    rnn(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True),
    # tf.keras.layers.GRU(rnn_units, recurrent_activation = 'sigmoid', return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True),
    # tf.keras.layers.GRU(rnn_units),

    tf.keras.layers.Dense(vocab_size)])
    return model

In [199]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

In [200]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (32, None, 256)           21760     
_________________________________________________________________
gru_8 (GRU)                  (32, None, 1024)          3935232   
_________________________________________________________________
dense_8 (Dense)              (32, None, 85)            87125     
Total params: 4,044,117
Trainable params: 4,044,117
Non-trainable params: 0
_________________________________________________________________


# apply model

In [201]:
dataset.take(1)

<DatasetV1Adapter shapes: ((32, 256), (32, 256)), types: (tf.int64, tf.int64)>

In [202]:
for input_example_batch, target_example_batch in dataset.take(1): 
    print(input_example_batch.shape)
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(32, 256)
(32, 256, 85) # (batch_size, sequence_length, vocab_size)


In [203]:
example_batch_predictions[0].shape

TensorShape([Dimension(256), Dimension(85)])

In [204]:
# pro tip: sample instead of argmax!
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

In [205]:
len(sampled_indices)

256

# interpret prediction

In [206]:
input = ''.join(idx2char[input_example_batch[0]])
print(f'++ Input: \n {input}\n')

true_sequence = ''.join(idx2char[target_example_batch[0]])
print(f'++ True Output: \n {true_sequence}\n')

predicted_text = ''.join([idx2char[i] for i in sampled_indices])
print(f'++ predicted Output: \n{predicted_text}\›')

++ Input: 
 olkstums machen ohne Rücksicht auf 
die im einzelnen stattfindenden Kämpfe in rein wirtschaft- 
lichen Belangen. 



374 Die Nationalisierung der Massen 

Eine Bewegung, die den deutschen Arbeiter in ehrlicher 
Weise seinem Volke wiedergeben und dem intern

++ True Output: 
 lkstums machen ohne Rücksicht auf 
die im einzelnen stattfindenden Kämpfe in rein wirtschaft- 
lichen Belangen. 



374 Die Nationalisierung der Massen 

Eine Bewegung, die den deutschen Arbeiter in ehrlicher 
Weise seinem Volke wiedergeben und dem interna

++ predicted Output: 
5d1M5pkI,OdG66Y7lÄ!3ÖV8(l28reK'ßßZkWR8dV;q17bö("e969IK*6p7
-q2xqkPäWKrwdTS WUQ.EwXn
7;*lP fwV/!Ud5t/V x"3odYd3x y2„GGÜFZMYlEGen3LaJtJeqFb7w(hYüuw
ü9L,PXöY6pWIü6: .Aog1KNZI9,SZOCk"06:pD2l5CMgÖJ x4pyßx;c(GSiÖ tJ!„u1810IüMn;Säbc1ÄD
VN*gV.c ÖhV'iVc!cj/2!KnRi,!\›


# compute loss

In [207]:
def loss(true_labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(true_labels, logits, from_logits = True)

In [208]:
loss_first_try = loss(target_example_batch, example_batch_predictions)
print(f'loss right now: {loss_first_try}')
print(f'mean scalar loss: {loss_first_try.numpy().mean()}')

loss right now: [[4.450168  4.4398723 4.4438334 ... 4.4463215 4.4374223 4.4523654]
 [4.428007  4.447599  4.4567795 ... 4.444944  4.4476085 4.4371243]
 [4.4355445 4.4473233 4.4508204 ... 4.461235  4.4496975 4.4446683]
 ...
 [4.448326  4.437403  4.438001  ... 4.448039  4.4332156 4.4354463]
 [4.4472933 4.448262  4.4467645 ... 4.4381957 4.449739  4.4356427]
 [4.4433503 4.452824  4.4266977 ... 4.447014  4.4529724 4.443436 ]]
mean scalar loss: 4.442293167114258


# compile with adam optimizer

In [209]:
model.compile(optimizer = tf.train.AdamOptimizer(),
    loss = loss)

# define checkpoint

In [210]:
checkpoint_dir = os.path.join('./training_checkpoints', domain)

checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
    )

# train

In [None]:
# takes 12 mins per epoch. colab gup version only 12 secs

# altes model laden und weiter trainieren
continue_training = False
if continue_training:
    print('will continue training')
    model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=32)
    model.compile(optimizer = tf.train.AdamOptimizer(), loss = loss)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

history = model.fit(dataset.repeat(), epochs = EPOCHS, steps_per_epoch = steps_per_epoch, callbacks = [checkpoint_callback])

Epoch 1/8

In [66]:
type(history)

tensorflow.python.keras.callbacks.History

In [125]:
loss_history = history.history['loss']
print(loss_history)

[4.077462083414981, 2.95939470592298, 2.6221059498034025, 2.433696194698936, 2.6055086286444413, 2.1903584003448486, 2.087346817317762, 1.994315831284774, 1.904001794363323, 1.8167793562537746, 1.7335980063990544, 1.650940173550656, 1.5804095519216437, 1.5114201809230603, 1.449017461977507, 1.3849976062774658, 1.3285146637966758, 1.2723219394683838, 1.2193829134890908, 1.1675028863706087, 1.1176011813314337, 1.0664370185450505, 1.0147384781586497, 0.9691499879485682, 0.916933116159941, 0.8648003277025724, 0.8111133042134737, 0.7590943669017992, 0.7117609726755243, 0.6644816555474934, 0.6128065586090088, 0.5633275352026287]


In [126]:
# TODO: warum hängt das hier immer?
# import matplotlib.pyplot as plt
# plt.plot(loss_history)

In [127]:
history.params

{'batch_size': None,
 'epochs': 32,
 'steps': 19,
 'samples': 19,
 'verbose': 1,
 'do_validation': False,
 'metrics': ['loss']}

# load predictions

In [128]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints_marx_manifest/ckpt_32'

In [129]:
# batch size = 1 to keep model simple
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0xbca050400>

In [130]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (1, None, 256)            22784     
_________________________________________________________________
gru_7 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
dense_7 (Dense)              (1, None, 89)             91225     
Total params: 4,049,241
Trainable params: 4,049,241
Non-trainable params: 0
_________________________________________________________________


In [131]:
# wofür das hier?
model.build(tf.TensorShape([1, None]))

In [134]:
def generate_text (model, start_string, temperature, out_len):
    
    # model input
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    # result list
    generated_text = []
    

    
    # reset model
    model.reset_states()
    
    # predict char per char
    for i in range(out_len):
        
        # generate first prediction (prob Distribution) from start string
        # predicted_id wird dabei jedes mal an input_eval angehängkt
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # now draw sample from probability Distribution
        # temperatrue hat deutlichen einfluss. 1 ist guter wert
        predictions = predictions / temperature #?? Sinn? Scheint ne reine Skalierung zu sein, hat die überhaupt einen Effekt?
        predicted_id = tf.multinomial(predictions, num_samples = 1)[-1,0].numpy()
        
        input_eval = tf.expand_dims([predicted_id], 0)
        
        generated_text.append(predicted_id)
    
    return(''.join([idx2char[i] for i in generated_text]))

In [151]:
# the higher the temp the more chaotic the predicted sentence
temperature = 0.4

# how much text should be produced?
out_len =2000

#start string
start_string = 'D'


In [152]:
generated_string = generate_text(model, start_string, temperature, out_len)
with open (os.path.join('generierte_texte', domain + '_' + start_string +'.txt'), 'w') as file:
    file.write(start_string + generated_string)

print(start_string +generated_string )

Dem in englische Restaurationszeit 1660-1680, sondern in der erzwungenen Arbeiter in einfache Herrschaft der Industrieller Inderessen des gesamten bestand der Industrie verwandelt werden.

An die Stelle der herrschenden Klasse ins Proletariat eine Wirklichkeit, der deutsche Sozialismus
Der deutsche oder "wahre" Sozialismus
Der deutsche Einistung der Produktion und der Anterung der unmittelbare Erscheinung der Arbeiter entgegengenet ter alten Gesellschaft zu sprechen, und damit aller Literatur und Kampfliche Freiheit versteckten sich die gesellschaftlichen Parteien nur daben sie die ganze bestehen von Engels

Schaftliche Gewalt einer Klasse vereint und das Kapital selbständig und unter der Arbeiter und der politischen Ideen einer Klasse gegen die Bourgeoisie hat nicht nur das Proletariat die Klasse gesellschaftliche Produkte, die in ihrer proletarischen Bevölkerung aller pilitischen Klassen. Was alle deutsche Ausgabe 1872)
Vorwort (deutsche Ausgabe 1872)
Vorwort (italienische Ausgabe 18