In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import io
import re
from sklearn.preprocessing import OneHotEncoder
import joblib
from tensorflow import keras
# Environment switch: True selects the Colab dataset path, False the Deepnote one.
using_colab = True

### Loading and _cleaning_ of the non-toxic tweets

In [117]:
# Pick the dataset location that matches the hosting environment.
path_to_txt = (
    '/content/tweets_non_negative.txt'
    if using_colab
    else '/datasets/toxic-dataset/non_toxic_tweets.txt'  # deepnote
)

# Read the whole corpus at once and lowercase it.
with io.open(path_to_txt, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Drop every character that is not an ASCII letter, the digit 1 or 9, or a
# space. Line breaks are not in the allowed set, so they are removed as well
# and consecutive lines of the file end up joined without a separator.
# NOTE(review): only the digits 1 and 9 survive - presumably so that
# "covid19" is preserved; confirm this is intentional.
clean_text = re.sub(r'[^A-Za-z19 ]+', '', text)

# Vocabulary: the distinct characters left after cleaning, in sorted order.
chars = sorted(set(clean_text))
print('total chars:', len(chars))

# Lookup tables between characters and their integer ids.
char_to_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_to_char = dict(enumerate(chars))

corpus length: 6741499
total chars: 29


In [119]:
# chars

In [120]:
clean_text[0]

'c'

In [140]:
# Cut the text into semi-redundant (overlapping) sequences of MAXLEN characters.
MAXLEN = 25            # characters per input window
WINDOWS_STEP = 3       # offset between the starts of two consecutive windows
ADDITIONAL_CHARS = 1   # characters right after each window that form the target

# sentences will act as 'X' and next_chars as 'y',
# so it will look like this:
# ...clean tex | t
# ...the sente | c
# ...from covi | d
# etc.

# The last usable start index leaves room for the window AND its complete
# target. With ADDITIONAL_CHARS == 1 this yields exactly the same windows as
# range(0, len(clean_text) - MAXLEN, WINDOWS_STEP); with a larger value it
# prevents truncated targets at the end of the corpus, which would make the
# per-target arrays built later ragged.
last_start = len(clean_text) - MAXLEN - ADDITIONAL_CHARS

sentences = []
next_chars = []
for start in range(0, last_start + 1, WINDOWS_STEP):
    split = start + MAXLEN
    sentences.append(clean_text[start:split])
    next_chars.append(clean_text[split:split + ADDITIONAL_CHARS])
print('nb sequences:', len(sentences))

nb sequences: 2101736


In [141]:
sentences[198]

'meddle in short how can i'

In [142]:
next_chars[198]

' '

In [143]:
def convert_string_to_int(string):
    '''
    Convert a string into a numpy array of integer character ids.

    Every character is looked up in the module-level ``char_to_indices``
    table.

    Parameters
    ----------
    string : str
        Text to encode.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding one id per character of ``string``. An
        empty string gives an empty integer array.

    Raises
    ------
    KeyError
        If a character of ``string`` is not a key of ``char_to_indices``.
    '''
    list_of_ints = [char_to_indices[ch] for ch in string]
    # Pin the dtype: without it np.array([]) defaults to float64, so an empty
    # string would come back as a float array instead of an integer one.
    return np.array(list_of_ints, dtype=int)

def convert_int_to_string(list_of_ints):
    '''
    Decode a sequence of integer character ids back into a string.

    Every id is looked up in the module-level ``indices_to_char`` table and
    the resulting characters are concatenated in order.

    Parameters
    ----------
    list_of_ints : iterable of int
        Character ids to decode.

    Returns
    -------
    str
        The decoded text; an empty input gives an empty string.
    '''
    decoded_chars = []
    for char_id in list_of_ints:
        decoded_chars.append(indices_to_char[char_id])
    return ''.join(decoded_chars)


In [144]:
# Earlier approach, kept for reference: one-hot encode the sentences directly. Not ideal because of the resources it consumes.
# X = np.zeros((len(sentences), MAXLEN, len(chars)), dtype=np.bool)
# y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
# for i, sentence in enumerate(sentences):
#     for t, char in enumerate(sentence):
#         X[i, t, char_to_indices[char]] = 1
#     y[i, char_to_indices[next_chars[i]]] = 1

# Integer-encode every input window (X) and every target string (y_arr);
# each becomes one row of the resulting array.
X = np.array(list(map(convert_string_to_int, sentences)))
y_arr = np.array(list(map(convert_string_to_int, next_chars)))

In [145]:
# Transform each target integer (corresponding to a char) into a one-hot
# encoded row.
# The categories are pinned to the full vocabulary, one list per target
# column, so that output column j always stands for character id j - even if
# some character never occurs as a target. Left on 'auto', the encoder would
# only create columns for the ids it happens to see in y_arr.
enc = OneHotEncoder(categories=[np.arange(len(chars))] * y_arr.shape[1])

enc.fit(y_arr)
# joblib.dump(enc, "/content/onehot_encoder.joblib")
# toarray() returns a plain ndarray; todense() would return the legacy
# np.matrix type instead.
y = enc.transform(y_arr).toarray()

In [146]:
# enc_hot = joblib.load('/content/onehot_encoder.joblib')

In [147]:
y.shape

(2101736, 29)

In [148]:
X.shape

(2101736, 25)

### Creation and training of the model

In [149]:
# Add a trailing axis of size 1 so X becomes (samples, timesteps, 1), the
# 3-D layout the recurrent model below is built for.
# Keras' input_shape leaves out the sample axis: data shaped (1000, 10)
# would be declared as input_shape=(10,).
n_samples, n_timesteps = len(X), len(X[0])
X = X.reshape((n_samples, n_timesteps, 1))
X.shape

(2101736, 25, 1)

In [150]:
# Locate the TPU cluster and connect this runtime to it.
# NOTE(review): tpu='' presumably lets the resolver auto-detect the TPU
# attached to the Colab runtime - confirm.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

# Distribution strategy for the TPU; the model is created inside
# strategy.scope() in a later cell.
strategy = tf.distribute.TPUStrategy(resolver)

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.






INFO:tensorflow:Initializing the TPU system: grpc://10.31.77.98:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.31.77.98:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]
INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [151]:
X.shape

(2101736, 25, 1)

In [152]:
with strategy.scope():
    # Next-character classifier: one LSTM layer (750 units) over the encoded
    # window, followed by a softmax with one output per one-hot column of y.
    model = keras.models.Sequential([
        keras.layers.InputLayer(input_shape=(X.shape[1], X.shape[2])),
        keras.layers.LSTM(750),
        keras.layers.Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [153]:
# Train on the full set of windows; no validation data is held out here.
model.fit(
    x=X,
    y=y,
    batch_size=4096,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f9529659410>

In [154]:
# Persist the trained model; the .h5 suffix selects the HDF5 format.
model.save(filepath='/content/generador_final___len25.h5')

In [139]:
chars

[' ',
 '1',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']