In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import io
import re

### Loading and _cleaning_ of the non-toxic tweets

In [None]:
# Location of the raw corpus of non-toxic tweets.
path_to_txt = '/datasets/toxic-dataset/non_toxic_tweets.txt'

# Read the whole file in one go and lowercase it so the vocabulary stays small.
with io.open(path_to_txt, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Drop every character that is not an ASCII letter, the digit 1 or 9,
# a space or a newline (newlines are kept on purpose).
# NOTE(review): only the digits '1' and '9' survive this filter - presumably
# to keep tokens such as "covid19"; confirm that '0-9' was not intended.
clean_text = re.compile(r'[^A-Za-z19 \n]+').sub('', text)

# Sorted vocabulary of the distinct characters left after cleaning.
chars = sorted(set(clean_text))
print('total chars:', len(chars))

# Lookup tables for both directions: index -> character and character -> index.
indices_to_char = dict(enumerate(chars))
char_to_indices = {ch: idx for idx, ch in indices_to_char.items()}

corpus length: 6706043
total chars: 30


In [None]:
# Sanity check: first character of the cleaned corpus.
clean_text[0]

'c'

In [None]:
# Sliding-window configuration: every sample is MAXLEN characters long,
# consecutive windows start WINDOWS_STEP characters apart, and the target is
# the ADDITIONAL_CHARS character(s) that come right after the window.
MAXLEN = 30
WINDOWS_STEP = 3
ADDITIONAL_CHARS = 1

# `sentences` plays the role of X and `next_chars` the role of y, e.g.
#   ...clean tex | t
#   ...the sente | c
#   ...from covi | d
# The range stops before the last MAXLEN characters, so every window is
# followed by at least one character.
window_starts = range(0, len(clean_text) - MAXLEN, WINDOWS_STEP)
sentences = [clean_text[start: start + MAXLEN] for start in window_starts]
next_chars = [
    clean_text[start + MAXLEN: start + MAXLEN + ADDITIONAL_CHARS]
    for start in window_starts
]
print('nb sequences:', len(sentences))

nb sequences: 2111668


In [None]:
# Inspect one example input window (index 99).
sentences[99]

'was the son of god changed fro'

In [None]:
# Target for window 99: the character that follows sentences[99] in the corpus.
next_chars[99]

'm'

In [None]:
def convert_string_to_int(string):
    '''
    Encode a string as a numpy array of vocabulary indices.

    Every character is looked up in the notebook-level ``char_to_indices``
    mapping.

    Parameters
    ----------
    string : str
        Text to encode.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding one index per character of ``string``.
        An empty string yields an empty *integer* array.

    Raises
    ------
    KeyError
        If ``string`` contains a character that is not a key of
        ``char_to_indices``.
    '''
    list_of_ints = [char_to_indices[ch] for ch in string]
    # Pin the dtype: without it numpy turns an empty list into a float64
    # array, so the result type would depend on the input being non-empty.
    return np.array(list_of_ints, dtype=int)

def convert_int_to_string(list_of_ints):
    '''
    Decode a sequence of vocabulary indices back into text.

    Each index is looked up in the notebook-level ``indices_to_char``
    mapping and the resulting characters are concatenated in order.

    Parameters
    ----------
    list_of_ints : iterable of int
        Indices to decode (for example the output of
        ``convert_string_to_int``).

    Returns
    -------
    str
        The decoded string; an empty input gives ``''``.

    Raises
    ------
    KeyError
        If an index is not a key of ``indices_to_char``.
    '''
    return ''.join(map(indices_to_char.__getitem__, list_of_ints))


In [None]:
# One-hot encoding every window (a len(sentences) x MAXLEN x len(chars)
# boolean tensor for X plus a len(sentences) x len(chars) one for y) is not
# ideal because of the resources it consumes, so the windows are stored as
# integer indices instead.
#
# Encoding window by window builds one small array per window and re-encodes
# each character of the corpus once for every window that overlaps it.
# Instead, encode the corpus a single time and assemble the windows from
# strided slices: column k of X holds the k-th character of every window.
encoded_text = convert_string_to_int(clean_text)
n_windows = len(sentences)
X = np.stack(
    [encoded_text[k::WINDOWS_STEP][:n_windows] for k in range(MAXLEN)],
    axis=1,
)

# X is a 2-D integer array whose row i encodes sentences[i].
assert X.shape == (n_windows, MAXLEN)

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# NOTE(review): `sentences_char` is not defined in any cell visible above, so
# this line raises NameError on a fresh kernel - confirm which variable was
# meant (possibly `sentences`).
sentences_vect = np.array(sentences_char)

In [None]:
# StringLookup maps whole string tokens to ids. The vocabulary here is made of
# single characters, so each window has to be split into characters first;
# looking up an entire 30-character window as one token never matches the
# vocabulary and yields the out-of-vocabulary id 0 for every entry.
lalayer = tf.keras.layers.StringLookup(vocabulary=chars)
sentence_chars = tf.strings.unicode_split(sentences, input_encoding='UTF-8')
# One row of ids per window. With the layer's defaults id 0 is reserved for
# out-of-vocabulary tokens, so ids are char_to_indices values shifted by one.
# NOTE(review): this materialises one string token per character of every
# window; for the full corpus consider batching (e.g. inside tf.data) if
# memory becomes a problem.
sentenciado = lalayer(sentence_chars)

In [None]:
# Peek at the first ten encoded entries. Index 0 is StringLookup's default
# out-of-vocabulary slot, so zeros mean the inputs were not found in the vocabulary.
sentenciado[0:10]

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

In [None]:
# Hand-written sample tweet (in Spanish) for trying out the encoding helpers.
# Note that it contains 'ó', which is not a key of char_to_indices.
tweet_prueba = 'el covichito es malo y te odio y odio a todo el mundo y esto es un tweet tóxito'

In [None]:
# First 30 characters of the sample tweet (30 is also the value of MAXLEN).
tweet_prueba[:30]

'el covichito es malo y te odio'

In [None]:
# Show the character -> index table built from the cleaned corpus.
print(char_to_indices)

{'\n': 0, ' ': 1, '1': 2, '9': 3, 'a': 4, 'b': 5, 'c': 6, 'd': 7, 'e': 8, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14, 'l': 15, 'm': 16, 'n': 17, 'o': 18, 'p': 19, 'q': 20, 'r': 21, 's': 22, 't': 23, 'u': 24, 'v': 25, 'w': 26, 'x': 27, 'y': 28, 'z': 29}


In [None]:
# convert_int_to_string(convertio)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a1e7defd-57f6-4a69-933f-51f8c5c266bb' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>