In [1]:
#importing libraries
import pickle

import numpy
import spacy
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, GRU, Bidirectional
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array
from spacy.vocab import Vocab

import nltk
#nltk.download('stopwords')

In [2]:
#reading processed data
# Only the first MAX_CHARS characters of the file are used as the training text.
MAX_CHARS = 100000
# Explicit UTF-8: the text contains emoji and accented characters, and the
# platform default encoding is not guaranteed to decode them. The context
# manager closes the file handle; read(MAX_CHARS) avoids loading the whole file.
with open('cleandata.csv', encoding='utf-8') as f:
    data = f.read(MAX_CHARS)
# show a short preview instead of echoing the whole string into the notebook
data[:500]

'status_message\nVamos tomar chopp hoje😍\n"Todo mundo curte um hot roll. E quem negar, mentiu. 😎 \n\nAbrimos às 18h00!\n🎌 Batista Campos - Rua dos Pariquis, 1712\n🎌 Av. Senador Lemos, 356 🎌 Av. Duque de Caxias, 510 \nDelivery:\n2121-4463"\nMesa cheia é mesa alegre!\nNosso Filé com Molho de Vinho é fabuloso! Ele ainda acompanha um delicioso risoto de funghi. Venha apreciar os sabores do Vegas! #VegasRestaurante #VivaVegas\n"Daqui a pouco tem Malino’s Hein!\n🍔🍺🍟😍"\n"Você sabia que além de hambúrgueres,temos também Coquetel de Fruta😲\nIsso mesmo, então vc já sabe☺☺☺"\n"Que esse é um dos pratos mais famosos e respeitados aqui em Belém, todos já sabem. Mas só dá para descobrir o motivo provando. Venha experimentar o Vatapá do Point e se surpreenda! 😉 👏 👌\n\n👉 Point Aquários\n- Municipalidade, nº 897, esquina da Wandenkolk.\n- (91) 3223-1686 / 3085-7229.\n- Domingo – 8h às 17h. Segunda a sábado – 8h às 22h30."\n"30ª ED. DO T.D.O. (Tributo Duplo Oficial)\nLEGIÃO URBANA E CHARLIE BROWN JR\n\nH

In [3]:
#function for preparing text data into sequences for training
def data_sequencing(data, tokenizer_path='tokenizer.pkl', max_length_path='max_length.pkl'):
    """Turn raw text into padded forward and reversed next-word training sequences.

    The text is tokenised with a Keras ``Tokenizer`` fitted on the whole input,
    then split on '.' into segments. For every segment with at least two
    tokens, each prefix of length >= 2 becomes one training sequence; the same
    is done on the reversed token list of the segment. All sequences are
    pre-padded with zeros to the longest sequence length, and the last column
    is split off as the prediction target.

    Args:
        data: the training text as a single string.
        tokenizer_path: file the fitted tokenizer is pickled to.
        max_length_path: file the maximum sequence length is pickled to.

    Returns:
        Tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)`` where ``X`` /
        ``rev_X`` are the padded sequences without their last column, ``y`` /
        ``rev_y`` are the last columns, ``max_length`` is the longest unpadded
        sequence length and ``vocab_size`` is ``len(word_index) + 1``.

    Raises:
        ValueError: if no segment of ``data`` contains at least two tokens, so
            there is nothing to train on.

    Side effects:
        Writes ``tokenizer_path`` and ``max_length_path`` and prints the
        vocabulary size, the number of sequences and the maximum length.
    """
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    with open(tokenizer_path, 'wb') as f: # Save the tokeniser by pickling it
        pickle.dump(tokenizer, f)

    # retrieve vocabulary size (+1: word indices start at 1, 0 is the padding value)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)

    # create line-based sequences
    # NOTE(review): the text is split on '.' only, so a segment can span
    # several newline-separated posts - confirm this is intended.
    sequences = []
    rev_sequences = []
    for line in data.split('.'):
        encoded = tokenizer.texts_to_sequences([line])[0]
        rev_encoded = encoded[::-1]
        # every prefix with at least two tokens: context word(s) + target word
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])
    print('Total Sequences: %d' % len(sequences))

    if not sequences:
        raise ValueError(
            "data_sequencing: no segment of the input contains at least two "
            "tokens, so no training sequences can be built"
        )

    #find max sequence length
    max_length = max(len(seq) for seq in sequences)
    with open(max_length_path, 'wb') as f: # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad sequences and create the forward sequence
    # (pad_sequences already returns a 2-D numpy array)
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    X, y = sequences[:, :-1], sequences[:, -1]

    #pad sequences and create the reverse sequencing
    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    rev_X, rev_y = rev_sequences[:, :-1], rev_sequences[:, -1]

    return X, y, rev_X, rev_y, max_length, vocab_size

In [4]:
# Build the forward and reversed training sequences from the text, and keep
# the maximum sequence length and the vocabulary size for the model definitions.
(X, y,
 rev_X, rev_y,
 max_length, vocab_size) = data_sequencing(data)

Vocabulary Size: 3590
Total Sequences: 15938
Max Sequence Length: 229


In [5]:
# define forward sequence model
model_gru = Sequential()
# inputs are the padded sequences without their last (target) column, hence max_length-1
model_gru.add(Embedding(vocab_size,100, input_length=max_length-1))
model_gru.add(Bidirectional(GRU(100)))
# one softmax output per vocabulary index
model_gru.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output
model_gru.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 228, 100)          359000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               121200    
_________________________________________________________________
dense (Dense)                (None, 3590)              721590    
Total params: 1,201,790
Trainable params: 1,201,790
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# define reverse model
rev_model_gru = Sequential()
# inputs are the padded reversed sequences without their last (target) column, hence max_length-1
rev_model_gru.add(Embedding(vocab_size, 100, input_length=max_length-1))
rev_model_gru.add(Bidirectional(GRU(100)))
# one softmax output per vocabulary index
rev_model_gru.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output
rev_model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 228, 100)          359000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               121200    
_________________________________________________________________
dense_1 (Dense)              (None, 3590)              721590    
Total params: 1,201,790
Trainable params: 1,201,790
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# compile forward sequence network
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Stop once the training loss has stopped improving and roll back to the best
# weights: each epoch takes minutes, and without this the model saved below is
# whatever the last epoch produced, even if the loss has drifted back up.
early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# fit network (epochs=200 is now an upper bound)
model_gru.fit(X, y, batch_size=100, epochs=200, verbose=2, callbacks=[early_stop])
# save the model to file
model_gru.save('model_gru.h5')

Epoch 1/200
160/160 - 89s - loss: 7.2964 - accuracy: 0.0341
Epoch 2/200
160/160 - 100s - loss: 6.6626 - accuracy: 0.0437
Epoch 3/200
160/160 - 153s - loss: 6.0243 - accuracy: 0.0797
Epoch 4/200
160/160 - 179s - loss: 5.3787 - accuracy: 0.1429
Epoch 5/200
160/160 - 177s - loss: 4.8143 - accuracy: 0.2066
Epoch 6/200
160/160 - 179s - loss: 4.3372 - accuracy: 0.2561
Epoch 7/200
160/160 - 178s - loss: 3.9355 - accuracy: 0.2954
Epoch 8/200
160/160 - 178s - loss: 3.5922 - accuracy: 0.3340
Epoch 9/200
160/160 - 178s - loss: 3.2867 - accuracy: 0.3702
Epoch 10/200
160/160 - 178s - loss: 3.0162 - accuracy: 0.4073
Epoch 11/200
160/160 - 178s - loss: 2.7690 - accuracy: 0.4512
Epoch 12/200
160/160 - 178s - loss: 2.5461 - accuracy: 0.4914
Epoch 13/200
160/160 - 178s - loss: 2.3399 - accuracy: 0.5333
Epoch 14/200
160/160 - 179s - loss: 2.1546 - accuracy: 0.5706
Epoch 15/200
160/160 - 179s - loss: 1.9830 - accuracy: 0.6040
Epoch 16/200
160/160 - 179s - loss: 1.8280 - accuracy: 0.6367
Epoch 17/200
160/1

Epoch 133/200
160/160 - 127s - loss: 0.0605 - accuracy: 0.9753
Epoch 134/200
160/160 - 127s - loss: 0.0598 - accuracy: 0.9755
Epoch 135/200
160/160 - 129s - loss: 0.0592 - accuracy: 0.9756
Epoch 136/200
160/160 - 172s - loss: 0.0584 - accuracy: 0.9763
Epoch 137/200
160/160 - 178s - loss: 0.0589 - accuracy: 0.9758
Epoch 138/200
160/160 - 178s - loss: 0.0581 - accuracy: 0.9757
Epoch 139/200
160/160 - 178s - loss: 0.0580 - accuracy: 0.9761
Epoch 140/200
160/160 - 178s - loss: 0.0584 - accuracy: 0.9752
Epoch 141/200
160/160 - 178s - loss: 0.0576 - accuracy: 0.9756
Epoch 142/200
160/160 - 178s - loss: 0.0575 - accuracy: 0.9759
Epoch 143/200
160/160 - 184s - loss: 0.0576 - accuracy: 0.9759
Epoch 144/200
160/160 - 179s - loss: 0.0572 - accuracy: 0.9759
Epoch 145/200
160/160 - 179s - loss: 0.0576 - accuracy: 0.9756
Epoch 146/200
160/160 - 188s - loss: 0.0582 - accuracy: 0.9760
Epoch 147/200
160/160 - 179s - loss: 0.0586 - accuracy: 0.9752
Epoch 148/200
160/160 - 178s - loss: 0.0593 - accuracy:

In [8]:
# compile reverse sequence network
rev_model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Stop once the training loss has stopped improving and roll back to the best
# weights: each epoch takes minutes, and without this the model saved below is
# whatever the last epoch produced, even if the loss has drifted back up.
rev_early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# fit network (epochs=200 is now an upper bound)
rev_model_gru.fit(rev_X, rev_y, batch_size=100, epochs=200, verbose=2, callbacks=[rev_early_stop])
# save the model to file
rev_model_gru.save('rev_model_gru.h5')

Epoch 1/200
160/160 - 135s - loss: 7.2634 - accuracy: 0.0376
Epoch 2/200
160/160 - 128s - loss: 6.5912 - accuracy: 0.0480
Epoch 3/200
160/160 - 127s - loss: 6.0988 - accuracy: 0.0742
Epoch 4/200
160/160 - 128s - loss: 5.5755 - accuracy: 0.1285
Epoch 5/200
160/160 - 128s - loss: 5.0530 - accuracy: 0.1872
Epoch 6/200
160/160 - 128s - loss: 4.5776 - accuracy: 0.2451
Epoch 7/200
160/160 - 128s - loss: 4.1588 - accuracy: 0.2936
Epoch 8/200
160/160 - 128s - loss: 3.7928 - accuracy: 0.3323
Epoch 9/200
160/160 - 128s - loss: 3.4699 - accuracy: 0.3662
Epoch 10/200
160/160 - 128s - loss: 3.1820 - accuracy: 0.4026
Epoch 11/200
160/160 - 127s - loss: 2.9214 - accuracy: 0.4386
Epoch 12/200
160/160 - 128s - loss: 2.6813 - accuracy: 0.4833
Epoch 13/200
160/160 - 128s - loss: 2.4636 - accuracy: 0.5219
Epoch 14/200
160/160 - 128s - loss: 2.2592 - accuracy: 0.5596
Epoch 15/200
160/160 - 130s - loss: 2.0757 - accuracy: 0.5954
Epoch 16/200
160/160 - 128s - loss: 1.9044 - accuracy: 0.6303
Epoch 17/200
160/

Epoch 133/200
160/160 - 128s - loss: 0.0372 - accuracy: 0.9830
Epoch 134/200
160/160 - 129s - loss: 0.0370 - accuracy: 0.9829
Epoch 135/200
160/160 - 129s - loss: 0.0371 - accuracy: 0.9823
Epoch 136/200
160/160 - 129s - loss: 0.0368 - accuracy: 0.9827
Epoch 137/200
160/160 - 129s - loss: 0.0459 - accuracy: 0.9814
Epoch 138/200
160/160 - 128s - loss: 0.0503 - accuracy: 0.9807
Epoch 139/200
160/160 - 129s - loss: 0.0467 - accuracy: 0.9808
Epoch 140/200
160/160 - 128s - loss: 0.0415 - accuracy: 0.9824
Epoch 141/200
160/160 - 129s - loss: 0.0391 - accuracy: 0.9822
Epoch 142/200
160/160 - 129s - loss: 0.0390 - accuracy: 0.9821
Epoch 143/200
160/160 - 129s - loss: 0.0377 - accuracy: 0.9828
Epoch 144/200
160/160 - 128s - loss: 0.0377 - accuracy: 0.9827
Epoch 145/200
160/160 - 128s - loss: 0.0368 - accuracy: 0.9829
Epoch 146/200
160/160 - 129s - loss: 0.0360 - accuracy: 0.9829
Epoch 147/200
160/160 - 128s - loss: 0.0362 - accuracy: 0.9828
Epoch 148/200
160/160 - 128s - loss: 0.0359 - accuracy:

In [9]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_preds=5):
    """Return the most probable next words for ``seed_text`` as one string.

    The seed text is encoded with ``tokenizer``, pre-padded to ``max_length``
    and passed to ``model.predict``. The ``n_preds`` highest-probability
    vocabulary indices are mapped back to words, words found in the
    module-level ``stopwords`` collection are dropped, and the remaining words
    are concatenated in descending probability order.

    Args:
        model: object exposing ``predict(encoded, verbose=0)`` that returns
            one probability per vocabulary index.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_length: length the encoded seed is padded/truncated to.
            NOTE(review): the models in this notebook are built with
            ``input_length=max_length-1``, so callers presumably need to pass
            that value here - confirm at the call sites.
        seed_text: text to continue; an empty string short-circuits.
        n_preds: number of top candidates to consider (default 5, the value
            that used to be hard-coded).

    Returns:
        ``""`` for an empty seed, otherwise a string in which every kept word
        is preceded by a single space (e.g. ``" foo bar"``). Indices without a
        vocabulary entry (such as the padding index 0) are skipped.

    NOTE(review): ``stopwords`` is read from module scope but is not defined in
    the visible cells; presumably it is loaded from nltk elsewhere - confirm it
    is defined before this function is called.
    """
    if seed_text == "":
        return ""

    # encode the text as integers and pre-pad to a fixed length
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')

    # predict probabilities for each word and keep the n_preds most likely indices
    proba = model.predict(encoded, verbose=0).flatten()
    top_indices = numpy.argsort(-proba)[:n_preds]

    # build the reverse lookup once instead of scanning word_index per prediction
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    pred_words = ""
    # iterate over the indices themselves so a vocabulary smaller than n_preds
    # cannot cause an out-of-range access
    for index in top_indices:
        word = index_to_word.get(int(index))
        if word is not None and word not in stopwords:
            pred_words += ' ' + word
    return pred_words