<a href="https://colab.research.google.com/github/noircir/Python/blob/master/09_Text_generation_with_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# CAREFUL ! THE model.fit() RUNS FOR ABOUT 2-3 HOURS ON CPU ! CHANGE TO GPU ! (3+ times faster)

# Load text

In [0]:
#!pip install docx2txt

In [0]:
import pandas as pd
import docx2txt
import re
import string

In [0]:
def compress(text):
  '''
  removes blank lines and replaces multiple spaces with one space
  '''
  text = text.replace('\t', ' ')
  return re.sub('\n+', '\n', text)

In [0]:
text = docx2txt.process ('/content/drive/My Drive/Colab Notebooks/Self-learning chatbot/texts/document16.docx')

In [0]:
text

In [0]:
text = text.replace(u'\xa0', u' ')

In [0]:
text

In [0]:
text = compress(text)

In [0]:
text

## Tokenize and Clean Text

In [0]:
import spacy

In [0]:
#!python -m spacy download fr_core_news_sm

In [0]:
# To load French vocab, RESTART THE RUNTIME !!

nlp = spacy.load('fr_core_news_sm',disable=['parser', 'tagger','ner'])

In [0]:
# (Needs further fine-tuning for multiple blank lines)

def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) 
    if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n \n\n\t\t \n\n\n\n\n \n\n\n\n \n\n\n\t']

In [0]:
tokens = separate_punc(text)

In [0]:
#tokens

In [18]:
len(tokens)

14485

## Create Sequences of Tokens

In [0]:
# organize into sequences of tokens. 
# A sequence of 20 words (for example), then predict the 21th word. 

train_len = 20+1 # training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [0]:
# Given 20 words, can you predict the 21st (the last one) ?

' '.join(text_sequences[100])

In [0]:
' '.join(text_sequences[220])

In [0]:
' '.join(text_sequences[400])

In [29]:
len(text_sequences)

14464

## Keras Tokenization

In [30]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
# Integer-encode sequences of words
# Tokenizer() has many options, including punctiuation and the number of words to be kept...

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [32]:
# Each of these numbers is an id for a particular word

sequences[0]

[48,
 9,
 45,
 220,
 175,
 9,
 48,
 9,
 45,
 54,
 11,
 2003,
 592,
 591,
 1,
 469,
 2001,
 11,
 142,
 468,
 34]

In [69]:
tokenizer.index_word[50]

'prix'

In [70]:
for i in sequences[50]:
    print(f'{i} : {tokenizer.index_word[i]}')

152 : documentation
9 : d’
1132 : appels
9 : d’
45 : offres
142 : biens
468 : informatiques
34 : logiciel
11 : contrat
278 : version
763 : détaillée
764 : 2019
173 : 12
174 : 20
1133 : table
17 : des
1134 : matières
592 : page
765 : préambule
1135 : 9
14 : 
  


In [0]:
# Word counts

#tokenizer.word_counts

In [42]:
# Vocabulary size

vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2004

## Convert to Numpy Matrix

In [0]:
import numpy as np

In [0]:
sequences = np.array(sequences)

In [45]:
sequences

array([[  48,    9,   45, ...,  142,  468,   34],
       [   9,   45,  220, ...,  468,   34,  278],
       [  45,  220,  175, ...,   34,  278,  763],
       ...,
       [   7,  216,   17, ..., 1125, 1126, 1127],
       [ 216,   17,  219, ..., 1126, 1127,   11],
       [  17,  219,  162, ..., 1127,   11,   46]])

# Creating an LSTM-based model

Predict the last word in a sequence

In [0]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding # Embedding layer deals with vocabulary

In [0]:
# PARAMETERS CHOICE

# Activation = RELU
# The size of the output layer is 'vocabulary_size'
# Loss = 'categorical_crossentropy'

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    # Embedding turns positive integers(indexes) into dense vectors of fixed size (see docs).
    model.add(Embedding(vocabulary_size, 20, input_length=seq_len)) 
    model.add(LSTM(150, return_sequences=True)) # better to take multiples of seq_len; smalle batches => faster
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

## Feature / Label Split

In [0]:
from keras.utils import to_categorical

In [50]:
# First 20 words (compare to 'sequences' : it's everything without the last index)
sequences[:,:-1]

array([[  48,    9,   45, ...,   11,  142,  468],
       [   9,   45,  220, ...,  142,  468,   34],
       [  45,  220,  175, ...,  468,   34,  278],
       ...,
       [   7,  216,   17, ...,  174, 1125, 1126],
       [ 216,   17,  219, ..., 1125, 1126, 1127],
       [  17,  219,  162, ..., 1126, 1127,   11]])

In [51]:
# last word
sequences[:,-1]

array([  34,  278,  763, ..., 1127,   11,   46])

In [53]:
# X is the arrays of 20 words (sequences)

X = sequences[:,:-1]

# y (the target) is the 21st element
y = sequences[:,-1]

# one-hot
y = to_categorical(y, num_classes=vocabulary_size+1)

seq_len = X.shape[1]

seq_len

20

## Training the model

In [54]:
# define model
model = create_model(vocabulary_size+1, seq_len) # +1 for Embeddings






Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 20)            40100     
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 150)           102600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_2 (Dense)              (None, 2005)              302755    
Total params: 648,705
Trainable params: 648,705
Non-trainable params: 0
_________________________________________________________________


In [0]:
from pickle import dump,load

In [56]:
# fit model

# CAREFUL ! IT RUNS FOR ABOUT 2 HOURS ON CPU ! CHANGE TO GPU !

model.fit(X, y, batch_size=128, epochs=300,verbose=1). # epochs: at least > 200

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/300





Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/3

<keras.callbacks.History at 0x7f98e3aa1080>

# Generating New Text

In [0]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [0]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate 
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0] # [0] returns index 
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

## Save the model

In [0]:
model.save('LSTM_model.h5')

## Grab a random seed sequence

In [58]:
text_sequences[500]

['de',
 'services',
 'entrepreneur',
 '21',
 '\n  ',
 '7.01',
 'statut',
 'importante',
 '22',
 '\n  ',
 '7.02',
 'capacité',
 'importante',
 '22',
 '\n  ',
 '7.03',
 'divulgation',
 'importante',
 '22',
 '\n  ',
 '7.04']

In [0]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [60]:
random_seed_text = text_sequences[random_pick]
random_seed_text

['cadre',
 'd’',
 'un',
 'contrat',
 'antérieur',
 'avec',
 'un',
 'organisme',
 'public',
 'du',
 'québec',
 'fait',
 'l’',
 'objet',
 'd’',
 'une',
 'évaluation',
 'de',
 'rendement',
 'insatisfaisant',
 'de']

In [61]:
seed_text = ' '.join(random_seed_text)
seed_text

'cadre d’ un contrat antérieur avec un organisme public du québec fait l’ objet d’ une évaluation de rendement insatisfaisant de'

In [65]:
## GENERATED NEW TEXT !!!

generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

'la part de cet organisme public ne pas faire l’ objet d’ une requête en faillite volontaire ou involontaire ou'

## Exploring generated sequence

In [67]:
for i,word in enumerate(text.split()):
    if word == 'organisme':
        print(' '.join(text.split()[i-20:i+20]))
        print('\n')

redevable d’un montant exigible en vertu d’une loi fiscale ou alimentaire, l’ORGANISME PUBLIC, étant ou agissant pour le compte d’un organisme public tel que défini à l’article 31.1.4 de la Loi sur l’administration fiscale, peut, s’il en est requis par


tel consentement doit notamment respecter les critères suivants : ne pas avoir, dans le cadre d’un contrat antérieur avec un organisme public du Québec, fait l’objet d’une évaluation de rendement insatisfaisant de la part de cet organisme public; ne pas


contrat antérieur avec un organisme public du Québec, fait l’objet d’une évaluation de rendement insatisfaisant de la part de cet organisme public; ne pas faire l’objet d’une requête en faillite volontaire ou involontaire ou de toute autre procédure relative à


dernier qui ne peut s’y opposer sans motif sérieux, ajouter, aux mêmes termes et conditions, d’autres établissements membres de son organisme parmi ceux indiqués à l’annexe A - Liste des Établissements Participants, dans la se

## To reuse the model, load it

In [0]:
from keras.models import load_model
model = load_model('LSTM_model.h5')
tokenizer =load(open('LSTM_model', 'rb'))

In [0]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)