### View alocated memory

In [152]:
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 10.5 GB  | Proc size: 3.3 GB
GPU RAM Free: 548MB | Used: 10891MB | Util  95% | Total 11439MB


## Import Dependency

In [153]:
!pip install gensim
import gensim
from gensim.models.doc2vec import LabeledSentence

import numpy as np
import os
import time
import codecs



### define output directory

In [154]:
data_dir = '../data'# data directory containing raw texts
save_dir = '../out' # directory to store trained NN models
seq_length = 30 # sequence length
vocab_file = os.path.join(save_dir, "words_vocab.pkl")
sequences_step = 1 #step to create sequences

## Load data

In [155]:
path = "../data/"
file_list = os.listdir( path )

remove_files = ['input.txt','LICENSE']
for r_files in remove_files:
    file_list.remove(r_files)

file_list = file_list[:10]
file_list

['203.txt',
 '103.txt',
 '209.txt',
 '403.txt',
 '107.txt',
 '407.txt',
 '202.txt',
 '301.txt',
 '111.txt',
 '101.txt']

In [156]:
! pip install spacy
#import spacy, and french model
import spacy
! python -m spacy download fr
nlp = spacy.load('fr')


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/fr

    You can now load the model via spacy.load('fr')



In [157]:
#initiate sentences and labels lists
sentences = []
sentences_label = []

**create_sentences** function read a document and return a tokenize list of the sentences of the document

In [158]:
#create sentences function:
def create_sentences(doc):
    ponctuation = [".","?","!",":","…"]
    sentences = []
    sent = []
    for word in doc:
        if word.text not in ponctuation:
            if word.text not in ("\n","\n\n",'\u2009','\xa0'):
                sent.append(word.text.lower())
        else:
            sent.append(word.text.lower())
            if len(sent) > 1:
                sentences.append(sent)
            sent=[]
    return sentences

Convert all documents to list of tockens.

In [159]:
#create sentences from files
for file_name in file_list:
    #read data
    input_file = os.path.join(data_dir, file_name)
    
    with codecs.open(input_file, "r", encoding='UTF-8') as f:
        data = f.read()
        
    #create sentences
    doc = nlp(data)
    sents = create_sentences(doc)
    sentences = sentences + sents

In [160]:
sents[0]

['—', 'non', ',', 'celui-là', 'n’', 'y', 'parvient', 'pas', 'non', 'plus', '!']

In [161]:
np.array(sentences)[0]

['—', 'tu', 'sais', 'ce', 'que', 'c’', 'est', '?']

In [162]:
len(sentences)

2442

In [163]:
#create labels
for i in range(np.array(sentences).shape[0]):
    sentences_label.append("ID" + str(i))

In [164]:
sentences_label[:10]

['ID0', 'ID1', 'ID2', 'ID3', 'ID4', 'ID5', 'ID6', 'ID7', 'ID8', 'ID9']

In [165]:
printm()

Gen RAM Free: 10.4 GB  | Proc size: 3.6 GB
GPU RAM Free: 548MB | Used: 10891MB | Util  95% | Total 11439MB


## Train doc2vec model

In [166]:
#class LabeledLineSentence(object):
#    def __init__(self, doc_list, labels_list):
#        self.labels_list = labels_list
#        self.doc_list = doc_list
#    def __iter__(self):
#        for idx, doc in enumerate(self.doc_list):
#            yield gensim.models.doc2vec.TaggedDocument(doc,[self.labels_list[idx]])

**LabeledLineSentence** is a generator function which Replaces “sentence as a list of words” from gensim.models.word2vec.Word2Vec.

**generator function** is used for large data.it process a single instance of a data, return it and then process the next instance of data.

In [167]:
def LabeledLineSentence(doc_list,labels_list):
        for idx, doc in enumerate(doc_list):
            #print(doc)
            #print("###" + str(labels_list[idx]))
            yield gensim.models.doc2vec.TaggedDocument(doc,[labels_list[idx]])

Create a own word imbading model.

In [168]:
def train_doc2vec_model(data, docLabels, size=300, sample=0.000001, dm=0, hs=1, window=10, min_count=0, workers=8,alpha=0.024, min_alpha=0.024, epoch=15, save_file='../out/doc2vec.w2v') :
    startime = time.time()
    
    print("{0} articles loaded for model".format(len(data)))

    it = LabeledLineSentence(data, docLabels)

    model = gensim.models.Doc2Vec(vector_size=size, sample=sample, dm=dm, window=window, min_count=min_count, workers=workers,alpha=alpha, min_alpha=min_alpha, hs=hs) # use fixed learning rate
    model.build_vocab(it)
    for epoch in range(epoch):
        print("Training epoch {}".format(epoch + 1))
        model.train(it,total_examples=model.corpus_count,epochs=model.epochs)
        # model.alpha -= 0.002 # decrease the learning rate
        # model.min_alpha = model.alpha # fix the learning rate, no decay
        
    #saving the created model
    model.save(os.path.join(save_file))
    print('model saved')

In [169]:
train_doc2vec_model(sentences, sentences_label, size=500,sample=0.0,alpha=0.025, min_alpha=0.001, min_count=0, window=10, epoch=20, dm=0, hs=1, save_file='../out/doc2vec.w2v')

2442 articles loaded for model
Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10
Training epoch 11
Training epoch 12
Training epoch 13
Training epoch 14
Training epoch 15
Training epoch 16
Training epoch 17
Training epoch 18
Training epoch 19
Training epoch 20
model saved


## Create the Input Dataset

As LSTM don't accept string or list of string. So represent the sentence with a list of int. len of the list is 500.

In [170]:
#import library
from six.moves import cPickle

#load the model
d2v_model = gensim.models.doc2vec.Doc2Vec.load('../out/doc2vec.w2v')

sentences_vector=[]

t = 500

for i in range(len(sentences)):
    if i % t == 0:
        print("sentence", i, ":", sentences[i])
        print("***")
        #print(d2v_model.infer_vector(sentences[i], alpha=0.001, min_alpha=0.001, steps=10000))
        #print(len(d2v_model.infer_vector(sentences[i], alpha=0.001, min_alpha=0.001, steps=10000)))
    sent = sentences[i]
    sentences_vector.append(d2v_model.infer_vector(sent, alpha=0.001, min_alpha=0.001, steps=10000))
    
#save the sentences_vector
sentences_vector_file = os.path.join(save_dir, "sentences_vector_500_a001_ma001_s10000.pkl")
with open(os.path.join(sentences_vector_file), 'wb') as f:
    cPickle.dump((sentences_vector), f)

sentence 0 : ['—', 'tu', 'sais', 'ce', 'que', 'c’', 'est', '?']
***
sentence 500 : ['le', 'monstre', 'se', 'retourne', '.']
***
sentence 1000 : ['cependant', ',', 'les', 'battements', 'sourds', 'qui', 's’', 'en', 'échappent', 'sont', 'apaisants', '.']
***
sentence 1500 : ['désormais', ',', 'ils', 'se', 'laissent', 'guider', 'par', 'les', 'autres', 'apprentis', 'vers', 'l’', 'endroit', 'où', 'les', 'attend', 'lothar', ',', 'en', 'faisant', 'très', 'attention', 'à', 'ne', 'pas', 'les', 'perdre', 'de', 'vue', '.']
***
sentence 2000 : ['le', 'jeune', 'homme', 'leur', 'lance', 'un', 'ultime', 'regard', 'sans', 'réel', 'espoir', ',', 'mais', 'ils', 'ne', 'desserrent', 'même', 'pas', 'les', 'dents', '.']
***


In [171]:
printm()

Gen RAM Free: 10.4 GB  | Proc size: 3.6 GB
GPU RAM Free: 548MB | Used: 10891MB | Util  95% | Total 11439MB


In [172]:
sentences[0]

['—', 'tu', 'sais', 'ce', 'que', 'c’', 'est', '?']

In [173]:
sentences_vector[0]

array([-3.72007315e-04,  7.68059108e-04,  2.41480710e-04, -3.96743271e-04,
        1.61228643e-04, -6.11503085e-04,  1.72399770e-04, -3.11214040e-04,
        6.24620763e-04, -5.36825624e-04,  8.25450872e-04, -9.68994864e-04,
        7.31707958e-04,  8.62358778e-04, -3.35665216e-04, -6.67530519e-04,
        6.09208764e-05,  3.04898480e-04, -2.53713195e-04,  4.45768055e-05,
        5.78516163e-04, -4.19058459e-04,  7.24629441e-04, -5.27163269e-04,
        7.71471416e-04,  3.85969586e-04,  9.03094246e-04, -1.00288547e-04,
       -8.42830559e-05, -7.94795575e-04, -1.98369657e-04, -1.54940990e-05,
       -2.70826888e-04,  2.46611075e-04, -5.62766509e-04, -7.38711678e-04,
       -9.51848284e-04, -9.36580531e-04,  1.93576023e-04,  1.71097257e-04,
        3.68148088e-04, -6.92102010e-04, -9.04273766e-04,  7.93280080e-04,
       -7.72567582e-04, -6.48292422e-04,  1.14945178e-05, -4.87285870e-04,
        3.74200434e-04,  7.07626110e-04, -6.34238706e-04, -8.73770987e-05,
        8.37425468e-04, -

Now consider every 15 element from **sentences_vector** list as input X and 16th element as output Y.<br> Note that, each element in **sentences_vector** is a list of int with len of 500

In [174]:
nb_sequenced_sentences = 15
vector_dim = 500

X_train = np.zeros((len(sentences), nb_sequenced_sentences, vector_dim), dtype=np.float)
y_train = np.zeros((len(sentences), vector_dim), dtype=np.float)

t = 1000
for i in range(len(sentences_label)-nb_sequenced_sentences-1):
    if i % t == 0: print("new sequence: ", i)
    
    for k in range(nb_sequenced_sentences):
        sent = sentences_label[i+k]
        vect = sentences_vector[i+k]
        
        if i % t == 0:
            print("  ", k + 1 ,"th vector for this sequence. Sentence ", sent, "(vector dim = ", len(vect), ")")
            #print(vect[j])
            #print(j)
            
        for j in range(len(vect)):
            #print(vect[j])
            X_train[i, k, j] = vect[j]
    
    senty = sentences_label[i+nb_sequenced_sentences]
    vecty = sentences_vector[i+nb_sequenced_sentences]
    if i % t == 0: print("  y vector for this sequence ", senty, ": (vector dim = ", len(vecty), ")")
    for j in range(len(vecty)):
        y_train[i, j] = vecty[j]

print(X_train.shape, y_train.shape)

new sequence:  0
   1 th vector for this sequence. Sentence  ID0 (vector dim =  500 )
   2 th vector for this sequence. Sentence  ID1 (vector dim =  500 )
   3 th vector for this sequence. Sentence  ID2 (vector dim =  500 )
   4 th vector for this sequence. Sentence  ID3 (vector dim =  500 )
   5 th vector for this sequence. Sentence  ID4 (vector dim =  500 )
   6 th vector for this sequence. Sentence  ID5 (vector dim =  500 )
   7 th vector for this sequence. Sentence  ID6 (vector dim =  500 )
   8 th vector for this sequence. Sentence  ID7 (vector dim =  500 )
   9 th vector for this sequence. Sentence  ID8 (vector dim =  500 )
   10 th vector for this sequence. Sentence  ID9 (vector dim =  500 )
   11 th vector for this sequence. Sentence  ID10 (vector dim =  500 )
   12 th vector for this sequence. Sentence  ID11 (vector dim =  500 )
   13 th vector for this sequence. Sentence  ID12 (vector dim =  500 )
   14 th vector for this sequence. Sentence  ID13 (vector dim =  500 )
   15 th

In [175]:
X_train[1][1]

array([-7.79064256e-04,  5.56897896e-04,  3.71264643e-04,  6.99287164e-04,
        7.11440283e-04,  6.45804219e-04,  8.68366100e-04, -1.90325620e-04,
       -6.62198523e-04,  6.80964033e-04, -3.56519558e-05, -6.10794756e-04,
       -8.95912584e-04, -2.32867838e-04, -2.81689718e-04,  9.55429350e-05,
       -7.84405624e-04, -9.86512314e-05,  4.23429447e-04, -1.22950496e-05,
        3.15491081e-04,  6.85536797e-05,  2.52309459e-04, -8.64669215e-04,
       -3.95687879e-04, -1.16321113e-04, -1.83953307e-05,  4.15063434e-04,
        9.60217381e-04,  9.32978874e-04, -5.95584803e-04,  3.56841454e-04,
        1.21537734e-04,  1.91114959e-04,  9.69990215e-04,  1.36062416e-04,
        2.95663427e-04,  2.75869941e-04,  9.07400390e-04,  5.46150666e-04,
        6.41204068e-04, -1.59310977e-04,  2.68938573e-04,  3.12351942e-04,
        5.07670804e-04, -5.42500311e-05,  2.60365341e-05, -4.15587449e-04,
        9.43719933e-04,  1.89663246e-04, -7.57334987e-04,  1.26255167e-04,
       -3.90451401e-04, -

In [176]:
sentences[0]

['—', 'tu', 'sais', 'ce', 'que', 'c’', 'est', '?']

## Create the Keras Model

In [177]:
from __future__ import print_function
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, Flatten, Bidirectional, Input, LSTM
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy, mean_squared_error, mean_absolute_error, logcosh
from keras.layers.normalization import BatchNormalization

def bidirectional_lstm_model(seq_length, vector_dim):
    print('Building LSTM model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vector_dim)))
    model.add(Dropout(0.5))
    model.add(Dense(vector_dim))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='logcosh', optimizer=optimizer, metrics=['acc'])
    print('LSTM model built.')
    return model

In [178]:
rnn_size = 512 # size of RNN
vector_dim = 500
learning_rate = 0.0001 #learning rate

model_sequence = bidirectional_lstm_model(nb_sequenced_sentences, vector_dim)

Building LSTM model...
LSTM model built.


Train the model and save it.

In [179]:
batch_size = 30 # minibatch size

callbacks=[EarlyStopping(patience=3, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_sequence_lstm.{epoch:02d}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', period=5)]

history = model_sequence.fit(X_train, y_train,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=40,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
model_sequence.save(save_dir + "/" + 'my_model_sequence_lstm.final2.hdf5')

Train on 2197 samples, validate on 245 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40


## Text Generation

In [180]:
from __future__ import print_function
import numpy as np
import os
import scipy
from six.moves import cPickle

In [181]:
save_dir = '../out' # directory to store models

In [182]:
#import spacy, and french model
import spacy
! python -m spacy download fr
nlp = spacy.load('fr')


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/fr

    You can now load the model via spacy.load('fr')



In [183]:
! pip install gensim



Load **doc2vec** model we created above.

In [184]:
#import gensim library
import gensim
from gensim.models.doc2vec import LabeledSentence

#load the doc2vec model
print("loading doc2Vec model...")
d2v_model = gensim.models.doc2vec.Doc2Vec.load('../out/doc2vec.w2v')

print("model loaded!")

loading doc2Vec model...
model loaded!


Load the **words_vocab** model created in the first notebook.

In [185]:
#load vocabulary
print("loading vocabulary...")
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

with open(os.path.join(save_dir, 'words_vocab.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)
print("vocabulary loaded !")

loading vocabulary...
vocabulary loaded !


Load the **word prediction model** created in the first notebook and **sentence selection model** created above in this notebook.

The perpose of creating two model is, **word prediction model** will generate several sentence prediction and **sentence selection model** will choose the best sentece among them.

In [186]:
from keras.models import load_model
# load the keras models
print("loading word prediction model...")
model = load_model(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')
print("model loaded!")
print("loading sentence selection model...")
model_sequence = load_model(save_dir + "/" + 'my_model_sequence_lstm.final2.hdf5')
print("model loaded!")

loading word prediction model...
model loaded!
loading sentence selection model...
model loaded!


## Functions to generate Candidates Sentences

function **sample()**, will draw randomly a word from our vocabulary.but probability for a word to be drawn will depends directly on its probability to be the next word.

**"temperature"** to smooth or sharpen its value.<br>
-  if temperature = 1.0, the probability for a word to be drawn is equal to the probability for the word to be the next one in the sequence (output of the owrd prediction model),
2. if temperature is big (much bigger than 1), the range of probabilities is shorten: the probabilities for all words to be the next one is closer to 1. More variety of words will be picked-up from the vocabulary.
3. if temperatune is small (close to 0), small probabilities will be avoided (they will be set closed to 0). Less words will be picked-up from the vocabulary.

In [187]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [188]:
printm()

Gen RAM Free: 10.4 GB  | Proc size: 3.6 GB
GPU RAM Free: 548MB | Used: 10891MB | Util  95% | Total 11439MB


**create_seed** function read the last few words from the given sentence(**seed_sentences**) as a input text to generate output.

In [189]:
def create_seed(seed_sentences,nb_words_in_seq=20, verbose=False):
    #initiate sentences
    generated = ''
    sentence = []
    
    #fill the sentence with a default word
    for i in range (nb_words_in_seq):
        sentence.append("le")

    seed = seed_sentences.split()
    
    if verbose == True : print("seed: ",seed)

    for i in range(len(sentence)):
        sentence[nb_words_in_seq-i-1]=seed[len(seed)-i-1]
        print(i, sentence)

    generated += ' '.join(sentence)
    
    if verbose == True : print('Generating text with the following seed: "' + ' '.join(sentence) + '"')

    return [generated, sentence]

**generate_phrase()** is used to create the next phrase of a given sentence.

It requires as inputs:

- the previous sentence,
- the maximum number of words in the phrase,
- the temperature of the sample function.<br> If a punctuation word is reached before the maximum number of the words, the function ends.

In [212]:
def generate_phrase(sentence, max_words = 50, nb_words_in_seq=20, temperature=1, verbose = False):
    generated = ""
    words_number = max_words - 1
    ponctuation = [".","?","!",":","…"]
    seq_length = nb_words_in_seq
    #sentence = []
    is_punct = False
    
    #generate the text
    for i in range(words_number):
        #create the vector
        x = np.zeros((1, seq_length, vocab_size))
        for t, word in enumerate(sentence):
            #print(t, word, vocab[word])
            if word not in vocab:
                word = "nolan"
            x[0, nb_words_in_seq-len(sentence)+t, vocab[word]] = 1.
        #print(x.shape)

        #calculate next word
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = vocabulary_inv[next_index]
        
        if verbose == True:
            predv = np.array(preds)
            #arr = np.array([1, 3, 2, 4, 5])
            wi = predv.argsort()[-3:][::-1]
            print("potential next words: ", vocabulary_inv[wi[0]], vocabulary_inv[wi[1]], vocabulary_inv[wi[2]])

        #add the next word to the text
        if is_punct == False:
            if next_word in ponctuation:
                is_punct = True
            generated += " " + next_word
            # shift the sentence by one, and and the next word at its end
            sentence = sentence[1:] + [next_word]

    return(generated, sentence)

**define_phrases_candidates()** provides a list of probable phrases, for a given previous sentence and a specific temperature.

In [191]:
def define_phrases_candidates(sentence, max_words = 50,\
                              nb_words_in_seq=20, \
                              temperature=1, \
                              nb_candidates_sents=10, \
                              verbose = False):
    phrase_candidate = []
    generated_sentence = ""
    for i in range(nb_candidates_sents):
        generated_sentence, new_sentence = generate_phrase(sentence, \
                                                           max_words = max_words, \
                                                           nb_words_in_seq = nb_words_in_seq, \
                                                           temperature=temperature, \
                                                           verbose = False)
        phrase_candidate.append([generated_sentence, new_sentence])
    
    if verbose == True :
        for phrase in phrase_candidate:
            print("   " , phrase[0])
    return phrase_candidate

## Functions to select the best sentence

***infer_vector -- *** Subsequent calls to this function may infer different representations for the same document. For a more stable representation, increase the number of steps to assert a stricket convergence.

**generate_training_vector()** generate a vector for each sentence in the sentence list.<br>
As **infer_vector** define a new representation of a sentence,so the function is used to predict the next vectorized-sentence for a given sequence of vectorized-sentences.

In [225]:
def generate_training_vector(sentences_list, verbose = False):
    if verbose == True : print("generate vectors for each sentence...")
    seq = []
    V = []

    for s in sentences_list:
        #infer the vector of the sentence, from the doc2vec model
        print(s)
        v = d2v_model.infer_vector(create_sentences(nlp(s))[0], alpha=0.001, min_alpha=0.001, steps=10000)
    #create the vector array for the model
        #print(len(v))
        V.append(v)
    V_val=np.array(V)
    #expand dimension to fit the entry of the model : that's the training vector
    V_val = np.expand_dims(V_val, axis=0)
    if verbose == True : print("Vectors generated!")
    return V_val

The **select_next_phrase()** function allows us to pick-up the best candidates for the next phrase.

First, it calculates the vector for each candidates.

Then, based on the vector generated by the function **generate_training_vector()**, it performs a cosine similarity with them and pick the one with the biggest similarity.

In [193]:
def select_next_phrase(model, V_val, candidate_list, verbose=False):
    sims_list = []
    
    #calculate prediction
    preds = model.predict(V_val, verbose=0)[0]
    
    #calculate vector for each candidate
    for candidate in candidate_list:
        #calculate vector
        #print("calculate vector for : ", candidate[1])
        V = np.array(d2v_model.infer_vector(candidate[1]))
        #calculate csonie similarity
        sim = scipy.spatial.distance.cosine(V,preds)
        #populate list of similarities
        sims_list.append(sim)
    
    #select index of the biggest similarity
    m = max(sims_list)
    index_max = sims_list.index(m)
    
    if verbose == True :
        print("selected phrase :")
        print("     ", candidate_list[index_max][0])
    return candidate_list[index_max]

## Text generation - workflow

In [194]:
def generate_paragraphe(phrase_seed, sentences_seed, \
                        max_words = 50, \
                        nb_words_in_seq=20, \
                        temperature=1, \
                        nb_phrases=30, \
                        nb_candidates_sents=10, \
                        verbose=True):
    
    sentences_list = sentences_seed
    sentence = phrase_seed   
    text = []
    
    for p in range(nb_phrases):
        if verbose == True : print("")
        if verbose == True : print("#############")
        print("phrase ",p+1, "/", nb_phrases)
        if verbose == True : print("#############")       
        if verbose == True:
            print('Sentence to generate phrase : ')
            print("     ", sentence)
            print("")
            print('List of sentences to constrain next phrase : ')
            print("     ", sentences_list)
            print("")
    
        #generate seed training vector
        V_val = generate_training_vector(sentences_list, verbose = verbose)

        #generate phrase candidate
        if verbose == True : print("generate phrases candidates...")
        phrases_candidates = define_phrases_candidates(sentence, \
                                                       max_words = max_words, \
                                                       nb_words_in_seq = nb_words_in_seq, \
                                                       temperature=temperature, \
                                                       nb_candidates_sents=nb_candidates_sents, \
                                                       verbose = verbose)
        
        if verbose == True : print("select next phrase...")
        next_phrase = select_next_phrase(model_sequence, \
                                         V_val,
                                         phrases_candidates, \
                                         verbose=verbose)
        
        print("Next phrase: ",next_phrase[0])
        if verbose == True :
            print("")
            print("Shift phrases in sentences list...")
        for i in range(len(sentences_list)-1):
            sentences_list[i]=sentences_list[i+1]

        sentences_list[len(sentences_list)-1] = next_phrase[0]
        
        if verbose == True:
            print("done.")
            print("new list of sentences :")
            print("     ", sentences_list)     
        sentence = next_phrase[1]
        
        text.append(next_phrase[0])
    
    return text

Now define some training sentence and combine them in a list

In [195]:
s1 = "nolan s' approche du bord du chemin et regarde en contrebas ."
s2 = "il se tourne vers mara :"
s3 = "- que dis tu ?"
s4 = "- rien du tout , lui répond la jeune femme en détournant le regard ."
s5 = "- je t' ai entendu dire quelque chose , pourtant ."
s6 = "- je pensais à voix haute , explique mara  ."
s7 = "l' apprentie hésite , elle n' est pas certaine que nolan comprenne ."
s8 = "depuis quelques jours , nolan est à fleur de peau et s'inquiète pour un rien ."
s9 = "- je crois avoir vu une ombre , déclare finalement la jeune femme ."
s10 = "- à quel endroit ?"
s11 = "s' écrie le jeune homme ."
s12 = "nolan semble bouleversé et il est devenu blanc de peur ."
s13 = "les souvenirs des kaurocs sont suffisament frais dans sa mémoire pour qu' une étrange angoisse lui noue la poitrine ."
s14 = "- ne sois pas inquiet , s' exclame mara , confuse de la réaction de son ami ."
s15 = "il y a probablement une erreur ."

In [196]:
sentences_list = [s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15]
print(sentences_list)

["nolan s' approche du bord du chemin et regarde en contrebas .", 'il se tourne vers mara :', '- que dis tu ?', '- rien du tout , lui répond la jeune femme en détournant le regard .', "- je t' ai entendu dire quelque chose , pourtant .", '- je pensais à voix haute , explique mara  .', "l' apprentie hésite , elle n' est pas certaine que nolan comprenne .", "depuis quelques jours , nolan est à fleur de peau et s'inquiète pour un rien .", '- je crois avoir vu une ombre , déclare finalement la jeune femme .', '- à quel endroit ?', "s' écrie le jeune homme .", 'nolan semble bouleversé et il est devenu blanc de peur .', "les souvenirs des kaurocs sont suffisament frais dans sa mémoire pour qu' une étrange angoisse lui noue la poitrine .", "- ne sois pas inquiet , s' exclame mara , confuse de la réaction de son ami .", 'il y a probablement une erreur .']


In [197]:
phrase_seed, sentences_seed = create_seed(s1 + " " + s2 + " " +\
                                          s3 + " " + s4+ " " + s5 + " " +\
                                          s6 + " " + s7 + " " + s8 + " " +\
                                          s9+ " " + s10 + " " + s11 + " " +\
                                          s12 + " " + s13 + " " + s14+ " " + s15,20,True)
print(phrase_seed)
print(sentences_seed)

seed:  ['nolan', "s'", 'approche', 'du', 'bord', 'du', 'chemin', 'et', 'regarde', 'en', 'contrebas', '.', 'il', 'se', 'tourne', 'vers', 'mara', ':', '-', 'que', 'dis', 'tu', '?', '-', 'rien', 'du', 'tout', ',', 'lui', 'répond', 'la', 'jeune', 'femme', 'en', 'détournant', 'le', 'regard', '.', '-', 'je', "t'", 'ai', 'entendu', 'dire', 'quelque', 'chose', ',', 'pourtant', '.', '-', 'je', 'pensais', 'à', 'voix', 'haute', ',', 'explique', 'mara', '.', "l'", 'apprentie', 'hésite', ',', 'elle', "n'", 'est', 'pas', 'certaine', 'que', 'nolan', 'comprenne', '.', 'depuis', 'quelques', 'jours', ',', 'nolan', 'est', 'à', 'fleur', 'de', 'peau', 'et', "s'inquiète", 'pour', 'un', 'rien', '.', '-', 'je', 'crois', 'avoir', 'vu', 'une', 'ombre', ',', 'déclare', 'finalement', 'la', 'jeune', 'femme', '.', '-', 'à', 'quel', 'endroit', '?', "s'", 'écrie', 'le', 'jeune', 'homme', '.', 'nolan', 'semble', 'bouleversé', 'et', 'il', 'est', 'devenu', 'blanc', 'de', 'peur', '.', 'les', 'souvenirs', 'des', 'kaurocs'

#### Now generate the actual text

In [None]:
text = generate_paragraphe(sentences_seed, sentences_list, \
                           max_words = 80, \
                           nb_words_in_seq = 30,\
                           temperature=0.201, \
                           nb_phrases=5, \
                           nb_candidates_sents=7, \
                           verbose=True)


#############
phrase  1 / 5
#############
Sentence to generate phrase : 
      [',', "s'", 'exclame', 'mara', ',', 'confuse', 'de', 'la', 'réaction', 'de', 'son', 'ami', '.', 'il', 'y', 'a', 'probablement', 'une', 'erreur', '.']

List of sentences to constrain next phrase : 
      ["l' apprentie hésite , elle n' est pas certaine que nolan comprenne .", "depuis quelques jours , nolan est à fleur de peau et s'inquiète pour un rien .", '- je crois avoir vu une ombre , déclare finalement la jeune femme .', '- à quel endroit ?', "s' écrie le jeune homme .", 'nolan semble bouleversé et il est devenu blanc de peur .', "les souvenirs des kaurocs sont suffisament frais dans sa mémoire pour qu' une étrange angoisse lui noue la poitrine .", "- ne sois pas inquiet , s' exclame mara , confuse de la réaction de son ami .", 'il y a probablement une erreur .', ' le ne pas la est pas et .', ' .', ' — il est est est de pas une œil .', ' la jeune s’ se jeune .', ' , la tête de .', ' , , la plus est , , 

In [None]:
printm()