# Text generarion using RNN

In [2]:
import pandas as pd
review_df=pd.read_csv('data/music_reviews.csv',encoding="ISO-8859-1").head(500)

In [3]:
review_df.dropna(subset=['Text'],inplace=True)

In [4]:
review_df

Unnamed: 0,Stars,Text,Sentiment
0,5,"Spotify is way, way, way, way, way, way better...",Positive
1,5,I love Spotify!! Very good quality and good so...,Positive
2,5,"I love, love, love spotify !!!!! I'm 71 years ...",Positive
3,5,"I love this app. I put it on my phone first, b...",Positive
4,5,Great for listening to your favorite tunes and...,Positive
5,5,"I love this app. I've tried them all (apple, a...",Positive
6,5,this literally has almost every song i have ev...,Positive
7,5,This app is amazing and you don't even have to...,Positive
8,5,This app is wonderfully diverse in its music c...,Positive
9,5,It is such a good app I fecomed this to anyone...,Positive


In [5]:
reviews=[]
reviews=review_df['Text'].map(lambda x: x.strip().split(".")) #splitting up the sentences separated by dots

In [6]:
reviews

0      [Spotify is way, way, way, way, way, way bette...
1      [I love Spotify!! Very good quality and good s...
2      [I love, love, love spotify !!!!! I'm 71 years...
3      [I love this app,  I put it on my phone first,...
4      [Great for listening to your favorite tunes an...
5      [I love this app,  I've tried them all (apple,...
6      [this literally has almost every song i have e...
7      [This app is amazing and you don't even have t...
8      [This app is wonderfully diverse in its music ...
9      [It is such a good app I fecomed this to anyon...
10     [Can't live without iit!! It has opened up my ...
11     [A bit of a novice here curating music other t...
12     [I really like being able to pull up my favori...
13     [I switched to Spotify from Pandora because I ...
14                                             [Love it]
15     [It's really good for my wide range of music t...
16            [Easy app to find all your favorite songs]
17     [Spotify is a great app 

In [7]:
stories=[sent for sent in reviews for i in sent if len(i)>1]

In [8]:
stories=[i[:-1] for i in stories]  #removing all the last blank values

In [9]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=True, filters='')
#tokenizer.fit_on_texts() takes a single list of string sequences as input
#The fit_on_texts() function maps each word in the stories to a numerical index. When working with large datasets it's common to filter all words occurring less than a certain number of times, and replace them with some "UNKNOWN" token. Here, because this dataset is small, every word encountered in the stories is added to the lexicon.
tokenizer.fit_on_texts([sent for sents in stories for sent in sents]) 

#print a sample of the dictionary
print (tokenizer.word_index.items())

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


dict_items([('i', 1), ('to', 2), ('the', 3), ('and', 4), ('it', 5), ('a', 6), ('music', 7), ('you', 8), ('is', 9), ('app', 10), ('for', 11), ('this', 12), ('my', 13), ('have', 14), ('on', 15), ('of', 16), ('that', 17), ('spotify', 18), ('can', 19), ('but', 20), ('love', 21), ('all', 22), ('so', 23), ('listen', 24), ('songs', 25), ('your', 26), ('if', 27), ('like', 28), ('in', 29), ('or', 30), ('has', 31), ('get', 32), ('great', 33), ("it's", 34), ('song', 35), ('me', 36), ('not', 37), ('want', 38), ('with', 39), ('there', 40), ('are', 41), ('was', 42), ("don't", 43), ('as', 44), ('really', 45), ('they', 46), ('very', 47), ('premium', 48), ('best', 49), ('from', 50), ('when', 51), ('be', 52), ('only', 53), ('much', 54), ('play', 55), ('out', 56), ('one', 57), ('listening', 58), ('am', 59), ('just', 60), ('at', 61), ('free', 62), ('playlists', 63), ('other', 64), ('find', 65), ('new', 66), ('now', 67), ('any', 68), ('also', 69), ('than', 70), ('pandora', 71), ("i'm", 72), ('use', 73), ('

In [10]:
len(tokenizer.word_index.items())

1783

In [11]:
#example of encoded story
print (stories[0], "\n")
encoded_story = tokenizer.texts_to_sequences(stories[0])
print (encoded_story)

['Spotify is way, way, way, way, way, way better music app than Pandora', " It doesn't limit you on the musicians you actually like with awful musicians with a few of your favorite songs peppered in between garbage pail cousin  music and brown water trash one hit wonders", " Spotify let's you play whole albums and let's you make custom and endless playlists"] 

[[18, 9, 262, 262, 262, 262, 262, 105, 132, 7, 10, 70, 71], [5, 150, 493, 8, 15, 3, 802, 8, 287, 28, 39, 884, 802, 39, 6, 115, 16, 26, 108, 25, 1173, 29, 659, 1174, 1175, 1176, 7, 4, 1177, 1178, 1179, 57, 708, 1180], [18, 336, 8, 55, 337, 141, 4, 336, 8, 160, 803, 4, 1181, 63]]


In [12]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU

In [13]:


lexicon_size = len(tokenizer.word_index)
n_embedding_nodes = 300
n_hidden_nodes = 500
batch_size = 1
n_timesteps = None #length of input data


In [14]:
len(tokenizer.word_index)

1783

In [15]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU

rnn = Sequential()

lexicon_size = len(tokenizer.word_index)
n_embedding_nodes = 300
n_hidden_nodes = 500
batch_size = 1
n_timesteps = None

#word embedding layer
embedding_layer = Embedding(batch_input_shape=(batch_size, n_timesteps),
                            input_dim=lexicon_size + 1, #add 1 because word indices start at 1, not 0
                            output_dim=n_embedding_nodes, 
                            mask_zero=True)
rnn.add(embedding_layer)

#recurrent layers (GRU)
recurrent_layer1 = GRU(output_dim=n_hidden_nodes,
                       return_sequences=True, 
                       stateful=True)
rnn.add(recurrent_layer1)

recurrent_layer2 = GRU(output_dim=n_hidden_nodes,
                       return_sequences=True, 
                       stateful=True)
rnn.add(recurrent_layer2)

#prediction (softmax) layer
pred_layer = TimeDistributed(Dense(lexicon_size + 1, #add 1 because word indices start at 1, not 0
                                   activation="softmax"))
rnn.add(pred_layer)

#select optimizer and compile
rnn.compile(loss="sparse_categorical_crossentropy", 
            optimizer='adam')



In [16]:
#select optimizer and compile
rnn.compile(loss="sparse_categorical_crossentropy", 
            optimizer='adam')

In [17]:
import numpy

def train_epoch(stories):  
    losses = []  #track cross-entropy loss during training
    c=0
    for story in stories:
        prev_eos = None
        encoded_story = tokenizer.texts_to_sequences(story) #encode story into word indices
        c=c+1
        #print (encoded_story)
        #print (c)
        for sent in encoded_story:
            
            sent = numpy.array(sent)
            if prev_eos:
                '''encode last token in previous sentence so that first word 
                of this sentence is conditioned on it'''
                sent = numpy.insert(sent, 0, prev_eos)
            #x is the sentence up to the last word, y is the sentence starting from the second word through the end
            sent_x = sent[None, :-1]
            sent_y = sent[None, 1:, None]
            #print(sent)
            try:
                loss = rnn.train_on_batch(x=sent_x, y=sent_y)
            except Exception as e:
                print (c)
                break
            #print (sent)
            losses.append(loss)
            prev_eos = sent[-1]
        #finished story, now clear hidden layer states to read a new story
        rnn.reset_states()
    loss = numpy.mean(losses)
    return loss




In [18]:
n_epochs = 10
print ("Training RNN on", len(stories), "stories for", n_epochs, "epochs...")
for epoch in range(n_epochs):
   # try:
        loss = train_epoch(stories)
        print ("epoch {} loss: {:.3f}".format(epoch + 1, loss))
    #except Exception as e:
        #print (c)
        #continue

Training RNN on 1174 stories for 10 epochs...
94
95
96
97
98
99
125
126
127
128
140
141
213
214
215
216
217
225
226
227
228
229
230
231
248
249
250
259
260
261
262
264
265
290
291
292
293
370
371
372
373
374
375
376
399
400
401
402
403
426
427
428
429
430
465
466
467
468
469
533
534
535
536
537
538
539
562
563
564
565
583
584
585
655
656
657
739
740
782
783
788
789
790
824
825
831
832
870
871
872
899
900
943
944
945
977
1019
1020
1053
1054
1077
1078
1102
1117
1118
1119
1161
1162
epoch 1 loss: 3.775
94
95
96
97
98
99
125
126
127
128
140
141
213
214
215
216
217
225
226
227
228
229
230
231
248
249
250
259
260
261
262
264
265
290
291
292
293
370
371
372
373
374
375
376
399
400
401
402
403
426
427
428
429
430
465
466
467
468
469
533
534
535
536
537
538
539
562
563
564
565
583
584
585
655
656
657
739
740
782
783
788
789
790
824
825
831
832
870
871
872
899
900
943
944
945
977
1019
1020
1053
1054
1077
1078
1102
1117
1118
1119
1161
1162
epoch 2 loss: 2.632
94
95
96
97
98
99
125
126
127
128
140


In [None]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

In [26]:
#sent_x,sent_y=to_sequences(3,numpy.array(tokenizer.texts_to_sequences(stories[0])))

In [27]:
#saving model architectures
json_string=rnn.to_json()#saving the model

from keras.models import model_from_json
rnn = model_from_json(json_string)#retrieving the model
    

In [28]:
#saving model weights
from keras.models import load_model

rnn.save('reviews_trained_model.h5')

In [29]:
#loading weights
rnn = load_model('reviews_trained_model.h5')



In [30]:
import random

def predict(init_story, max_words, mode='max'):
    '''generate the endings of stories word by word based on word probabilities predicted by rnn'''
    
    pred_ending = []
    
    '''read initial sentences of story into model'''
    encoded_init_story = tokenizer.texts_to_sequences(init_story)
    for sent in encoded_init_story:
        sent = numpy.array(sent)[None, :]
        p_next_word = rnn.predict_on_batch(sent)[0][-1]
        #print (p_next_word)
           
    '''now start predicting new words'''
    for idx in range(max_words):
        if mode == 'max':
            #generate word with highest probability of being next in this sequence
            next_word = numpy.argmax(p_next_word)
        elif mode == 'random':
            #sample from probability distribution to get next word
            next_word = numpy.random.choice(a=p_next_word.shape[-1], p=p_next_word)
        pred_ending.append(next_word)
        if lexicon_lookup[next_word][-1] in eos_tokens:
            #an end-of-sentence marker (e.g. punctuation) was generated, so stop generating
            break
        p_next_word = rnn.predict_on_batch(numpy.array(next_word)[None, None])[0][-1]
    
    rnn.reset_states()
    #decode predicted sentence from numerical indices back into words
    pred_ending = [lexicon_lookup[word] for word in pred_ending]
    return pred_ending

In [31]:
#index to word dictinary
'''create lookup table to get words from their indices'''
lexicon_lookup = {index: word for word, index in tokenizer.word_index.items()}
#specify which characters should indicate the end of a sentence and halt generation
eos_tokens = [".", "?", "!"]

In [35]:
for story in random.sample(stories, 15):
    try:
        print (story)
        init_story = story[:-1]
        print ("INIT STORY:", " ".join(init_story))
        print ("GOLD ENDING:", story[-1])
        pred_ending = predict(init_story, max_words=20, mode='random') #selects random based on probability distributin, if set to max, it outputs the max probability
        print ("PREDICTED ENDING:", " ".join(pred_ending))
        print ("\n")
    except Exception as e:
        continue

['I love this app', ' I have it downloaded on all my devices', ' The weekly discover playlist is one of my favorite features', '', '', " you won't regret becoming a premium member unless you don't really like music, I guess"]
INIT STORY: I love this app  I have it downloaded on all my devices  The weekly discover playlist is one of my favorite features  
GOLD ENDING:  you won't regret becoming a premium member unless you don't really like music, I guess
[]
INIT STORY: 
["The only downfall is paying premium for a few extra features, but other than that, it's awesome! Especially for Kindle Fire users"]
INIT STORY: 
GOLD ENDING: The only downfall is paying premium for a few extra features, but other than that, it's awesome! Especially for Kindle Fire users
[]
INIT STORY: 
['Spotify is by far the best music app on the market']
INIT STORY: 
GOLD ENDING: Spotify is by far the best music app on the market
['Better than pandora', '']
INIT STORY: Better than pandora
GOLD ENDING: 
PREDICTED ENDI

In [None]:
pred_ending=predict(["This is an amazing app I want"], max_words=30, mode='random')
pred_ending

In [None]:
print ("PREDICTED ENDING:", " ".join(pred_ending))

Reference:
# http://people.ict.usc.edu/~roemmele/publications/keras_rnn_demo.html
# https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/