In [1]:
# Loading the dataset and visualize it
import pandas as pd
import re

data = pd.read_csv('datasets/amazon_reviews.csv')

## Visualize and clean data

In [2]:
data

Unnamed: 0,review
0,"It's hard to believe ""Memory of Trees"" came ou..."
1,"A clasically-styled and introverted album, Mem..."
2,I never thought Enya would reach the sublime h...
3,This is the third review of an irish album I w...
4,"Enya, despite being a successful recording art..."
...,...
64700,I like the reggae sound a lot in this song. I ...
64701,I first heard this on Sirius and had to have i...
64702,"I absolutely love this song, it downloaded fin..."
64703,"Reggae, island beats aren't really my cup of t..."


### Remove special characters, punctuation, and numbers.

In [19]:
def preprocess(text):
    text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
    output = re.sub(r'\d+', '',text_input)
    return output.lower().strip()

corpus = data['review'].map(preprocess).astype(str).values[:100].tolist()

In [20]:
corpus[0:2]

['it s hard to believe memory of trees came out  years ago it has held up well over the passage of time it s enya s last great album before the new age pop of amarantine and day without rain back in  enya still had her creative spark her own voice i agree with the reviewer who said that this is her saddest album it is melancholy bittersweet from the opening title song memory of trees is elegaic majestic pax deorum sounds like it is from a requiem mass it is a dark threnody unlike the reviewer who said that this has a disconcerting blend of spirituality sensuality i don t find it disconcerting at all anywhere is is a hopeful song looking to possibilities hope has a place is about love but it is up to the listener to decide if it is romantic platonic etc i ve always had a soft spot for this song on my way home is a triumphant ending about return this is truly a masterpiece of new age music a must for any enya fan',
 'a clasically styled and introverted album memory of trees is a masterpi

## Prepare Data for training
The `fit_on_texts()` method in the Keras Tokenizer class updates the tokenizer's internal vocabulary based on a list of texts . It creates a dictionary where the keys are the unique words in the text list, and the values are the counts of each word in the text. This dictionary is used to create an index-based mapping of words to integers, where the most common words have the lowest integer values.

The method's signature is as follows:

`fit_on_texts(texts)`

where texts is a list of strings representing the input texts to be processed. The method updates the internal vocabulary based on the words in the input texts.

The `fit_on_texts()` method is typically used as a preprocessing step before transforming text data into numerical sequences using the `texts_to_sequences()` method. By learning the vocabulary from the input texts, the tokenizer is able to assign integer indices to each word that can be used to represent the texts as sequences of integers.

In [21]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [22]:
# visualize the indexes
tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'a': 4,
 'is': 5,
 'to': 6,
 'i': 7,
 'this': 8,
 'it': 9,
 'in': 10,
 's': 11,
 'that': 12,
 'quot': 13,
 'album': 14,
 'with': 15,
 'you': 16,
 'on': 17,
 'for': 18,
 'as': 19,
 'song': 20,
 'was': 21,
 'one': 22,
 'but': 23,
 'my': 24,
 'all': 25,
 'are': 26,
 't': 27,
 'not': 28,
 'songs': 29,
 'like': 30,
 'cars': 31,
 'be': 32,
 'enya': 33,
 'just': 34,
 'an': 35,
 'her': 36,
 'there': 37,
 'music': 38,
 'from': 39,
 'have': 40,
 'best': 41,
 'good': 42,
 'new': 43,
 'has': 44,
 'what': 45,
 'their': 46,
 'up': 47,
 'more': 48,
 'great': 49,
 'rock': 50,
 'very': 51,
 'by': 52,
 'track': 53,
 'out': 54,
 'they': 55,
 'time': 56,
 'at': 57,
 'sound': 58,
 'had': 59,
 'me': 60,
 'only': 61,
 'band': 62,
 'most': 63,
 'which': 64,
 'or': 65,
 'were': 66,
 'so': 67,
 'love': 68,
 'cd': 69,
 'about': 70,
 'can': 71,
 'first': 72,
 'debut': 73,
 'well': 74,
 'if': 75,
 'some': 76,
 'sounds': 77,
 'times': 78,
 'still': 79,
 'vocals': 80,
 'many': 81,
 '

In [24]:
# create input sequences using list of tokens
input_sequences = []

for review in corpus:
    token_list = tokenizer.texts_to_sequences([review])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

Finally, we pad sequenced data and define predictors and labels. We use predictors to guess what is the next word in a sequence and labels to correct the model’s predictions.

In [25]:
import tensorflow.keras.utils as ku
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
total_words = len(tokenizer.word_index) + 1
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=total_words)

In [26]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
import tensorflow as tf
     

In [27]:
model = Sequential()
model.add(Embedding(total_words, 240, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words/2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2023-04-27 15:07:39.789574: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-27 15:07:39.790678: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-27 15:07:39.791715: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [28]:
class myCallback(tf.keras.callbacks.Callback):
	def on_epoch_end(self, epoch, logs={}):
		if(logs.get('accuracy')>0.93):
			print("\nReached 93% accuracy so cancelling training!")
			self.model.stop_training = True

callbacks = myCallback()

history = model.fit(predictors, label, epochs=300, verbose=1, callbacks=[callbacks])

Epoch 1/300


2023-04-27 15:07:53.792638: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-27 15:07:53.793805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-27 15:07:53.794667: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-04-27 15:07:55.606626: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]


114/691 [===>..........................] - ETA: 9:42 - loss: 7.2938 - accuracy: 0.0419

KeyboardInterrupt: 

In [None]:
seed_text_list = ["i think", "this was","this cd", "i love","what a"]

next_words = 10

def generate_words(seed_text):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose="0")
        classes_x = np.argmax(predicted,axis=1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == classes_x:
                output_word = word
        seed_text += " " + output_word

    return (seed_text + ".").capitalize()


for seed_text in seed_text_list:
    print("Generated text: " + generate_words(seed_text))