In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf 
import numpy as np
import io
import json
from keras_preprocessing.text import tokenizer_from_json
import datetime
import os
import time

In [None]:
# Display the installed TensorFlow version so the environment is recorded with the results.
tf.__version__

'2.7.0'

In [None]:
# Report how many GPU devices TensorFlow can see in this runtime.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


In [None]:
# Distribution strategy used later (via strategy.scope()) when the model is built,
# and to derive the global batch size from strategy.num_replicas_in_sync.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [None]:
# Read the reviews CSV from the mounted Drive folder and keep only the two
# columns the summarizer needs: the short 'Summary' and the full review 'Text'.
reviews_csv_path = '/content/drive/MyDrive/CSE Research papers/Text Summarization /Reviews.csv'
train = pd.read_csv(reviews_csv_path)[['Summary', 'Text']]
train.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Use the number of space characters in each review as a cheap proxy for its
# length in words, then summarise the distribution.
text_space_counts = train['Text'].str.count(' ')
train['text_length'] = text_space_counts
train['text_length'].describe()

count    568454.000000
mean         81.005522
std          80.807102
min           2.000000
25%          33.000000
50%          57.000000
75%          99.000000
max        3525.000000
Name: text_length, dtype: float64

In [None]:
# Same space-count proxy for the summaries. Rows with a missing Summary get NaN
# here, which is why this column is float in the output.
summary_space_counts = train['Summary'].str.count(' ')
train['summary_length'] = summary_space_counts
train['summary_length'].describe()

count    568427.000000
mean          3.128462
std           2.619420
min           0.000000
25%           1.000000
50%           3.000000
75%           4.000000
max          41.000000
Name: summary_length, dtype: float64

In [None]:
# Preview the frame with the two new length columns attached.
train.head()

Unnamed: 0,Summary,Text,text_length,summary_length
0,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,3.0
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,30,2.0
2,"""Delight"" says it all",This is a confection that has been around a fe...,98,3.0
3,Cough Medicine,If you are looking for the secret ingredient i...,42,1.0
4,Great taffy,Great taffy at a great price. There was a wid...,29,1.0


In [None]:
# Keep only rows whose summary has 2..20 spaces and whose text has at most 100
# spaces. Comparisons against NaN are False, so rows with missing lengths are
# dropped as well.
summary_ok = train['summary_length'].between(2, 20)
text_ok = train['text_length'] <= 100
train = train[summary_ok & text_ok].reset_index(drop=True)

In [None]:
# Show how many rows survived the length filters, followed by a preview.
print(train.shape, train.head(), sep='\n')

(276719, 4)
                                         Summary  ... summary_length
0                          Good Quality Dog Food  ...            3.0
1                              Not as Advertised  ...            2.0
2                          "Delight" says it all  ...            3.0
3  Great!  Just as good as the expensive brands!  ...            8.0
4                         Wonderful, tasty taffy  ...            2.0

[5 rows x 4 columns]


In [None]:
# Normalise the review text: lower-case it, then strip every character that is
# neither a word character nor whitespace (i.e. punctuation).
train['text_lower'] = train['Text'].str.lower()
# regex=True is passed explicitly because the implicit default of
# Series.str.replace changed to a literal match in pandas 2.0; the raw string
# avoids the invalid-escape warning for `\w` / `\s`.
train['text_no_punctuation'] = train['text_lower'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Normalise the summaries the same way as the text, then wrap each one in the
# `_start_` / `_end_` markers the decoder uses as sequence delimiters.
train['summary_lower'] = train["Summary"].str.lower()
# regex=True is passed explicitly because the implicit default of
# Series.str.replace changed to a literal match in pandas 2.0; the raw string
# avoids the invalid-escape warning for `\w` / `\s`.
summary_clean = train['summary_lower'].str.replace(r'[^\w\s]', '', regex=True)
train['summary_no_punctuation'] = '_start_' + ' ' + summary_clean + ' ' + '_end_'

In [None]:
# Shuffle the rows before splitting. The seed is fixed so that the held-out
# rows (and therefore test_set.csv and the example inspected later by index)
# are the same on every Restart & Run All.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out the first 100 shuffled rows as a test set for scoring; everything
# else stays in the training frame.
test = train.iloc[:100]
train = train.iloc[100:]
test.to_csv('test_set.csv')

In [None]:
# Review text (encoder side): vocabulary cap passed to the tokenizer as
# num_words, and the padded sequence length.
max_features1 = 100000
maxlen1 = 100

# Summaries (decoder side): vocabulary cap and padded sequence length.
max_features2 = 100000
maxlen2 = 20

In [None]:
# Fit the encoder-side tokenizer on the cleaned review text, convert every
# review to a sequence of word indices and pad them to maxlen1.
cleaned_texts = train['text_no_punctuation'].astype(str).tolist()

tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1)
tok1.fit_on_texts(cleaned_texts)

tf_train_text = tok1.texts_to_sequences(cleaned_texts)
tf_train_text = tf.keras.preprocessing.sequence.pad_sequences(tf_train_text, maxlen=maxlen1)

In [None]:
# Persist the encoder tokenizer. to_json() already returns a JSON string, so
# dumping it again stores a JSON-encoded *string*; the loading cell mirrors
# this by calling json.load() first and tokenizer_from_json() on the result.
tokenizer1_json = tok1.to_json()
with io.open('tok1.json', mode='w', encoding='utf-8') as f:
    json.dump(tokenizer1_json, f, ensure_ascii=False)

In [None]:
# Fit the decoder-side tokenizer on the marked-up summaries. filters='*'
# replaces the default punctuation filter, so the underscores in `_start_`
# and `_end_` are kept and the markers survive as single tokens.
cleaned_summaries = train['summary_no_punctuation'].astype(str).tolist()

tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters='*')
tok2.fit_on_texts(cleaned_summaries)

# Summaries are padded at the end ('post') up to maxlen2.
# NOTE(review): pad_sequences truncates from the front by default, so any
# summary longer than maxlen2 tokens would lose its leading `_start_` marker.
tf_train_summary = tok2.texts_to_sequences(cleaned_summaries)
tf_train_summary = tf.keras.preprocessing.sequence.pad_sequences(
    tf_train_summary, maxlen=maxlen2, padding='post')

In [None]:
# Persist the decoder tokenizer. to_json() already returns a JSON string, so
# dumping it again stores a JSON-encoded *string*; the loading cell mirrors
# this by calling json.load() first and tokenizer_from_json() on the result.
tokenizer2_json = tok2.to_json()
with io.open('tok2.json', mode='w', encoding='utf-8') as f:
    json.dump(tokenizer2_json, f, ensure_ascii=False)

In [None]:
# Encoder input is simply the padded body of the review text; its second
# dimension is the fixed document length the encoder Input layer will expect.
vectorized_text = tf_train_text
encoder_input_data = vectorized_text
doc_length = encoder_input_data.shape[1]

# Teacher forcing: the decoder is fed the summary without its last position
# and is trained to predict the same summary shifted one step ahead.
vectorized_summary = tf_train_summary
decoder_input_data = vectorized_summary[:, :-1]
decoder_target_data = vectorized_summary[:, 1:]

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')
print(f'Shape of encoder input: {encoder_input_data.shape}')

Shape of decoder input: (276619, 19)
Shape of decoder target: (276619, 19)
Shape of encoder input: (276619, 100)


In [None]:
# Vocabulary sizes for the encoder and decoder embedding layers.
# word_index starts at 1, so +1 leaves room for index 0 (used for padding).
vocab_size_encoder = len(tok1.word_index) + 1 
vocab_size_decoder = len(tok2.word_index) + 1

In [None]:
# Shared size for the word embeddings and the GRU hidden state. It is also
# used as the second dimension of the pre-trained embedding matrix.
latent_dim = 100

In [None]:
# Load the pre-trained GloVe vectors into `embeddings_index` (word -> vector).
# This cell must run: the next cell reads `embeddings_index`, so leaving the
# loader commented out makes a fresh Restart & Run All fail with a NameError.
# The directory can be overridden with the GLOVE_DIR environment variable.
GLOVE_DIR = os.environ.get('GLOVE_DIR', "/home/tiana/Data_Science/tests/glove")
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(latent_dim))

embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding weight matrix for the review text: row `r` holds the
# pre-trained vector of the word whose tokenizer index is `r`.
embedding_matrix = np.zeros((len(tok1.word_index) + 1, latent_dim))
for token, row in tok1.word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        # Words missing from the embedding index keep their all-zero row.
        continue
    embedding_matrix[row] = pretrained_vector

In [None]:
########################
#### Encoder Model ####
# The whole network is built inside the distribution strategy scope so that
# its variables are created under the GPU strategy.
with strategy.scope():
    encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

    # GloVe embedding for the encoder: initialised from embedding_matrix and
    # kept frozen (trainable=False).
    x = tf.keras.layers.Embedding(vocab_size_encoder,
                                  latent_dim,
                                  name='Body-Word-Embedding',
                                  weights=[embedding_matrix],
                                  mask_zero=False,
                                  trainable=False)(encoder_inputs)

    # Batch normalization is used so that the distribution of the inputs
    # to a specific layer doesn't change over time.
    x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the encoder output sequence, just the final hidden state.
    _, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Keep the encoder as a separate model so we can encode without decoding.
    # Because it is called as a layer below, it is also reachable from the
    # full model under the name 'Encoder-Model'.
    encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ########################
    #### Decoder Model ####
    # Variable-length input: the summary tokens fed in for teacher forcing.
    decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')

    # Embedding for the decoder, learned from scratch (not GloVe).
    dec_emb = tf.keras.layers.Embedding(vocab_size_decoder,
                                        latent_dim,
                                        name='Decoder-Word-Embedding',
                                        mask_zero=False)(decoder_inputs)

    # Batch normalization
    dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # The decoder GRU starts from the encoder's final state and returns the
    # full output sequence (one step per summary position).
    decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
    x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense softmax layer predicting the next summary token at every step.
    decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####
    seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # `learning_rate` replaces the deprecated `lr` argument, which triggers a
    # warning in TF 2.x optimizers.
    seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
                          loss='sparse_categorical_crossentropy')

  super(Nadam, self).__init__(name, **kwargs)


In [None]:
# Print the layer-by-layer architecture and parameter counts of the full model.
seq2seq_Model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Word-Embedding (Embedd  (None, None, 100)   2876100     ['Decoder-Input[0][0]']          
 ing)                                                                                             
                                                                                                  
 Encoder-Input (InputLayer)     [(None, 100)]        0           []                               
                                                                                                  
 Decoder-Batchnorm-1 (BatchNorm  (None, None, 100)   400         ['Decoder-Word-Embedding[0]

In [None]:
# Sizes for an 85/15 train/validation split.
data_len = len(encoder_input_data)
val_split = int(np.floor(data_len*.15))
train_split = int(np.floor(data_len*.85))

# Hyperparameters: shuffle buffers cover the whole split, and the global batch
# size scales with the number of replicas in the distribution strategy.
train_buffer_size = train_split
val_buffer_size = val_split

batch_size_per_replica = 256
global_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync

# Targets get a trailing axis of size 1, the layout used here with
# sparse_categorical_crossentropy.
decoder_target_expanded = np.expand_dims(decoder_target_data, -1)

# Training data: the first `train_split` samples.
X_enc_train = encoder_input_data[0:train_split]
X_dec_train = decoder_input_data[0:train_split]
y_t_train = decoder_target_expanded[0:train_split]

# Validation data: the last `val_split` samples. Slicing from an explicit start
# index (instead of `[-val_split:-1]`) keeps the final sample and also yields
# an empty split, rather than almost the whole array, when val_split is 0.
val_start = data_len - val_split
X_enc_val = encoder_input_data[val_start:]
X_dec_val = decoder_input_data[val_start:]
y_t_val = decoder_target_expanded[val_start:]

# tf.data: build the datasets, shuffle, batch and prefetch them. The dict keys
# match the names of the model's two Input layers.
train_dataset = tf.data.Dataset.from_tensor_slices(({"Encoder-Input": X_enc_train, "Decoder-Input": X_dec_train}, y_t_train))
train_dataset = train_dataset.shuffle(train_buffer_size)
train_dataset = train_dataset.batch(global_batch_size).prefetch(1)

val_dataset = tf.data.Dataset.from_tensor_slices(({"Encoder-Input": X_enc_val, "Decoder-Input": X_dec_val}, y_t_val))
val_dataset = val_dataset.shuffle(val_buffer_size)
val_dataset = val_dataset.batch(global_batch_size).prefetch(1)

In [None]:
# TensorBoard logging into a timestamped run directory.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Checkpoints: write the weights whenever val_loss improves.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/weights.{epoch:02d}-{val_loss:.2f}.hdf5', verbose=1, save_best_only=True)

# Early stopping: stop once val_loss has not improved for `patience` epochs
# and roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=10,
                                              verbose=1, mode='auto',
                                              restore_best_weights=True)

# Train the model. `early_stop` is now actually passed to fit(); previously it
# was constructed but never registered, so it could never take effect.
epochs = 2
history = seq2seq_Model.fit(train_dataset, validation_data=val_dataset,
                            epochs=epochs,
                            validation_steps=val_split // global_batch_size,
                            callbacks=[tensorboard_callback, checkpointer, early_stop])

# Save the final model (architecture + weights) as a single HDF5 file.
seq2seq_Model.save('your_model.h5')

Epoch 1/2
Epoch 00001: val_loss improved from inf to 1.33148, saving model to /tmp/weights.01-1.33.hdf5
Epoch 2/2
Epoch 00002: val_loss improved from 1.33148 to 1.22456, saving model to /tmp/weights.02-1.22.hdf5


In [None]:
# Save the trained model to HDF5. The training cell already ends with this
# exact call, so after a full run this only rewrites the same file.
seq2seq_Model.save('your_model.h5')

In [None]:
# Reload the model from the HDF5 file written above. From here on
# `seq2seq_Model` refers to the reloaded copy, not the object that was trained.
seq2seq_Model = tf.keras.models.load_model('your_model.h5')

# Show the model architecture to confirm it round-tripped intact.
seq2seq_Model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Word-Embedding (Embedd  (None, None, 100)   2876100     ['Decoder-Input[0][0]']          
 ing)                                                                                             
                                                                                                  
 Encoder-Input (InputLayer)     [(None, 100)]        0           []                               
                                                                                                  
 Decoder-Batchnorm-1 (BatchNorm  (None, None, 100)   400         ['Decoder-Word-Embedding[0]

In [None]:
# Reload both tokenizers from disk. The files were written as UTF-8 with
# ensure_ascii=False, so they are read back with the same explicit encoding
# instead of the platform default. Each file holds a JSON-encoded string:
# json.load() recovers the tokenizer's JSON text, which tokenizer_from_json()
# then parses.
with open('tok1.json', encoding='utf-8') as f:
    data = json.load(f)
    tok1 = tokenizer_from_json(data)

with open('tok2.json', encoding='utf-8') as f:
    data = json.load(f)
    tok2 = tokenizer_from_json(data)

In [None]:
# Preview the first 20 rows of the held-out test set, including the cleaned
# text and the `_start_ ... _end_` summaries.
test.head(20)

Unnamed: 0,Summary,Text,text_length,summary_length,text_lower,text_no_punctuation,summary_lower,summary_no_punctuation
0,Great Roast good value,I discovered this coffee on a salmon fishing t...,50,3.0,i discovered this coffee on a salmon fishing t...,i discovered this coffee on a salmon fishing t...,great roast good value,_start_ great roast good value _end_
1,On my list,"There are a few coffees I order over and over,...",47,2.0,"there are a few coffees i order over and over,...",there are a few coffees i order over and over ...,on my list,_start_ on my list _end_
2,"Love, Love, LOVE!","So far my absolute favorite chips. I get my ""...",57,2.0,"so far my absolute favorite chips. i get my ""...",so far my absolute favorite chips i get my ch...,"love, love, love!",_start_ love love love _end_
3,Love the Flavor of this tea!,I bought this tea and both my best friend and ...,33,5.0,i bought this tea and both my best friend and ...,i bought this tea and both my best friend and ...,love the flavor of this tea!,_start_ love the flavor of this tea _end_
4,Great Product - Great Price!,"Being gluten intolerant, this is the best all-...",27,4.0,"being gluten intolerant, this is the best all-...",being gluten intolerant this is the best allpu...,great product - great price!,_start_ great product great price _end_
5,Great Dog Food,Works to keep your dogs teeth clean and add mu...,23,2.0,works to keep your dogs teeth clean and add mu...,works to keep your dogs teeth clean and add mu...,great dog food,_start_ great dog food _end_
6,Fantastic and Addictive!,This granola bar is sweet and crunchy but does...,25,2.0,this granola bar is sweet and crunchy but does...,this granola bar is sweet and crunchy but does...,fantastic and addictive!,_start_ fantastic and addictive _end_
7,Disappointed with Amazon's policy,Price dropped by $5 in matter of 2 days. Item...,28,3.0,price dropped by $5 in matter of 2 days. item...,price dropped by 5 in matter of 2 days item d...,disappointed with amazon's policy,_start_ disappointed with amazons policy _end_
8,Moist & Delicious Bran Muffins,Honey Raisin Bran Muffin Mix by Sun-Maid makes...,85,4.0,honey raisin bran muffin mix by sun-maid makes...,honey raisin bran muffin mix by sunmaid makes ...,moist & delicious bran muffins,_start_ moist delicious bran muffins _end_
9,Good coffee in K Cups,The only reason I did not give it five stars i...,49,4.0,the only reason i did not give it five stars i...,the only reason i did not give it five stars i...,good coffee in k cups,_start_ good coffee in k cups _end_


In [None]:
# Pick one cleaned review from the test set (row label 34) and wrap it in a
# list, since the tokenizer works on lists of texts.
sample_label = 34
test_text = [test.loc[sample_label, 'text_no_punctuation']]
test_text

['i have been using this curry paste for several years now and it makes a great dish although it doesnt beat eating indian out its the best paste i have used for at home dishes even though this is a hot paste i would consider it mild']

In [None]:
# The encoder tokenizer is intentionally NOT re-fitted on the test text.
# Calling tok1.fit_on_texts() here would update the word counts and rebuild
# word_index, so word indices could shift (and new ones be added) relative to
# the vocabulary the embedding layer was trained with. The already-fitted
# tok1 is used as-is in the next cell; words it has never seen are skipped.

In [None]:
# Turn the sample review into word indices and pad it to the encoder's fixed
# input length, exactly as was done for the training text.
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(
    tok1.texts_to_sequences(test_text), maxlen=maxlen1)

# Reference summary for the same row, for later comparison.
print(test['summary_no_punctuation'][34])

In [None]:
# Take the encoder from the reloaded model so that encoder and decoder both
# come from the same saved weights, and so this cell does not depend on the
# `encoder_model` object left over from the training cell. The encoder was
# registered in the full model as the nested layer named 'Encoder-Model'.
encoder_model = seq2seq_Model.get_layer('Encoder-Model')

# Predict the encoder state of the new sentence; this becomes the decoder's
# initial hidden state.
body_encoding = encoder_model.predict(raw_tokenized)

In [None]:
# Read the embedding width back from the loaded model (last dimension of the
# decoder embedding's output) so the inference graph matches what was trained.
latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]


In [None]:
# Rebuild the decoder as a stand-alone inference graph by re-using the trained
# layers of the loaded model (looked up by name), so no weights are copied.

# Token input -> embedding -> batch norm, same chain as during training.
decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input 
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)

# At inference the GRU's initial state is supplied explicitly on every step
# instead of coming from the encoder inside the graph.
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')

# Passing [inputs, state] as a list: the second element is used as the GRU's
# initial state. The layer returns the output sequence and the updated state.
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

# Reconstruct the remaining layers: batch norm, then the softmax over the vocabulary.
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [None]:
# Stand-alone decoder for step-by-step inference: given the previous token and
# the current hidden state it returns the token probabilities and the new state.
decoder_model = tf.keras.Model(
    inputs=[decoder_inputs, gru_inference_state_input],
    outputs=[dense_out, gru_state_out],
)

In [None]:
# Optional: keep a reference to the encoder's output before the decoding loop
# rebinds `body_encoding` to the decoder's running state.
original_body_encoding = body_encoding

In [None]:
# First decoder input: the index of the `_start_` marker, shaped (1, 1),
# i.e. one sequence holding one token.
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [None]:
# Inspect the initial decoder input.
state_value

array([[1]])

In [None]:
# State for the greedy decoding loop: the words generated so far and the flag
# that ends the loop.
decoded_sentence = []
stop_condition = False

In [None]:
# Reverse lookup for the decoder vocabulary: token index -> word.
vocabulary_inv = {index: word for word, index in tok2.word_index.items()}
vocabulary_inv

{1: '_start_',
 2: '_end_',
 3: 'great',
 4: 'the',
 5: 'good',
 6: 'for',
 7: 'a',
 8: 'and',
 9: 'best',
 10: 'not',
 11: 'my',
 12: 'love',
 13: 'this',
 14: 'it',
 15: 'coffee',
 16: 'but',
 17: 'i',
 18: 'of',
 19: 'tea',
 20: 'to',
 21: 'is',
 22: 'taste',
 23: 'in',
 24: 'these',
 25: 'like',
 26: 'product',
 27: 'dog',
 28: 'very',
 29: 'price',
 30: 'flavor',
 31: 'ever',
 32: 'food',
 33: 'delicious',
 34: 'as',
 35: 'you',
 36: 'loves',
 37: 'with',
 38: 'favorite',
 39: 'too',
 40: 'are',
 41: 'dogs',
 42: 'on',
 43: 'tasty',
 44: 'excellent',
 45: 'better',
 46: 'so',
 47: 'snack',
 48: 'healthy',
 49: 'them',
 50: 'just',
 51: 'chocolate',
 52: 'than',
 53: 'what',
 54: 'at',
 55: 'free',
 56: 'no',
 57: 'one',
 58: 'yummy',
 59: 'tastes',
 60: 'hot',
 61: 'tasting',
 62: 'stuff',
 63: 'all',
 64: 'chips',
 65: 'cats',
 66: 'cat',
 67: 'really',
 68: 'treat',
 69: 'nice',
 70: 'sweet',
 71: 'have',
 72: 'its',
 73: 'little',
 74: 'me',
 75: 'buy',
 76: 'quality',
 77: 'pe

In [None]:
# Greedy decoding: feed the previous token and hidden state to the decoder,
# take the most probable next word, and repeat until `_end_` is produced or
# the summary reaches maxlen2 words.
while not stop_condition:
    preds, st = decoder_model.predict([state_value, body_encoding])

    # Indices 0 and 1 are excluded from the argmax (1 is `_start_`; 0 is not
    # assigned to any word), hence the slice from 2 and the +2 offset.
    # assumes preds holds a single sequence with a single step — TODO confirm
    pred_idx = 2 + np.argmax(preds[:, :, 2:])
    pred_word_str = vocabulary_inv[pred_idx]
    print(pred_word_str)

    stop_condition = pred_word_str == '_end_' or len(decoded_sentence) >= maxlen2
    if not stop_condition:
        decoded_sentence.append(pred_word_str)

        # Update the decoder inputs for the next word: carry the new hidden
        # state forward and feed back the word that was just predicted.
        body_encoding = st
        state_value = np.array(pred_idx).reshape(1, 1)
    #print(state_value)

great
product
but
_end_


In [None]:
# Compare against the original (reference) summary of the same test row.

print([test['summary_no_punctuation'][34]])

['_start_ great for chicken curry dishes _end_']
