In [0]:
class CharacterTable(object):
    """Given a set of input characters and a set of target characters:
    + Encode strings to a one-hot boolean representation
    + Decode the model's per-step output distribution back to characters
      (greedy arg-max decoding with the inference encoder/decoder models)
    """

    def __init__(self, input_chars, target_chars):
        """Initialize character table.
        # Arguments
            input_chars: Iterable of characters that can appear in the input.
            target_chars: Iterable of characters that can appear in the
                target. `decode_sequence` looks up '\\t' (start) and
                compares against '\\n' (end), so both should be included.
        """
        # Sorting makes the character <-> index mapping deterministic.
        self.input_chars = sorted(input_chars)
        self.input_char_indices = {c: i for i, c in enumerate(self.input_chars)}
        self.input_indices_char = {i: c for i, c in enumerate(self.input_chars)}
        self.target_chars = sorted(target_chars)
        self.target_char_indices = {c: i for i, c in enumerate(self.target_chars)}
        self.target_indices_char = {i: c for i, c in enumerate(self.target_chars)}

        print('Number of unique input tokens:', len(self.input_chars))
        print('Number of unique output tokens:', len(self.target_chars))

    def encode_input(self, sequences, max_seq_length):
        """One-hot encode a batch of input strings.
        # Arguments
            sequences: Sequence of strings to be encoded.
            max_seq_length: Number of time steps in the returned encoding.
                Shorter strings are left zero-padded at the end.
        # Returns
            Boolean array of shape
            (len(sequences), max_seq_length, len(self.input_chars)).
        # Raises
            KeyError: If a character is not part of the input characters.
            IndexError: If a string is longer than `max_seq_length`.
        """
        encoded_data = np.zeros(
            (len(sequences), max_seq_length, len(self.input_chars)),
            dtype=bool)
        for i, seq in enumerate(sequences):
            for t, char in enumerate(seq):
                encoded_data[i, t, self.input_char_indices[char]] = True
        return encoded_data

    def encode_target(self, sequences, max_seq_length):
        """One-hot encode a batch of target strings for teacher forcing.
        # Arguments
            sequences: Sequence of target strings to be encoded.
            max_seq_length: Number of time steps in the returned encodings.
        # Returns
            Tuple (decoder_input_data, decoder_target_data) of boolean arrays
            of shape (len(sequences), max_seq_length, len(self.target_chars)).
            `decoder_target_data` is `decoder_input_data` shifted one time
            step to the left, so it does not contain the first character.
        # Raises
            KeyError: If a character is not part of the target characters.
            IndexError: If a string is longer than `max_seq_length`.
        """
        shape = (len(sequences), max_seq_length, len(self.target_chars))
        decoder_input_data = np.zeros(shape, dtype=bool)
        decoder_target_data = np.zeros(shape, dtype=bool)
        for i, seq in enumerate(sequences):
            for t, char in enumerate(seq):
                char_index = self.target_char_indices[char]
                decoder_input_data[i, t, char_index] = True
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    decoder_target_data[i, t - 1, char_index] = True

        return decoder_input_data, decoder_target_data

    def decode_sequence(self, seq, encoder=None, decoder=None, max_length=None):
        """Greedily decode one encoded input sequence into a string.
        # Arguments
            seq: Encoded input passed unchanged to `encoder.predict`
                (the sampling loop assumes a batch of size 1).
            encoder: Object whose `predict(seq)` returns the list of initial
                decoder states. Defaults to the global `encoder_model`.
            decoder: Object whose `predict([target_seq] + states)` returns
                `(output_tokens, h, c)`. Defaults to the global
                `decoder_model`.
            max_length: Decoding stops once the decoded string is longer than
                this. Defaults to the global `max_decoder_seq_length`.
        # Returns
            The decoded string; it includes the terminating '\\n' when the
            model emitted one.
        """
        # Fall back to the notebook-level globals so that existing calls of
        # the form `decode_sequence(seq)` keep working.
        if encoder is None:
            encoder = encoder_model
        if decoder is None:
            decoder = decoder_model
        if max_length is None:
            max_length = max_decoder_seq_length

        # Encode the input as state vectors.
        states_value = encoder.predict(seq)

        # Generate empty target sequence of length 1 and populate its first
        # character with the start character.
        target_seq = np.zeros((1, 1, len(self.target_chars)))
        target_seq[0, 0, self.target_char_indices['\t']] = 1.

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder.predict(
                [target_seq] + states_value)

            # Pick the most probable character of the last time step.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = self.target_indices_char[sampled_token_index]
            decoded_sentence += sampled_char

            # Exit condition: either hit max length or find stop character.
            if sampled_char == '\n' or len(decoded_sentence) > max_length:
                stop_condition = True

            # Feed the sampled character back in as the next decoder input.
            target_seq = np.zeros((1, 1, len(self.target_chars)))
            target_seq[0, 0, sampled_token_index] = 1.

            # Update states
            states_value = [h, c]

        return decoded_sentence
    


In [0]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import io
import pandas as pd
from pickle import load

# NOTE(review): unpickling can execute arbitrary code, so only load dataset
# files that come from a trusted source.
# The context manager closes the file handle once the data has been read.
with open('/review_dataset.pkl', 'rb') as dataset_file:
    stories = load(dataset_file)
stories = stories[:10000]

# Vectorize the data.
input_characters = set()
target_characters = set()
max_encoder_seq_length = 0
max_decoder_seq_length = 0
input_texts = []
target = []
for story in stories:
    input_text = story['story']
    # "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + story['highlights'] + '\n'

    input_texts.append(input_text)
    target.append(target_text)

    # Track the longest input / target so every sample can be padded to it.
    max_encoder_seq_length = max(max_encoder_seq_length, len(input_text))
    max_decoder_seq_length = max(max_decoder_seq_length, len(target_text))

    # Collect the character vocabularies; a set ignores duplicates.
    input_characters.update(input_text)
    target_characters.update(target_text)

print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

ctable = CharacterTable(input_characters, target_characters)


Loaded Stories 27782
Max sequence length for inputs: 5437
Max sequence length for outputs: 131
Number of unique input tokens: 51
Number of unique output tokens: 46


In [0]:
# One-hot encode the source texts for the encoder.
encoder_input_data = ctable.encode_input(input_texts, max_encoder_seq_length)

# One-hot encode the targets; encode_target returns the decoder inputs and
# the targets shifted one time step ahead.
decoder_input_data, decoder_target_data = ctable.encode_target(
    target, max_decoder_seq_length)

# Report the shape of each encoded tensor, one per line.
for encoded in (encoder_input_data, decoder_input_data, decoder_target_data):
    print(encoded.shape)

(10000, 5437, 51)
(10000, 131, 46)
(10000, 131, 46)


In [0]:
def define_models(n_input, n_output, n_units):
    """Build the training model plus the inference encoder and decoder.

    The inference models reuse the layers of the training model, so they
    share its weights.

    # Arguments
        n_input: Size of the last axis of the encoder input.
        n_output: Size of the last axis of the decoder input; also the number
            of units of the softmax output layer.
        n_units: Number of units of the encoder and the decoder LSTM.
    # Returns
        Tuple (model, encoder_model, decoder_model).
    """
    # Training encoder: only the final LSTM states are used.
    enc_in = Input(shape=(None, n_input))
    enc_lstm = LSTM(n_units, return_state=True)
    _, enc_h, enc_c = enc_lstm(enc_in)
    enc_states = [enc_h, enc_c]

    # Training decoder: starts from the encoder states and emits a softmax
    # distribution for every time step.
    dec_in = Input(shape=(None, n_output))
    dec_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    train_seq, _, _ = dec_lstm(dec_in, initial_state=enc_states)
    softmax = Dense(n_output, activation='softmax')
    train_probs = softmax(train_seq)
    model = Model([enc_in, dec_in], train_probs)

    # Inference encoder: maps an input sequence to the LSTM states.
    encoder_model = Model(enc_in, enc_states)

    # Inference decoder: the states are fed in explicitly and returned
    # together with the output distribution.
    state_in_h = Input(shape=(n_units,))
    state_in_c = Input(shape=(n_units,))
    state_inputs = [state_in_h, state_in_c]
    step_seq, step_h, step_c = dec_lstm(dec_in, initial_state=state_inputs)
    step_probs = softmax(step_seq)
    decoder_model = Model(
        [dec_in] + state_inputs,
        [step_probs, step_h, step_c])

    # return all models
    return model, encoder_model, decoder_model

In [0]:
batch_size = 64  # Batch size for training.
epochs = 30  # Number of epochs to train for.
latent_dim = 256  # Number of LSTM units (size of the encoding space).

# Build the training model and the two inference models used for decoding.
model, encoder_model, decoder_model = define_models(
    len(ctable.input_chars), len(ctable.target_chars), latent_dim)
print("Model summary")
model.summary()

print("Encoder summary")
encoder_model.summary()

print("Decoder summary")
decoder_model.summary()

# Train with teacher forcing: the decoder input is the target sequence and
# the expected output is the same sequence shifted by one time step.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(
    [encoder_input_data, decoder_input_data], decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2)

# Save model
# (this label was a bare line of text in the code, which is a SyntaxError)
model.save('/home/model2.h5')

Model summary
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_69 (InputLayer)           (None, None, 51)     0                                            
__________________________________________________________________________________________________
input_70 (InputLayer)           (None, None, 46)     0                                            
__________________________________________________________________________________________________
lstm_35 (LSTM)                  [(None, 256), (None, 315392      input_69[0][0]                   
__________________________________________________________________________________________________
lstm_36 (LSTM)                  [(None, None, 256),  310272      input_70[0][0]                   
                                                                 lstm_35[0][1]                 

In [0]:
# Decode the first ten samples and show each source text next to the
# sentence generated for it.
for seq_index in range(10):
    # Slice (rather than index) so the batch axis of size 1 is kept.
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    decoded_sentence = ctable.decode_sequence(input_seq)

    source_text = input_texts[seq_index]
    print('-')
    print('Input sentence:', source_text)
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better
Decoded sentence: great  ood an  ore the the the the the the the the the the the the the the the the the the the the the the the the the the the the t
-
Input sentence: product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo
Decoded sentence: great  ood an  ore the the the the the the the the the the the the the the the the the the the the the the the the the the the the t
-
Input sentence: confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe treat seduces edmund selling brother sisters witch
Decoded sentence: great  ood an  ore the the the the the the