In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
import time
from os import listdir
import codecs
import pickle
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops
print('TensorFlow Version: {}'.format(tf.__version__))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


TensorFlow Version: 1.13.1


In [2]:
class BasePreprocessor:
    """The abstract class for a preprocessor. You should subclass
    this and implement the methods actions and result, and possibly
    __init__, goal_test, and path_cost. Then you will create instances
    of your subclass and solve them with the various search functions."""
    
    # List of contractions.
    CONTRACTION_LIST = {
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "can't've": "cannot have",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "couldn't've": "could not have",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hadn't've": "had not have",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'd've": "he would have",
            "he'll": "he will",
            "he'll've": "he he will have",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "I'd": "I would",
            "I'd've": "I would have",
            "I'll": "I will",
            "I'll've": "I will have",
            "I'm": "I am",
            "I've": "I have",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it would",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so as",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there would",
            "there'd've": "there would have",
            "there's": "there is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we would",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you would",
            "you'd've": "you would have",
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have"
    }

    def __init__(self):
        """The constructor. Your subclass's constructor can add
        other arguments."""
        
    def cleanData(self, text, removeStopwords = True):
        """
        This method is a standard implementation to clean any text that are
        passed in as parameter. Here the text is split into sentences and each
        sentence is in turn cleaned by invoking the cleanSentence() method.
        
        Any custom cleaning needs to be done at the subclass Preprocessor and
        the invoke this method.

        Parameters
        ----------
        text : string
            The text to be cleaned.

        Returns
        -------
        string
            The cleaned text.
        punctuationsToBeExcluded : list
            List of any particular punctuations to be ignored when cleaning 
            the sentence.

        """
        cleanedSentences = list()
        sentences = text.split('\n')
        for sentence in sentences:
            # Cleaning the sentence here
            sentence = self.cleanSentence(sentence, removeStopwords)
            if len(sentence) > 0:
                cleanedSentences.append(sentence)
        return ' '.join(cleanedSentences).lower()
        
    def cleanSentence(self, sentence, removeStopwords):
        """
        The method cleans a passed in sentence parameter by:
            i. removing all whitespace characters.
            ii. removing all punctuations.

        Parameters
        ----------
        sentence : string
            The sentence to be cleaned.

        Returns
        -------
        string
            The cleaned sentence.

        """
        sentence = sentence.lower()
        sentence = self.fixContractions(sentence)
        sentence = self.removeUnwantedCharacters(sentence)
        if removeStopwords:
            sentence = self.removeStopWords(sentence)
        return sentence
    
    def fixContractions(self, text, contractionList=CONTRACTION_LIST):
        """
        # Expands the contractions by finding a match in the Contraction list 
        Regular expression pattern matching.

        Parameters
        ----------
        text : string
            The text where contractions need to be fixed.
        contraction_list : dictionary, optional
            The dictionary which tells the mapping for different types of 
            contractions. The default is CONTRACTION_LIST.

        Returns
        -------
        string
            The expanded text.

        """
        text = re.findall(r"[\w']+", text)
        new_text = []
        for word in text:
            if word in contractionList:
                new_text.append(contractionList[word])
            else:
                new_text.append(word)
        return ' '.join(new_text)
    
    def removeUnwantedCharacters(self, text):
        """
        Removes all unwanted characters from the text.
        This includes any URLs, HTML tags, punctuations, line breaks.

        Parameters
        ----------
        text : string
            The text that needs to be cleaned.

        Returns
        -------
        text : string
            The cleaned text.

        """
        text = text.strip()
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)# remove links
        text = re.sub(r'\<a href', ' ', text)# remove html link tag
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
        return text
    
    def removeStopWords(self, text):
        """
        Removes the stop words.

        Parameters
        ----------
        text : string
            The text where the stop words need to be removed.

        Returns
        -------
        string
            The stop words removed text.

        """
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        return ' '.join(text)

In [3]:
class CnnPreprocessor(BasePreprocessor):
    """This is a preprocessor class which implements CNN dataset specific
    cleaning methods."""

    def __init__(self):
        """
        The constructor method to do any initial value setting.

        Returns
        -------
        CnnProcessor class object.

        """
        super().__init__()
            
    def stripOffNewsSource(self, text):
        """
        This method helps to strip off the news source from the text.

        Parameters
        ----------
        text : string
            The news text.

        Returns
        -------
        text : string
            The news text with any news source stripped off.

        """
        closingBracketIndex = text.find(')')
        firstWord = ''
        if closingBracketIndex > -1:
            firstWordToBeExcluded = False
            countOfSpaceChar = 0
            for i in range(closingBracketIndex-1,-1,-1):
                if text[i] == ' ':
                    if countOfSpaceChar < 4:
                        countOfSpaceChar += 1
                        continue
                    else:
                        firstWordToBeExcluded = False
                        break
                elif text[i] == '(' and not firstWordToBeExcluded:
                    countOfSpaceChar = 0
                    firstWordToBeExcluded = True
            
            if firstWordToBeExcluded:
                firstWord = text[:closingBracketIndex + 1]
                text = text[len(firstWord):].strip()
        return text
    
    def cleanData(self, text, isSummary):
        """
        This method helps to clean any text by calling the cleanData from the base
        class. 
        
        The CNN dataset files can have the source of the news at the start of
        the file in brackets. It iss wise to remove this as part of the cleaning
        as this source name doesn't help with the actual summarisation task.
        Hence another method called stripOffNewsSource() is invoked before
        before calling the cleanData() method in the base class.
        
        Parameters
        ----------
        text : string
            The text to be cleaned.
        isSummary : boolean
            Denotes whether the text to be cleaned is actual News text or 
            the summary.
            
        Returns
        -------
        string
            The cleaned text.

        """
        # If the text is not a summary, then strip of the news source from
        # the text
        if not isSummary:
            text = self.stripOffNewsSource(text)
        
        # Invoking the standard cleanData method.
        return super().cleanData(text, not isSummary)

In [4]:
"""
Implementation of base class for the data loader.
"""
class DataLoader:
    """
    Class to help with the loading of data
    """
    
    def __init__(self, cleanDataOp):
        """
        The constructor method to do any initial value setting.
        

        Returns
        -------
        DataLoader class object.

        """
        self.cleanDataOp = cleanDataOp
    
    def loadSourceDocument(self, filePath):
        """
        Loads the contents of a single source document

        Parameters
        ----------
        filePath : string
            The file path of the source document.

        Returns
        -------
        text : string
            The loaded text.

        """
        file = open(filePath, encoding='utf-8')
        text = file.read()
        file.close()
        return text
        
    def loadSourceDocuments(self, sourceDirectoryPath, refreshSourceDocs):
        """
        This method helps to load the source documents.

        Parameters
        ----------
        sourceDirectoryPath : string
            Directory path where the source files reside.
        refreshSourceDocs : bool
            If this parameter is true, all the source files are read fresh else
            already pickled file is loaded.

        Returns
        -------
        List of dictionaries holding the loaded text and summaries.

        """
        all_text = {}
        all_text['Text'] = []
        all_text['Summary'] = []
        if refreshSourceDocs:
            fileIndex = 1
            for name in listdir(sourceDirectoryPath):
                if not name.startswith('._'):
                    filePath = sourceDirectoryPath + '/' + name
                    # load document
                    doc = self.loadSourceDocument(filePath)
                    text, summary = self.retrieveTextAndSummary(doc)
                    all_text['Text'].append(self.cleanDataOp(text, False))
                    all_text['Summary'].append(self.cleanDataOp(summary, True))
                    print('Extracted and cleaned file number', fileIndex, '=>', name)
                    fileIndex += 1
        return all_text
        
    def retrieveTextAndSummary(self, document):
        """
        This method helps separate the actual text and summary from the whole
        CNN news document.

        Parameters
        ----------
        document : string
            The content of the news story file from which the actual text and
            summary needs to be separated.

        Returns
        -------
        string
            The text and a list of summaries.

        """
        # All the summaries in the document are starting with the '@highlight'
        # phrase.
        textIndex = document.find('@highlight')
        
        # Splitting the actual text content and the summary lines
        text, summaries = document[:textIndex], document[textIndex:].split('@highlight')
        
        # Stripping all the whitespaces from each of the summary lines.
        summaries = [s.strip() for s in summaries if len(s) > 0]
        
        # Returning the actual text and the list of summaries
        return text, ' '.join(summaries)

In [5]:
"""
Implementation of base class for the Word Embedding framework.
"""
class WordEmbeddingBase:
    """The base class for Word Embedding framework.
    """
    
    def __init__(self, embeddingsDimension, specialTokens):
        """The constructor. Your subclass's constructor can add
        other arguments.

        Returns
        -------
        WordEmbeddingBase object.

        """
        self.embeddingsDimension = embeddingsDimension
        self.specialTokens = specialTokens
    
    def constructEmbeddingsIndex(self):
        """
        The method to build the embedding index using the vector file

        Returns
        -------
        embedding_index : dictionary
            The word to vector data mapping.

        """
        embeddingsIndex = {}
        with codecs.open(self.vectorFilePath, 'r', 'utf-8') as f:
            for i, line in enumerate(f):
                sr = line.split()
                word = sr[0]
                embedding = np.asarray(sr[1:], dtype='float32')
                embeddingsIndex[word] = embedding
        return embeddingsIndex
        
    def buildEmbeddingsVectorMatrix(self, wordToIntDict, embeddingsIndex):
        """
        The method to build the embedding index using the vector file

        Parameters
        ----------
        embeddingDimension : number
            The dimension of embedding used.

        Returns
        -------
        embeddingMatrix : dictionary
            The mapping from integer representation of the word to the 
            embedding vector.

        """
        embeddingsMatrix = np.zeros((len(wordToIntDict), self.embeddingsDimension), dtype=np.float32)
        for word, i in wordToIntDict.items():
            embeddingsVector = embeddingsIndex.get(word)
            if embeddingsVector is not None:
                # words not found in embedding index will be all-zeros.
                embeddingsMatrix[i] = embeddingsVector
            else:
                randomGeneratedEmbeddingsVector = np.array(np.random.uniform(-1.0, 1.0, self.embeddingsDimension))
                embeddingsIndex[word] = randomGeneratedEmbeddingsVector
                embeddingsMatrix[i] = randomGeneratedEmbeddingsVector
        return embeddingsMatrix

In [6]:
"""
Implementation of custom class for the Glove Word Embedding framework.
"""
class GloveEmbedding(WordEmbeddingBase):
    """The custom class for Glove Word Embedding framework.
    """
    
    def __init__(self, embeddingsDimension, specialTokens):
        """
        The constructor to do any initial value setting.

        Returns
        -------
        GloveEmbedding class object.

        """
        self.vectorFilePath = 'embeddings/glove.6B.50d.txt'
        super().__init__(embeddingsDimension, specialTokens)
        

In [7]:
"""
Implementation of custom class for the Conceptnet Numberbatch's Embedding framework.
"""
class ConceptNetEmbedding(WordEmbeddingBase):
    """The custom class for Coneptnet Numberbatch's Embedding framework.
    """
    
    def __init__(self, embeddingsDimension, specialTokens):
        """
        The constructor to do any initial value setting.

        Returns
        -------
        GloveEmbedding class object.

        """
        self.vectorFilePath = 'embeddings/numberbatch-en-19.08.txt'
        super().__init__(embeddingsDimension, specialTokens)

In [8]:
class Utils:
    """A Utility class for some static helper methods"""
    
    @staticmethod
    def pickle(filename, contents):
        """
        This method pickles the contents to a file

        Parameters
        ----------
        filename : string
            The pickle file location.
        contents : string
            The contents to be pickled.

        Returns
        -------
        None.

        """
        file = open(filename, "wb")
        pickle.dump(contents, file)
        file.close()

    @staticmethod
    def unPickle(filename):
        """
        This method loads the contents from a pickled file

        Parameters
        ----------
        filename : string
            The pickle file location.

        Returns
        -------
        The contents from a pickled file.

        """
        file = open(filename,"rb")
        contents = pickle.load(file)
        file.close()
        return contents
    
    @staticmethod
    def countWords(wordsCountDict, text):
        """
        This method returns a dictionary with the words to number of occurrences
        mapping.

        Parameters
        ----------
        wordsCountDict : dictionary
            Word to number of occurrences mapping.
        text : string
            The text.

        Returns
        -------
        None.

        """
        for sentence in text:
            for word in sentence.split():
                if word not in wordsCountDict:
                    wordsCountDict[word] = 1
                else:
                    wordsCountDict[word] += 1
    
    @staticmethod
    def buildWordToNumberRepresentations(wordsCountDict, specialTokens, embeddingsIndex, thresholdForRareWordsCount):
        """
        This method returns two dictionaries with a word to number mapping and another one with number to word 
        mapping.

        Parameters
        ----------
        wordsCountDict : dictionary
            Word to number of occurrences mapping.
        specialTokens: dictionary
            Special tokens to number mapping
        embeddingsIndex: dictionary
            The dictionary which has the mapping from a word to corresponding embedding vector. This dictionary
            is normally constructed from a word embeddings vector file.
        thresholdForRareWordsCount : int
            Only those words with frequencies above this threshold are considered if they are not part of
            the embeddings index dictionary.

        Returns
        -------
        Two dictionaries:
            i. Word to Number mapping
            ii. Number to Word mapping

        """
        wordToIntDict = {}
        intToWordDict = {}
        wordIndex = 0
        for word, count in wordsCountDict.items():
            if count >= thresholdForRareWordsCount or word in embeddingsIndex:
                wordToIntDict[word] = wordIndex
                intToWordDict[wordIndex] = word
                wordIndex += 1
        
        for token in specialTokens.values():
            wordToIntDict[token] = wordIndex
            intToWordDict[wordIndex] = token
            wordIndex += 1
        
        return wordToIntDict, intToWordDict
    
    @staticmethod
    def convertTextToNumberSequence(text, wordToIntDict, unknownToken, eosToken = None, applyEos = False):
        """
        This method converts a text to a sequence of numbers based on the word to integer mapping dictionary.
        If a word does not exist in the word to integer mapping dictionary, a number representation of 'Unknown'
        special token is used instead.
        
        Parameters
        ----------
        wordToIntDict : dictionary
            Word to number of  mapping.
        unknownToken: string
            The 'Unknown' specal token string.
        eosToken: number
            The 'End of Sequence' special token string.
        applyEos : boolean
            If true, at the end of the number sequence the number corresponding to 'End of Sequence' special token
            shall be appended. 
            
        Returns
        -------
        i. The sequence of numbers
        ii. Total words count
        iii. Total unknown words count
        """
        numberSequenceForText = []
        wordsCount = 0
        unknownWordsCount = 0
        for sentence in text:
            numberSequenceForSentence = []
            for word in sentence.split():
                wordsCount += 1
                if word in wordToIntDict:
                    numberSequenceForSentence.append(wordToIntDict[word])
                else:
                    numberSequenceForSentence.append(wordToIntDict[unknownToken])
                    unknownWordsCount += 1
            
            if applyEos and eosToken is not None:
                numberSequenceForSentence.append(wordToIntDict[eosToken])
            numberSequenceForText.append(numberSequenceForSentence)
        return numberSequenceForText, wordsCount, unknownWordsCount
    
    @staticmethod       
    def applyFilterAndSort(summariesAndTextZippedList, summaryAndTextAttributes):
        """
        Filter method to filter out summary and text zipped entry based on maximum Summary Length, 
        maximum Text length, unknown word limit in summaries and unknown word limit in text.
        
        Parameters
        ----------
        summariesAndTextZippedList: list
            List of zipped version of Summary and Text
        summaryAndTextAttributes : dictionary
            Carries:
                i. The maximum number of words allowed in a Summary
                ii. The maximum number of words allowed in a Text
                i. The minimum number of words required in a Summary
                ii. The minimum number of words required in a Text
                iii. The maximum number of unknown words allowed in a Summary
                iv. The maximum number of unknown words allowed in a Text
            
        Returns
        -------
        i. The sequence of numbers
        ii. Total words count
        iii. Total unknown words count
        """
        maximumSummaryLength = summaryAndTextAttributes['maximumSummaryLength']
        maximumTextLength = summaryAndTextAttributes['maximumTextLength']
        minimumSummaryLength = summaryAndTextAttributes['minimumSummaryLength']
        minimumTextLength = summaryAndTextAttributes['minimumTextLength']
        unknownsInSummaryLimit = summaryAndTextAttributes['unknownsInSummaryLimit']
        unknownsInTextLimit = summaryAndTextAttributes['unknownsInTextLimit']
        unknownTokenNumberRepresentation = summaryAndTextAttributes['unknownTokenNumberRepresentation']
        
        def countUnknowns(sentence, unknownTokenNumberRepresentation):
            '''Counts the number of time UNK appears in a sentence.'''
            unknownsCount = 0
            for word in sentence:
                if word == unknownTokenNumberRepresentation:
                    unknownsCount += 1
            return unknownsCount
    
        def filterCondition(item):
            """
            Filters an item based on certain conditions.
            """
            summarySeq = item[0]
            textSeq = item[1]
            if(len(summarySeq) <= maximumSummaryLength and
               len(textSeq) <= maximumTextLength and 
               len(summarySeq) >= minimumSummaryLength and
               len(textSeq) >= minimumTextLength and
               countUnknowns(summarySeq, unknownTokenNumberRepresentation) <= unknownsInSummaryLimit and 
               countUnknowns(textSeq, unknownTokenNumberRepresentation) <= unknownsInTextLimit):
                return True
            else:
                return False
    
        filteredSummariesAndText = list(filter(filterCondition, summariesAndTextZippedList))
        summariesAndTextSorted = sorted(filteredSummariesAndText, key=lambda entry: len(entry[1]))
        summariesAndTextSorted = list(zip(*summariesAndTextSorted))
        return list(summariesAndTextSorted[0]), list(summariesAndTextSorted[1])

In [9]:
# load text
sourceDirectoryPath = '../data/cnn/stories'
refreshSourceDocs = False
pickledFilePath = '../data/cnn_dataset.pkl'

In [10]:
if refreshSourceDocs:
    preprocessor = CnnPreprocessor()
    dataLoader = DataLoader(preprocessor.cleanData)
    loadedContent = dataLoader.loadSourceDocuments(sourceDirectoryPath, refreshSourceDocs)
                
    # save to file
    Utils.pickle(pickledFilePath, loadedContent)
    print('Pickled the cleaned data into the file:', pickledFilePath)

# load from file
news = Utils.unPickle(pickledFilePath)
print('Loaded Texts %d' % len(news['Text']))

Loaded Texts 92579


In [11]:
cleanedText = news['Text']
cleanedSummaries = news['Summary']

In [12]:
embeddingsDimension = 50
specialTokens = {
    'UNKNOWN': '<UNK>',
    'PADDING': '<PAD>',
    'ENDOFSEQUENCE': '<EOS>',
    'STARTOFSEQUENCE': '<GO>'
}
embedding = GloveEmbedding(embeddingsDimension, specialTokens)

In [13]:
wordsCountDict = {}
Utils.countWords(wordsCountDict, cleanedText)
Utils.countWords(wordsCountDict, cleanedSummaries)
print("Size of Vocabulary:", len(wordsCountDict))

Size of Vocabulary: 238749


In [14]:
embeddingsIndex = embedding.constructEmbeddingsIndex()

In [15]:
missingWordsCount = 0
missingWords = []
# The below threshold is used to ignore all those words which appear below threshold count in the entire data set
thresholdForRareWordsCount = 20
for word, count in wordsCountDict.items():
    if count > thresholdForRareWordsCount:
        if word not in embeddingsIndex:
            missingWordsCount += 1
            missingWords.append((word, count))
            
missingRatio = round(missingWordsCount/len(wordsCountDict), 4) * 100

print("Number of words missing from the embedding framework:", missingWordsCount)
print("Percent of words that are missing from vocabulary: {}%".format(missingRatio))

Number of words missing from the embedding framework: 992
Percent of words that are missing from vocabulary: 0.42%


In [16]:
wordToIntDict, intToWordDict = Utils.buildWordToNumberRepresentations(
    wordsCountDict, embedding.specialTokens, embeddingsIndex, thresholdForRareWordsCount
)

usageRatio = round(len(wordToIntDict) / len(wordsCountDict),4)*100

print("Total number of unique words:", len(wordsCountDict))
print("Number of words we will use:", len(wordToIntDict))
print("Percent of words we will use: {}%".format(usageRatio))

Total number of unique words: 238749
Number of words we will use: 155837
Percent of words we will use: 65.27%


In [17]:
embeddingsMatrix = embedding.buildEmbeddingsVectorMatrix(wordToIntDict, embeddingsIndex)
print('Total number of embeddings:', len(embeddingsMatrix))

Total number of embeddings: 155837


In [18]:
# Converting all the summaries to corresponding number sequences
summariesToNumberSequence, summaryWordsCount, summaryUnknownWordsCount = Utils.convertTextToNumberSequence(
    cleanedSummaries, 
    wordToIntDict, 
    embedding.specialTokens['UNKNOWN']
)

# Converting all the text to corresponding number sequences
textToNumberSequence, textWordsCount, textUnknownWordsCount = Utils.convertTextToNumberSequence(
    cleanedText, 
    wordToIntDict, 
    embedding.specialTokens['UNKNOWN'], 
    eosToken = embedding.specialTokens['ENDOFSEQUENCE'],
    applyEos = True
)

totalWordsCount = summaryWordsCount + textWordsCount
totalUnknownWordsCount = summaryUnknownWordsCount + textUnknownWordsCount
unknownPercentage = round(totalUnknownWordsCount/totalWordsCount,4) * 100

print("Total number of words:", totalWordsCount)
print("Total number of UNKs:", totalUnknownWordsCount)
print("Percent of words that are UNK: {}%".format(unknownPercentage))

Total number of words: 39010724
Total number of UNKs: 180753
Percent of words that are UNK: 0.45999999999999996%


In [19]:
def computeLengthDataframe(textToNumberSequences):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for textToNumberSequence in textToNumberSequences:
        lengths.append(len(textToNumberSequence))
    return pd.DataFrame(lengths, columns=['counts'])

In [20]:
lengthSummaries = computeLengthDataframe(summariesToNumberSequence)
lengthText = computeLengthDataframe(textToNumberSequence)

print("Summaries:")
print(lengthSummaries.describe())
print()
print("Texts:")
print(lengthText.describe())

Summaries:
             counts
count  92579.000000
mean      43.395781
std        9.982053
min        8.000000
25%       36.000000
50%       44.000000
75%       51.000000
max      108.000000

Texts:
             counts
count  92579.000000
mean     378.981897
std      193.264759
min        1.000000
25%      228.000000
50%      351.000000
75%      496.000000
max     1454.000000


In [21]:
# Inspect the length of texts
print(np.percentile(lengthText.counts, 70))
print(np.percentile(lengthText.counts, 90))
print(np.percentile(lengthText.counts, 95))
print(np.percentile(lengthText.counts, 99))

# Inspect the length of summaries
print(np.percentile(lengthSummaries.counts, 70))
print(np.percentile(lengthSummaries.counts, 90))
print(np.percentile(lengthSummaries.counts, 95))
print(np.percentile(lengthSummaries.counts, 99.5))

464.0
645.0
748.0
927.2200000000012
49.0
56.0
59.0
67.0


In [22]:
maximumTextLength = 464
maximumSummaryLength = 49
minimumTextLength = 2
minimumSummaryLength = 2
unknownsInSummaryLimit = 4
unknownsInTextLimit = 10
        
summariesAndTextSequence = list(zip(summariesToNumberSequence, textToNumberSequence))
sortedSummaries, sortedText = Utils.applyFilterAndSort(summariesAndTextSequence, {
    'maximumTextLength': maximumTextLength,
    'maximumSummaryLength': maximumSummaryLength,
    'minimumTextLength': minimumTextLength,
    'minimumSummaryLength': minimumSummaryLength,
    'unknownsInSummaryLimit': unknownsInSummaryLimit,
    'unknownsInTextLimit': unknownsInTextLimit,
    'unknownTokenNumberRepresentation': embedding.specialTokens['UNKNOWN']
})

# Compare lengths to ensure they match
print(len(sortedSummaries))
print(len(sortedText))

47962
47962


In [23]:
Utils.pickle("data/sorted_summaries.pkl",sortedSummaries)
Utils.pickle("data/sorted_text.pkl",sortedText)
Utils.pickle("data/embeddings_matrix.pkl",embeddingsMatrix)
Utils.pickle("data/word_to_int.pkl",wordToIntDict)
Utils.pickle("data/int_to_word.pkl",intToWordDict)

In [24]:
sortedSummaries = Utils.unPickle("data/sorted_summaries.pkl")
sortedText = Utils.unPickle("data/sorted_text.pkl")
embeddingsMatrix = Utils.unPickle("data/embeddings_matrix.pkl")
wordToIntDict = Utils.unPickle("data/word_to_int.pkl")
intToWordDict = Utils.unPickle("data/int_to_word.pkl")

In [25]:
class Seq2SeqModel:
    """
    The implementation for Sequence to sequence modelling
    """
    def __init__(self):
        """The constructor. Your subclass's constructor can add
        other arguments."""
        
    def createModelInputsPlaceholders(self):
        inputData = tf.placeholder(tf.int32, [None, None], name='inputData')
        targetData = tf.placeholder(tf.int32, [None, None], name='targetData')
        learningRate = tf.placeholder(tf.float32, name='learningRate')
        dropoutRate = tf.placeholder(tf.float32, name='dropoutRate')
        inputSummaryLengths = tf.placeholder(tf.int32, (None,), name='inputSummaryLengths')
        maximumSummaryLength = tf.reduce_max(inputSummaryLengths, name='maximumSummaryLength')
        inputTextLengths = tf.placeholder(tf.int32, (None,), name='inputTextLengths')

        return inputData, targetData, learningRate, dropoutRate, inputSummaryLengths, maximumSummaryLength, inputTextLengths
        
    def createLSTMCell(self, rnnPerCellUnitsCount, requireDropoutLayer = False, dropoutRate = 0.95):
        # Creating the RNN cell
        cell = tf.contrib.rnn.LSTMCell(rnnPerCellUnitsCount,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        
        # Attaching a dropout layer for the cell if required
        if requireDropoutLayer:
            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = dropoutRate)
        return cell
    
    '''def createBasicLSTMCell(self, rnnPerCellUnitsCount, requireDropoutLayer = False, dropoutRate = 0.95):
        # Creating the RNN cell
        cell = tf.contrib.rnn.BasicLSTMCell(rnnPerCellUnitsCount)
        # Attaching a dropout layer for the cell if required
        if requireDropoutLayer:
            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = dropoutRate)
            print('Finished createBasicLSTMCell ==> DropoutWrapper')
        return cell'''
    
    def doEncoding(self, rnnPerCellUnitsCount, inputTextLengths, rnnCellsCount, embeddedEncoderInput, dropoutRate):
        """
        This is the implementation of an encoding process.
        """
        for rnnCellIndex in range(rnnCellsCount):
            with tf.variable_scope('encoder_{}'.format(rnnCellIndex)):
                # Creating the forward RNN cell for the Bi-directional RNN
                forwardCell = self.createLSTMCell(rnnPerCellUnitsCount, 
                                             requireDropoutLayer = True, 
                                             dropoutRate = dropoutRate)
                
                # Creating the backward RNN cell for the Bi-directional RNN
                backwardCell = self.createLSTMCell(rnnPerCellUnitsCount, 
                                             requireDropoutLayer = True, 
                                             dropoutRate = dropoutRate)
                
                # Connecting the forward and backward cells to create a Bi-directional RNN
                encoderOutput, encoderStates = tf.nn.bidirectional_dynamic_rnn(forwardCell, 
                                                                    backwardCell, 
                                                                    embeddedEncoderInput,
                                                                    inputTextLengths,
                                                                    dtype=tf.float32)
                encoderOutput = tf.concat(encoderOutput, 2)
                # The current layer's output is being fed into next layer's input
                embeddedEncoderInput = encoderOutput
        return encoderOutput, encoderStates
    
    def processDecoderInput(self, targetData, wordToIntDict, batchSize, startToken):
        """
        Remove the last word id from each batch and concatenate the id of the STARTOFSEQUENCE to the 
        begining of each batch.
        """
        ending = tf.strided_slice(targetData, [0, 0], [batchSize, -1], [1, 1])
        decoderInput = tf.concat([tf.fill([batchSize, 1], wordToIntDict[startToken]), ending], 1)
        return decoderInput
        
    def processTrainingLayerForDecoder(self, embeddedDecoderInput, inputSummaryLengths, decoderCell,
                                      outputLayer, totalWordsCountInVocab, maximumSummaryLength,
                                      batchSize):
        """
        This is the implementation for a Training decoding layer.
        """
        trainingHelper = tf.contrib.seq2seq.TrainingHelper(inputs = embeddedDecoderInput,
                                                        sequence_length = inputSummaryLengths,
                                                        time_major = False)
        
        trainingDecoder = tf.contrib.seq2seq.BasicDecoder(cell = decoderCell,
                                                       helper = trainingHelper,
                                                       initial_state = decoderCell.zero_state(
                                                           dtype=tf.float32, batch_size=batchSize),
                                                       output_layer = outputLayer)
        
        trainingLogits = tf.contrib.seq2seq.dynamic_decode(trainingDecoder,
                                                           output_time_major = False,
                                                           impute_finished = True,
                                                           maximum_iterations = maximumSummaryLength)
        return trainingLogits
        
    def processInferenceLayerForDecoder(self, embeddingsMatrix, startOfSequenceToken, endOfSequenceToken,
                                       decoderCell, outputLayer, maximumSummaryLength, batchSize):
        """
        This is the implementation for an Inference decoding layer.
        """
        startTokens = tf.tile(tf.constant([startOfSequenceToken], dtype=tf.int32), 
                              [batchSize], 
                              name='start_tokens')
        
        inferenceHelper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddingsMatrix,
                                                                   startTokens,
                                                                   endOfSequenceToken)
        
        inferenceDecoder = tf.contrib.seq2seq.BasicDecoder(decoderCell,
                                                        inferenceHelper,
                                                        decoderCell.zero_state(
                                                            dtype=tf.float32, batch_size=batchSize),
                                                        outputLayer)
        
        inferenceLogits = tf.contrib.seq2seq.dynamic_decode(inferenceDecoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=maximumSummaryLength)
        
        return inferenceLogits
    
    def doDecoding(self, embeddedDecoderInput, embeddingsMatrix, encoderOutput, encoderStates,
                   totalWordsCountInVocab, inputTextLengths, inputSummaryLengths, maximumSummaryLength, 
                   rnnPerCellUnitsCount, wordToIntDict, dropoutRate, batchSize, rnnCellsCount, 
                   enableAttention = True):
        # Creating the RNN cell for the decoder
        decoderCell = tf.contrib.rnn.MultiRNNCell([self.createLSTMCell(rnnPerCellUnitsCount, requireDropoutLayer = True, dropoutRate = dropoutRate) for _ in range(rnnCellsCount)])

        # If an additional Attention layer needs to be applied
        if enableAttention:
            attentionMechanism = tf.contrib.seq2seq.BahdanauAttention(rnnPerCellUnitsCount,
                                                     encoderOutput,
                                                     inputTextLengths,
                                                     normalize = False,
                                                     name = 'BahdanauAttention')
            decoderCell = tf.contrib.seq2seq.AttentionWrapper(decoderCell, attentionMechanism, rnnPerCellUnitsCount)
            
        outputLayer = Dense(totalWordsCountInVocab, 
                            kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
        with tf.variable_scope("decode"):
            trainingLogits = self.processTrainingLayerForDecoder(embeddedDecoderInput,
                                                            inputSummaryLengths,
                                                            decoderCell,
                                                            outputLayer,
                                                            totalWordsCountInVocab,
                                                            maximumSummaryLength,
                                                            batchSize)
        with tf.variable_scope("decode", reuse=True):
            inferenceLogits = self.processInferenceLayerForDecoder(embeddingsMatrix,
                                                               wordToIntDict[embedding.specialTokens['STARTOFSEQUENCE']],
                                                               wordToIntDict[embedding.specialTokens['ENDOFSEQUENCE']],
                                                               decoderCell,
                                                               outputLayer,
                                                               maximumSummaryLength,
                                                               batchSize)
        return trainingLogits, inferenceLogits
    
    def process(self, inputData, targetData, dropoutRate, inputTextLengths, inputSummaryLengths, 
                maximumSummaryLength, totalWordsCountInVocab, rnnPerCellUnitsCount, 
                rnnCellsCount, wordToIntDict, batchSize, embeddingsMatrix):
        
        # Performing parallel lookups of inputData on the embeddingMatrix
        embeddedEncoderInput = tf.nn.embedding_lookup(embeddingsMatrix, inputData)
        
        # Performing the encoding
        encoderOutput, encoderStates = self.doEncoding(rnnPerCellUnitsCount,
                                                       inputTextLengths,
                                                       rnnCellsCount,
                                                       embeddedEncoderInput,
                                                       dropoutRate)
        
        # Process the decoder input before passing to decoding layer
        decoderInput = self.processDecoderInput(targetData, 
                                           wordToIntDict, 
                                           batchSize, 
                                           embedding.specialTokens['STARTOFSEQUENCE'])
        
        # Performing parallel lookups of decoder input on the embeddingMatrix
        embeddedDecoderInput = tf.nn.embedding_lookup(embeddingsMatrix, decoderInput)
        
        # Performing the encoding
        trainingLogits, inferenceLogits = self.doDecoding(embeddedDecoderInput,
                                                     embeddingsMatrix,
                                                     encoderOutput,
                                                     encoderStates,
                                                     totalWordsCountInVocab,
                                                     inputTextLengths,
                                                     inputSummaryLengths,
                                                     maximumSummaryLength,
                                                     rnnPerCellUnitsCount,
                                                     wordToIntDict,
                                                     dropoutRate,
                                                     batchSize,
                                                     rnnCellsCount)
        
        return trainingLogits, inferenceLogits

In [26]:
class BatchDataGenerator:
    """
    A class which helps in the generation of batches of data
    """
    @staticmethod
    def generateBatches(summaries, texts, batchSize, paddingToken):
        def padBatchContents(contents, paddingToken):
            maxContentLength = max([len(content) for content in contents])
            return [content + [paddingToken] * (maxContentLength - len(content)) for content in contents]
        possibleBatchCount = len(texts)//batchSize
        for batchIndex in range(0, possibleBatchCount):
            batchStartPoint = batchIndex * batchSize
            summariesBatch = summaries[batchStartPoint: batchStartPoint + batchSize]
            textBatch = texts[batchStartPoint: batchStartPoint + batchSize]
            paddedSummariesBatch = np.array(padBatchContents(summariesBatch, paddingToken))
            paddedTextBatch = np.array(padBatchContents(textBatch, paddingToken))
            
            # Need the lengths for the lengths parameters
            paddedSummariesLength = []
            for summary in paddedSummariesBatch:
                paddedSummariesLength.append(len(summary))

            paddedTextLength = []
            for text in paddedTextBatch:
                paddedTextLength.append(len(text))

            yield paddedSummariesBatch, paddedTextBatch, paddedSummariesLength, paddedTextLength

In [27]:
paddingToken = wordToIntDict[embedding.specialTokens['PADDING']]
print("'<PAD>' has id: {}".format(paddingToken))
sortedSummariesSamples = sortedSummaries[7:50]
sortedTextSamples = sortedText[7:50]
pad_summaries_batch_samples, pad_texts_batch_samples, pad_summaries_lengths_samples, pad_texts_lengths_samples = next(BatchDataGenerator.generateBatches(sortedSummariesSamples, sortedTextSamples, 5, paddingToken))
print("pad summaries batch samples:\n\r {}".format(pad_summaries_batch_samples))

'<PAD>' has id: 155834
pad summaries batch samples:
 [[   195   3179   1094   4796 155305   8324   8703 155269   2069   3471
    5749 155323 155351  17601   6472   3687 155272    119    695    322
    4082 155341 155313   1084 155270   3337   5294  27755 155834 155834
  155834 155834 155834 155834 155834 155834 155834 155834 155834]
 [ 19844  22845 155267   2674   1053    438  30564 155308 155269   4353
   22084    322   6146   4350 155305  12480  26407 155267  37460 155834
  155834 155834 155834 155834 155834 155834 155834 155834 155834 155834
  155834 155834 155834 155834 155834 155834 155834 155834 155834]
 [155269    251 155282    674   8423 155278   1293 155269    535 155295
  155289   6783 155277 155269   7906   1010 155834 155834 155834 155834
  155834 155834 155834 155834 155834 155834 155834 155834 155834 155834
  155834 155834 155834 155834 155834 155834 155834 155834 155834]
 [    31 155267   9595   1341    812 155270   7346    221 155293  12904
  155289   2728    433  1290

In [33]:
# Set the Hyperparameters
epochs = 100
batchSize = 50
rnnPerCellUnitsCount = 256
rnnCellsCount = 2
learningRate = 0.005
dropoutRate = 0.95

In [34]:
seq2seqModel = Seq2SeqModel()
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, dropout_rate, summary_length, max_summary_length, text_length = seq2seqModel.createModelInputsPlaceholders()

    # Create the training and inference logits
    trainingLogits, inferenceLogits = seq2seqModel.process(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      dropout_rate,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(wordToIntDict)+1,
                                                      rnnPerCellUnitsCount, 
                                                      rnnCellsCount, 
                                                      wordToIntDict,
                                                      batchSize,
                                                      embeddingsMatrix)
    
    # Create tensors for the training logits and inference logits
    trainingLogits = tf.identity(trainingLogits[0].rnn_output, 'logits')
    inferenceLogits = tf.identity(inferenceLogits[0].sample_id, name='predictions')
    
    # Create the weights for sequence_loss, the sould be all True across since each batch is padded
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            trainingLogits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learningRate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")
graph_location = "./graph"
print(graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(train_graph)

Graph is built.
./graph


In [35]:
# Subset the data for training
start = 150
end = start + 45000
print(len(sortedSummaries))
sampledSortedSummaries = sortedSummaries[start:end:15]
sampledSortedText = sortedText[start:end:15]
print(len(sampledSortedSummaries))
print("The shortest text length:", len(sampledSortedText[0]))
print("The longest text length:",len(sampledSortedText[-1]))

47962
3000
The shortest text length: 44
The longest text length: 434


In [37]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 10 # Check training loss after every 10 batches
stop_early = 0 
stop = 2 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 1 # Make 3 update checks per epoch
update_check = (len(sampledSortedText)//batchSize//per_epoch)-10

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model
paddingToken = wordToIntDict[embedding.specialTokens['PADDING']]
checkpoint = "./best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                BatchDataGenerator.generateBatches(sampledSortedSummaries, sampledSortedText, batchSize, paddingToken)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learningRate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 dropout_rate: dropoutRate})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if (batch_i+1) % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i+1, 
                              len(sampledSortedText) // batchSize, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if (batch_i+1) % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learningRate *= learning_rate_decay
        if learningRate < min_learning_rate:
            learningRate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/100 Batch   10/60 - Loss: 10.311, Seconds: 216.82
Epoch   1/100 Batch   20/60 - Loss:  6.616, Seconds: 243.38
Epoch   1/100 Batch   30/60 - Loss:  6.598, Seconds: 281.27
Epoch   1/100 Batch   40/60 - Loss:  6.622, Seconds: 313.89
Epoch   1/100 Batch   50/60 - Loss:  6.453, Seconds: 336.14
Average loss for this update: 7.32
New Record!
Epoch   1/100 Batch   60/60 - Loss:  6.670, Seconds: 405.37
Epoch   2/100 Batch   10/60 - Loss:  5.338, Seconds: 223.33
Epoch   2/100 Batch   20/60 - Loss:  5.745, Seconds: 251.36
Epoch   2/100 Batch   30/60 - Loss:  6.088, Seconds: 273.84
Epoch   2/100 Batch   40/60 - Loss:  6.194, Seconds: 331.02
Epoch   2/100 Batch   50/60 - Loss:  6.130, Seconds: 344.45
Average loss for this update: 5.899
New Record!
Epoch   2/100 Batch   60/60 - Loss:  6.345, Seconds: 396.47
Epoch   3/100 Batch   10/60 - Loss:  5.262, Seconds: 224.32
Epoch   3/100 Batch   20/60 - Loss:  5.708, Seconds: 256.19
Epoch   3/100 Batch   30/60 - Loss:  5.982, Seconds: 312.27
Epoch

Epoch  21/100 Batch   20/60 - Loss:  3.026, Seconds: 288.21
Epoch  21/100 Batch   30/60 - Loss:  3.204, Seconds: 284.62
Epoch  21/100 Batch   40/60 - Loss:  3.192, Seconds: 354.49
Epoch  21/100 Batch   50/60 - Loss:  3.169, Seconds: 365.54
Average loss for this update: 3.087
New Record!
Epoch  21/100 Batch   60/60 - Loss:  3.242, Seconds: 426.93
Epoch  22/100 Batch   10/60 - Loss:  2.675, Seconds: 232.91
Epoch  22/100 Batch   20/60 - Loss:  2.888, Seconds: 259.28
Epoch  22/100 Batch   30/60 - Loss:  3.069, Seconds: 308.21
Epoch  22/100 Batch   40/60 - Loss:  3.050, Seconds: 323.30
Epoch  22/100 Batch   50/60 - Loss:  3.033, Seconds: 371.63
Average loss for this update: 2.943
New Record!
Epoch  22/100 Batch   60/60 - Loss:  3.097, Seconds: 426.93
Epoch  23/100 Batch   10/60 - Loss:  2.570, Seconds: 230.41
Epoch  23/100 Batch   20/60 - Loss:  2.776, Seconds: 285.40
Epoch  23/100 Batch   30/60 - Loss:  2.926, Seconds: 306.33
Epoch  23/100 Batch   40/60 - Loss:  2.910, Seconds: 344.77
Epoc

Epoch  41/100 Batch   30/60 - Loss:  1.360, Seconds: 293.82
Epoch  41/100 Batch   40/60 - Loss:  1.340, Seconds: 338.20
Epoch  41/100 Batch   50/60 - Loss:  1.361, Seconds: 371.48
Average loss for this update: 1.299
New Record!
Epoch  41/100 Batch   60/60 - Loss:  1.373, Seconds: 435.99
Epoch  42/100 Batch   10/60 - Loss:  1.138, Seconds: 235.60
Epoch  42/100 Batch   20/60 - Loss:  1.230, Seconds: 266.34
Epoch  42/100 Batch   30/60 - Loss:  1.300, Seconds: 303.14
Epoch  42/100 Batch   40/60 - Loss:  1.275, Seconds: 343.67
Epoch  42/100 Batch   50/60 - Loss:  1.296, Seconds: 380.69
Average loss for this update: 1.248
New Record!
Epoch  42/100 Batch   60/60 - Loss:  1.317, Seconds: 431.62
Epoch  43/100 Batch   10/60 - Loss:  1.092, Seconds: 234.84
Epoch  43/100 Batch   20/60 - Loss:  1.179, Seconds: 272.12
Epoch  43/100 Batch   30/60 - Loss:  1.251, Seconds: 323.37
Epoch  43/100 Batch   40/60 - Loss:  1.233, Seconds: 343.20
Epoch  43/100 Batch   50/60 - Loss:  1.261, Seconds: 379.99
Aver

In [48]:
newsIndex = 165
totalNewsCount = len(textToNumberSequence)
testNews = [textToNumberSequence[newsIndex]]
maxSummaryLength = len(news['Summary'][newsIndex])
print(testNews)

[[656, 7915, 11957, 11958, 1151, 2521, 1220, 1010, 7906, 286, 814, 4107, 11959, 886, 800, 11960, 172, 3010, 2979, 886, 4384, 11961, 1698, 7197, 172, 1254, 352, 11962, 2012, 10, 771, 396, 239, 849, 2852, 773, 1593, 3288, 402, 172, 1220, 8928, 4276, 1864, 3959, 2526, 69, 7906, 11957, 4205, 1151, 1136, 4109, 6403, 4095, 886, 2467, 370, 1010, 1151, 4107, 11959, 6403, 1392, 11963, 372, 4242, 3085, 718, 11360, 8115, 216, 2695, 10, 322, 11964, 11965, 1069, 666, 8125, 1366, 10, 11966, 504, 172, 651, 343, 788, 11957, 244, 10, 656, 1668, 1068, 9815, 4804, 644, 11967, 1879, 761, 1304, 7827, 4242, 3707, 892, 1151, 3898, 4256, 1318, 285, 11968, 147, 4242, 445, 7281, 406, 11969, 7827, 4242, 372, 886, 800, 3777, 814, 3689, 4269, 11970, 11968, 147, 4242, 760, 3105, 3898, 773, 11971, 5628, 1151, 9580, 11972, 2012, 11973, 3131, 4632, 3978, 2797, 2902, 257, 372, 1421, 238, 402, 240, 11974, 118, 1315, 3595, 3898, 7686, 11975, 4301, 2647, 11976, 286, 2315, 1220, 11977, 11916, 7402, 11978, 2647, 1705, 892, 

In [49]:
checkpoint = "./best_model.ckpt"
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('inputData:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('inputTextLengths:0')
    summary_length = loaded_graph.get_tensor_by_name('inputSummaryLengths:0')
    dropout_rate = loaded_graph.get_tensor_by_name('dropoutRate:0')
    #Multiply by batch_size to match the model's input parameters
    for i, text in enumerate(testNews):
        answer_logits = sess.run(logits, {input_data: [text]*batchSize, 
                                          summary_length: [maxSummaryLength], #summary_length: [np.random.randint(5,8)], 
                                          text_length: [len(text)]*batchSize,
                                          dropout_rate: 1.0})[0] 
        # Remove the padding from the summaries
        pad = wordToIntDict["<PAD>"] 
        #print('- News:\n\r {}\n\r\n\r'.format(" ".join([intToWordDict[j] for j in testNews[i] if j != pad])))
        print('- News:\n\r {}\n\r\n\r'.format(news['Text'][newsIndex]))
        print('- Actual Summary:\n\r {}\n\r\n\r'.format(news['Summary'][newsIndex]))
        print('- Predicted Summary:\n\r {}\n\r\n\r'.format(" ".join([intToWordDict[j] for j in answer_logits if j != pad])))

INFO:tensorflow:Restoring parameters from ./best_model.ckpt
- News:
 president viktor yanukovych defended government handling political crisis ukraine thursday saying fulfilled obligations opposition leaders stoking people anger gain opposition continues escalate situation encourage people maintain protests icy streets said address posted website think wrong must understand future state people political interests certain groups set higher existence ukraine yanukovych insisted government lived concrete agreements reached opposition try end crisis government fulfilled obligations agreements including adoption law amnesty guarantees freedom liberation persons arrested conflict said also appealed ukrainians everything peace normal life said regrets young people died confrontation earlier yanukovych office said president sick leave acute respiratory disease accompanied fever country parliament approved amnesty bill anti government protesters extraordinary session wednesday unacceptable cond