In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt

In [2]:
dataset = pd.read_csv('news_summary.csv')#,encoding="ISO-8859-1")
dataset.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",Virgin' now corrected to 'Unmarried' in IGIMS'...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [3]:
dataset = dataset.dropna(how='any',axis=0) 
x = dataset['ctext']
y = dataset['text']
headline = dataset['headlines']
del dataset

In [4]:
def preprocess_sentence(w):
    w = w.lower().strip()

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.rstrip().strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [5]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, lang_tokenizer

In [6]:
x = [preprocess_sentence(w) for w in x]
y = [preprocess_sentence(w) for w in y]
headline = [preprocess_sentence(w) for w in headline]
x_train, inp_lang = tokenize(x)
y_train, targ_lang = tokenize(y)
headline_train , headline_lang = tokenize(headline)
print("Input Tensor Shape :" , x_train.shape , "Target Tensor Shape :" , y_train.shape)
print("Preprocessed Text : \n" , x[10])
print("Tokenized Text : \n" , x_train[10])
print("Headline Text : \n" , headline[10])
print("Headline Tokenized : \n" , headline_train[10])

Input Tensor Shape : (4395, 5791) Target Tensor Shape : (4395, 86)
Preprocessed Text : 
 <start> the food safety and standards authority of india fssai is in the process of creating a network of food banking partners to collect and distribute leftover food from large parties and weddings to the hungry . a notification to create a separate category of food business operators fbos , who will be licensed to deal only with leftover food , has been drafted to ensure the quality of food . ? we are looking at partnering with ngos or organisations that collect , store and distribute surplus food to ensure they maintain certain hygiene and health standards when handling food , ? said pawan agarwal , ceo of fssai . ? tonnes of food is wasted annually . we are looking at creating a mechanism through which food can be collected from restaurants , weddings , large scale parties , ? says pawan agarwal , ? all food , whether it is paid for or distributed free , must meet the country ? s food safety a

In [7]:
class Encoder(tf.keras.Model):
    '''
    The code of encoder is used for text summarization. The model needs an initial_state which is the headline of the article.
    The input is supposed to be padded sequence for initial state as well as for the input. The model cuurently implements
    GRU for the encoder sequence as there is some bug with using lstm with bidirectional. Bidirectional was also tried with 
    GRU but same bug occured.
    
    The initialization is based on docuemnt context vector paper referenced from 
    https://arxiv.org/pdf/1807.08000.pdf paper named 
    Abstractive and Extractive Text Summarization using DocumentContext Vector and Recurrent Neural Networks.
    
    However the logic is modeified. The model takes the preprocessed tokenized padded sequence as input for initials state
    and the then it is passed through the embedded vector and the size of the output vector after embedding is (batch_size , time , embedding size)
    after this the model is averaged about the time axis and gives the output as (batch_size , embedding size).
    This tensor is then passed as input to dence vector and is used to give the output size of (batch_zie , lgur_hidden_units)
    '''
    def __init__(self,units = 16,activation='tanh',recurrent_activation='sigmoid',return_state=True,return_sequence=True , vocab_size=30000 , embedding_dim=30 , batch_size=2):
        super (Encoder , self).__init__()
        self.units = units
        self.activation = activation
        self.recurrent_activation = recurrent_activation
        self.return_state = return_state
        self.return_sequence = return_sequence
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
#         self.lstm = tf.keras.layers.LSTM(units=self.units , activation=self.activation , recurrent_activation=self.activation, return_sequences=self.return_sequence , return_state=self.return_state)
        self.gru = tf.keras.layers.GRU(units=self.units , activation=self.activation , recurrent_activation=self.activation, return_sequences=self.return_sequence , return_state=self.return_state)
#         self.bidirectional = tf.keras.layers.Bidirectional(self.gru)
        self.dense = tf.keras.layers.Dense(units=self.units , input_shape = (self.embedding_dim,))
    
    def call(self,x,headline):
        x = self.embedding(x)
        print(x.shape)
        headline = self.embedding(headline)
        headline = tf.reduce_mean(headline,axis=1)
        headline  = self.dense(headline)
        output , state = self.gru(x,initial_state=headline)
        return output , state

In [8]:
encoder = Encoder()

In [9]:
output , state = encoder(x_train[:2] , headline_train[:2])
print(output)
print(state)

(2, 5791, 30)
tf.Tensor(
[[[ 0.02062547  0.06393021 -0.00950941 ...  0.00337951  0.03245128
   -0.00304279]
  [ 0.01858877  0.02067099 -0.00554727 ... -0.01323937  0.00283354
    0.00513096]
  [-0.01093131 -0.04160754 -0.00731713 ...  0.00408089  0.04524069
    0.02155972]
  ...
  [ 0.0017015   0.02574603  0.0404877  ... -0.03360253  0.02527002
    0.01492347]
  [ 0.0017015   0.02574603  0.0404877  ... -0.03360253  0.02527002
    0.01492347]
  [ 0.0017015   0.02574603  0.0404877  ... -0.03360253  0.02527002
    0.01492347]]

 [[ 0.02114395  0.06358767 -0.00943915 ...  0.00340502  0.03268466
   -0.00293377]
  [-0.0082793  -0.02797831  0.01635849 ... -0.03019698  0.0143802
   -0.02463259]
  [-0.00127468 -0.01066426 -0.00922157 ... -0.02539098 -0.00258767
    0.00905837]
  ...
  [ 0.0017015   0.02574603  0.0404877  ... -0.03360253  0.02527002
    0.01492347]
  [ 0.0017015   0.02574603  0.0404877  ... -0.03360253  0.02527002
    0.01492347]
  [ 0.0017015   0.02574603  0.0404877  ... -0.033

In [10]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [11]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(state, output)
print(attention_result)
print(attention_weights)

tf.Tensor(
[[ 0.00165956  0.024303    0.03754503  0.00435443  0.01489141  0.01777105
   0.00478836  0.02164434  0.04274286 -0.00514809 -0.01843343 -0.03887654
   0.03769667 -0.03141996  0.02352603  0.01377087]
 [ 0.00151765  0.02418846  0.03722648  0.00436103  0.0150664   0.01752903
   0.00466847  0.02137492  0.04241479 -0.00519783 -0.01839531 -0.03835709
   0.03747584 -0.03126115  0.02323024  0.01374478]], shape=(2, 16), dtype=float32)
tf.Tensor(
[[[0.00017624]
  [0.00016342]
  [0.00017594]
  ...
  [0.00017229]
  [0.00017229]
  [0.00017229]]

 [[0.00017618]
  [0.00018363]
  [0.00019861]
  ...
  [0.00017216]
  [0.00017216]
  [0.00017216]]], shape=(2, 5791, 1), dtype=float32)


In [12]:

class Decoder(tf.keras.Model):
    
    def __init__(self, units = 16,activation='tanh',recurrent_activation='sigmoid',return_state=True,return_sequence=False , vocab_size=30000 , embedding_dim=30 , batch_size=2):
        super(Decoder, self).__init__()
        self.units = units
        self.activation = activation
        self.recurrent_activation = recurrent_activation
        self.return_state = return_state
        self.return_sequence = return_sequence
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
#         self.gru = tf.keras.layers.GRU(self.dec_units,
#                                        return_sequences=True,
#                                        return_state=True,
#                                        recurrent_initializer='glorot_uniform')
        self.gru = tf.keras.layers.GRU(units=self.units , activation=self.activation , recurrent_activation=self.activation, return_sequences=self.return_sequence , return_state=self.return_state)

        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.units)

    def call(self, x, hidden, output):
    # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[1]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [13]:
decoder = Decoder()

sample_decoder_output, _, _ = decoder(tf.random.uniform((2, 1)),state,output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (2, 30000)


In [15]:
import numpy as np
filename = 'glove.6B.50d.txt'
def loadGloVe(filename):
    vocab = []
    embd = []
    file = open(filename,'r')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    print('Loaded GloVe!')
    file.close()
    return vocab,embd
vocab,embd = loadGloVe(filename)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)

Loaded GloVe!


In [16]:
embedding_variable = tf.Variable(embedding)
embedding_variable.shape

TensorShape([400000, 50])

list