In [0]:
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import time
import re
import seaborn as sns
from tensorflow.keras.layers import LSTM, TimeDistributed, Dense, Bidirectional, Input, Embedding 
from tensorflow.keras.layers import Dropout, Conv1D, Flatten
from tensorflow.keras.layers import Concatenate, Dot, Activation
from tensorflow.keras.models import Model
import os
import collections
import pickle

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    return model(input)

In [0]:
newsdf = pd.read_csv("./cleannewsdata.csv")
newsdf.Summary = newsdf.Summary.apply(lambda s: s[6:])
def cleaner(s):
    s = re.sub("[.?%$0-9!&*+-/:;<=\[\]£]"," ", s)
    return " "+" ".join(s.split())
newsdf.Summary = newsdf.Summary.apply(cleaner)

In [3]:
np.random.seed(1)
testindices = np.random.choice(newsdf.shape[0], np.int(0.2*newsdf.shape[0]), replace=False)
trainindices = np.sort(list(set(np.arange(newsdf.shape[0]))-set(testindices)))
traindf, testdf = newsdf.iloc[trainindices], newsdf.iloc[testindices]
traindf.shape, testdf.shape

((1616, 4), (403, 4))

In [0]:
vocab = np.unique([word for sent in newsdf.Summary.apply(lambda s: list(s)).values for word in sent])
def windowed_summary(s, WINDOW_LENGTH=100):
    summ = np.zeros((len(s)-WINDOW_LENGTH, WINDOW_LENGTH))
    nextchar = np.zeros(len(s)-WINDOW_LENGTH, dtype='<U1')
    for i in range(WINDOW_LENGTH, len(s)):
        summ[i-WINDOW_LENGTH,:] = [np.where(vocab==r)[0][0] for r in list(s[i-WINDOW_LENGTH:i])]
        nextchar[i-WINDOW_LENGTH] = s[i]
    return summ, nextchar

In [0]:
def LSTM_data(df, WINDOW_LENGTH=100):
    chararray = np.zeros((df.Summary.apply(lambda s: len(s)-WINDOW_LENGTH).sum(), WINDOW_LENGTH))
    predarray = np.zeros((df.Summary.apply(lambda s: len(s)-WINDOW_LENGTH).sum(), vocab.shape[0]))
    pos = 0
    for i in range(df.shape[0]):
    chars, nextval = windowed_summary(df.iloc[i]['Summary'])
    chararray[pos:pos+chars.shape[0],:] = chars
    for j in range(pos, pos+nextval.shape[0]):
        predarray[j,np.where(vocab==nextval[j-pos])[0][0]] = 1
    pos+=chars.shape[0]
    return chararray, predarray

In [6]:
start = time.time()
trainX, trainY = LSTM_data(traindf)
testX, testY = LSTM_data(testdf)
print("Data Generation Exited in "+str(time.time()-start))
trainX.shape, trainY.shape, testX.shape, testY.shape

Data Generation Exited in 293.3199450969696


((907172, 100), (907172, 27), (225456, 100), (225456, 27))

In [7]:
#Generate Universal Sentence Encodings
trainstateX = embed(traindf.Text.values).numpy()
trainstateX = np.repeat(trainstateX, traindf.Summary.apply(lambda s: len(s)-100).values, 0)
teststateX = embed(testdf.Text.values).numpy()
teststateX = np.repeat(teststateX, testdf.Summary.apply(lambda s: len(s)-100).values, 0)
trainstateX.shape, teststateX.shape

((907172, 512), (225456, 512))

In [8]:
latentdim = 512
tf.keras.backend.clear_session()
state = Input(shape=(latentdim,))
decinput = Input(shape=(100,))
embed_layer = Embedding(vocab.shape[0], vocab.shape[0], weights=[np.eye(vocab.shape[0])], 
                           trainable=False, input_length=100)
embedval = embed_layer(decinput)
lstm_layer1 = LSTM(latentdim, return_sequences=True, return_state=True)
lstm1val, _, _ = lstm_layer1(embedval, initial_state=[state, state])
lstm1val = Dropout(0.2)(lstm1val)
lstm_layer2 = Bidirectional(LSTM(latentdim, return_sequences=True, return_state=True))
lstm2val, _, _, _, _ = lstm_layer2(lstm1val, initial_state=[state, state, state, state])
lstm2val = Dropout(0.2)(lstm2val)
lstm_layer3 = LSTM(latentdim, return_sequences=False, return_state=True)
lstm3val, _, _ = lstm_layer3(lstm2val, initial_state=[state, state])
lstm3val = Dropout(0.2)(lstm3val)
dense_layer = Dense(vocab.shape[0], activation="softmax")
output = dense_layer(lstm3val)
mdl = Model(inputs=[decinput, state], outputs=output)
mdl.compile(optimizer="adam", loss="categorical_crossentropy")
mdl.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 27)      729         input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 100, 512), ( 1105920     embedding[0][0]                  
                                                                 input_1[0][0]                

In [9]:
chckpt = tf.keras.callbacks.ModelCheckpoint("./newspred.h5", monitor='val_loss', save_best_only=True,
                                            save_weights_only=True, save_freq='epoch')
hist = mdl.fit([trainX, trainstateX], trainY, callbacks=[chckpt], verbose=True, batch_size=1000, epochs=10,
               validation_data=([testX, teststateX], testY))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
mdl.load_weights("./newspred.h5")

In [0]:
def beamer(start, state, k, toplimit=10):
    returnvals = collections.deque()
    pred = mdl.predict([start, state])
    if k==1:
        returnvals.append(np.argmax(pred[0]))
        return np.max(pred[0]), returnvals
    else:
        maxval, beamseq = None, None
        topchoices = np.argsort(pred[0])[-toplimit:]
        for j in topchoices:
            chars = start.copy()
            chars[0,:-1] = chars[0,1:]
            chars[0,-1] = j
            val, shortseq = beamer(chars, state, k-1)
            if (not maxval) or ((val*pred[0,j])>maxval):
                maxval = val*pred[0,j]
                beamseq = shortseq
                beamseq.appendleft(j)
        return maxval, beamseq

In [0]:
def generate_text(start, state, k):
    start = start.copy().reshape(1, start.shape[-1])
    state = state.copy().reshape(1, state.shape[-1])
    seq = "".join([vocab[np.int(char)] for char in start[0]])+"|"
    for _ in range(200):
        maxval, beamseq = beamer(start.copy(), state.copy(), k)
        seq+="".join([vocab[np.int(i)] for i in beamseq])
        start[0,:-k] = start[0,k:]
        start[0,-k:] = beamseq
    return seq