In [None]:
# keras and tensorflow > 2.0

In [None]:
# 1st step is data collection
# 2. Preprocess the data - Train and Test
# 3. Create an stacked LSTM model
# 4. Predict the test data and plot the output.
# 5. predict the future (some days) and plot the output.


In [6]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# loading the dataset
headlines = []
df=pd.read_csv("ArticlesApril2017.csv")
headlines.extend(list(df.headline.values))
headlines = [line for line in headlines if line!= "Unknown"]
print(*headlines[:10], sep = "\n")

Finding an Expansive View  of a Forgotten People in Niger
And Now,  the Dreaded Trump Curse
Venezuela’s Descent Into Dictatorship
Stain Permeates Basketball Blue Blood
Taking Things for Granted
The Caged Beast Awakens
An Ever-Unfolding Story
O’Reilly Thrives as Settlements Add Up
Mouse Infestation
Divide in G.O.P. Now Threatens Trump Tax Plan


In [8]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
# cleaning the data step by step 
import string
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt


Defined a function that takes a single headline at a time and returns the cleaned headline. 
Using iteration we have passed each headline and made a list of cleaned data corpus.

In [10]:
corpus = [clean_text(x) for x in headlines]
corpus[:10]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'oreilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan']

Generating n-gram squence for Training

-nlp model requires sequential input data and input word\tokenmust be numerical.

-Goal is to generate n-grams in order to train our model for next word prediction. 

In [2]:
import nltk
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

In [3]:

tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

In [11]:
inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(*inp_sequences[:10], sep="\n")

[169, 17]
[169, 17, 665]
[169, 17, 665, 367]
[169, 17, 665, 367, 4]
[169, 17, 665, 367, 4, 2]
[169, 17, 665, 367, 4, 2, 666]
[169, 17, 665, 367, 4, 2, 666, 170]
[169, 17, 665, 367, 4, 2, 666, 170, 5]
[169, 17, 665, 367, 4, 2, 666, 170, 5, 667]
[6, 80]


In [None]:
#conclusion: inp_sequence is an n-gram seq reqired for training next word prediction

In [None]:
# Padding the sequence:
#-variable sequence length is not favourable for Training using padding we make
# -every sequence of same length.

In [31]:
from tensorflow import keras
import tensorflow as tf
import keras.utils 
from keras import utils as np_utils
from keras.utils.np_utils import to_categorical
#from keras.utils.np_utils.to_categorical
#from tensorflow.keras.utils import to_categorical
import keras


In [32]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

In [33]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
# predictors: are the tokens that will be used as input for predicting the next word
# label : next word to be predicted.
# max_sequence_len : is the seq length
# pad_sequence : is used to pad an array of tokens to same length.


In [None]:
# Model Creation :
# so far data for Training is prepared!
# we will create LSTM model which takes predictors as input X and labels as input Y

1. Input Layer: takes input sequence.
2. LSTM Layer: It calculates the output using LSTM units and returns hidden and cell states. 
3. Dropout Layer: This layer is responsible for regularisation which means it prevents over-fitting. this is done by turning off the activations of some neurons in the LSTM layer.
4. Output Layer: This Computes the probability of our prediction.

In [34]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    # ----------Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    # ----------Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    # ----------Add Output Layer
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [35]:
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 10)            24220     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2422)              244622    
                                                                 
Total params: 313,242
Trainable params: 313,242
Non-trainable params: 0
_________________________________________________________________


Training the model

In [107]:
model.fit(predictors, label, epochs=95, verbose=95)

Epoch 1/95
Epoch 2/95
Epoch 3/95
Epoch 4/95
Epoch 5/95
Epoch 6/95
Epoch 7/95
Epoch 8/95
Epoch 9/95
Epoch 10/95
Epoch 11/95
Epoch 12/95
Epoch 13/95
Epoch 14/95
Epoch 15/95
Epoch 16/95
Epoch 17/95
Epoch 18/95
Epoch 19/95
Epoch 20/95
Epoch 21/95
Epoch 22/95
Epoch 23/95
Epoch 24/95
Epoch 25/95
Epoch 26/95
Epoch 27/95
Epoch 28/95
Epoch 29/95
Epoch 30/95
Epoch 31/95
Epoch 32/95
Epoch 33/95
Epoch 34/95
Epoch 35/95
Epoch 36/95
Epoch 37/95
Epoch 38/95
Epoch 39/95
Epoch 40/95
Epoch 41/95
Epoch 42/95
Epoch 43/95
Epoch 44/95
Epoch 45/95
Epoch 46/95
Epoch 47/95
Epoch 48/95
Epoch 49/95
Epoch 50/95
Epoch 51/95
Epoch 52/95
Epoch 53/95
Epoch 54/95
Epoch 55/95
Epoch 56/95
Epoch 57/95
Epoch 58/95
Epoch 59/95
Epoch 60/95
Epoch 61/95
Epoch 62/95
Epoch 63/95
Epoch 64/95
Epoch 65/95
Epoch 66/95
Epoch 67/95
Epoch 68/95
Epoch 69/95
Epoch 70/95
Epoch 71/95
Epoch 72/95
Epoch 73/95
Epoch 74/95
Epoch 75/95
Epoch 76/95
Epoch 77/95
Epoch 78/95
Epoch 79/95
Epoch 80/95
Epoch 81/95
Epoch 82/95
Epoch 83/95
Epoch 84/95
E

<keras.callbacks.History at 0x7fa8546ce710>

In [None]:
# We have trained our model architecture and now it’s ready to generate text. 
# We need to write a function to predict the next word based on the input words. 
# We also have to tokenize the sequence and pad it with the same sequence_length we provided for training, and then we will append each predicted word as a string.

In [108]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],              maxlen=max_sequence_len-1, padding='pre')
        #predicted = model.predict_classes(token_list, verbose=0)
        predicted = np.argmax(model.predict(token_list),axis=1)       


        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [109]:
# set seeds for reproducability
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

1. seed_text : it’s the initial words that will be passed for text generation.
2. predict_classes: it will return the token id for the predicted word.
3. predicted: Its token id for predicted word and this will be converted back into a word using the dictionarytokenizer.word_index .items()
4. next_words: It’s the number of next words we want to be predicted.


In [110]:
print (generate_text("finding an expansive view  of a forgotten", 3, model, max_sequence_len))
print (generate_text("venezuelas", 3, model, max_sequence_len))
print (generate_text("stain", 4, model, max_sequence_len))
print (generate_text("taking things", 2, model, max_sequence_len))
print (generate_text("oreilly", 3, model, max_sequence_len))
print (generate_text("divide in gop", 5, model, max_sequence_len))

Finding An Expansive View  Of A Forgotten People In Niger
Venezuelas Descent Into Dictatorship
Stain Permeates Basketball Blue Blood
Taking Things For Granted
Oreilly Thrives As Settlements
Divide In Gop Now Threatens Trump Tax Plan


In [None]:
# Conclusion: 
# implemented text generation using LSTM model
# The Trained model worked perfectly. 
