In [1]:
import keras
from keras.layers import Dense , LSTM , Dropout , Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

from tensorflow import set_random_seed
import numpy as np
set_random_seed(2)
np.random.seed(10)

import pandas as pd
import os

import string

Using TensorFlow backend.


In [2]:
# Load the headlines of one NYT "Articles" CSV found in ./nyt-comments.
# The data folder is resolved against the current working directory.
folder_path = os.path.join(os.getcwd(), 'nyt-comments')
headlines = []
# os.listdir() returns entries in arbitrary order; sorting makes the file that
# is picked before the `break` the same on every machine and every re-run.
for file_name in sorted(os.listdir(folder_path)):
    if 'Articles' in file_name:
        data = pd.read_csv(os.path.join(folder_path, file_name))
        headlines.extend(data.headline.values)
        break  # only the first matching file is loaded
# Drop the 'Unknown' placeholder and any non-string (e.g. NaN) headline; the
# cleaning step iterates over each headline's characters and needs real text.
headlines = [h for h in headlines if isinstance(h, str) and h != 'Unknown']
len(headlines)

777

In [3]:
# Preview: the first headline with every ASCII punctuation character removed.
headlines[0].translate(str.maketrans('', '', string.punctuation))

' GOP Leadership Poised to Topple Obama’s Pillars'

In [4]:
# Normalise every headline: delete ASCII punctuation characters, then lower-case.
strip_punctuation = str.maketrans('', '', string.punctuation)
final_text = [
    headline.translate(strip_punctuation).lower()
    for headline in headlines
]
final_text[:10]

[' gop leadership poised to topple obama’s pillars',
 'fractured world tested the hope of a young president',
 'little troublemakers',
 'angela merkel russia’s next target',
 'boots for a stranger on a bus',
 'molder of navajo youth where a game is sacred',
 '‘the affair’ season 3 episode 6 noah goes home',
 'sprint and mr trump’s fictional jobs',
 'america  becomes a stan',
 'fighting diabetes and leading by example']

In [5]:
# Fit the vocabulary on the cleaned headlines.
tokenize = Tokenizer()
tokenize.fit_on_texts(final_text)
# One more than the vocabulary size (presumably so index 0, which never appears
# in word_index, still has a slot — TODO confirm).
total_words = 1 + len(tokenize.word_index)

# For each headline emit every prefix of its token ids that has at least two
# tokens: [w1, w2], [w1, w2, w3], ... up to the full headline.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenize.texts_to_sequences(final_text)
    for end in range(2, len(token_ids) + 1)
]
input_sequences[:10]

[[70, 300],
 [70, 300, 607],
 [70, 300, 607, 3],
 [70, 300, 607, 3, 608],
 [70, 300, 607, 3, 608, 203],
 [70, 300, 607, 3, 608, 203, 609],
 [610, 40],
 [610, 40, 611],
 [610, 40, 611, 1],
 [610, 40, 611, 1, 204]]

In [6]:
# Left-pad every n-gram to the length of the longest one so they stack into a
# single 2-D array.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

# Everything but the last column is the model input; the last column is the
# word to predict, one-hot encoded over the whole vocabulary.
predictors = input_sequences[:, :-1]
label = keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [7]:
# Next-word classifier: embedding -> LSTM -> dropout -> softmax over the vocabulary.
model = Sequential([
    # Input is one token shorter than the padded n-grams (the last token is the label).
    Embedding(total_words, 10, input_length=max_sequence_len - 1),
    LSTM(100),
    Dropout(0.2),
    Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [8]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 21, 10)            22890     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2289)              231189    
Total params: 298,479
Trainable params: 298,479
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train on the padded n-gram prefixes for 100 epochs, logging progress per epoch.
model.fit(
    x=predictors,
    y=label,
    epochs=100,
    verbose=1,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f7fa5a843c8>

In [14]:
def generate_text(seed_text, next_words, model, max_sequence_len, tokenizer=None):
    """Extend ``seed_text`` with ``next_words`` words predicted by ``model``.

    Each round tokenizes the text produced so far, left-pads it to
    ``max_sequence_len - 1`` tokens, takes the highest-scoring class returned
    by ``model.predict`` and appends the matching vocabulary word.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to append.
        model: Model exposing ``predict``; it is called with an array of
            shape ``(1, max_sequence_len - 1)``.
        max_sequence_len: The text is padded/truncated to one token less
            than this value before being fed to the model.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``.
            Defaults to the notebook-level ``tokenize``.

    Returns:
        The extended text passed through ``str.title()``.
    """
    if tokenizer is None:
        tokenizer = tokenize
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes() is deprecated and was removed in
        # TensorFlow 2.6; argmax over predict() selects the same class and
        # yields a plain int rather than a 1-element array.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index without a vocabulary entry contributes an empty word.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [17]:
# Seed phrases paired with the number of words to append to each.
prompts = [
    ('Nilesh', 3),
    ('america', 4),
    ('India', 5),
    ('UK', 4),
    ('History', 11),
]
for seed, n_words in prompts:
    print(generate_text(seed, n_words, model, max_sequence_len))

Nilesh We Diet New
America Leader To Fad Found
India Going On The Lights Go
Uk We Diet New Year
History Trump Not A Little Downside Of The ‘Galloping Gourmet’ Forged In
