In [1]:
import os

import numpy as np
import keras.utils as ku
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Vocabulary-size placeholder; the training cell rebinds it from the value
# returned by dataset_preparation(), and generator() reads it at call time.
total_words = 0
# Shared tokenizer: fitted inside dataset_preparation() and reused by
# generate_text() to encode the seed text and decode predictions.
tokenizer = Tokenizer()

In [3]:
def dataset_preparation(data):
    """Turn raw text into padded n-gram training sequences.

    The text is lower-cased and split on newlines, the module-level
    ``tokenizer`` is fitted on the resulting lines, and every line is expanded
    into all of its prefixes of length >= 2 (``[w1, w2]``, ``[w1, w2, w3]``,
    ...). The prefixes are left-padded to a common length and split into the
    leading tokens (model input) and the final token (prediction target).

    Args:
        data: The whole corpus as a single string, one sentence per line.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)`` where
        ``predictors`` is every column but the last of the padded sequences,
        ``label`` is the last column (integer word ids, not one-hot),
        ``max_sequence_len`` is the padded sequence length and
        ``total_words`` is ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If no line of ``data`` contains at least two known tokens,
            so that no training sequence can be built.
    """
    # basic cleanup
    corpus = data.lower().split("\n")

    # tokenization (side effect: fits the shared module-level tokenizer)
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Encode all lines in one call, then expand each encoded line into its
    # prefixes of length 2..len(line).
    input_sequences = [
        token_list[:end]
        for token_list in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list) + 1)
    ]
    if not input_sequences:
        raise ValueError(
            "dataset_preparation: no line with at least two tokens was found; "
            "cannot build any training sequence"
        )

    # pad sequences on the left so the target is always the last column
    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    print(max_sequence_len)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # create predictors and label; the label stays as integer ids here and is
    # one-hot encoded batch by batch in generator().
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

    return predictors, label, max_sequence_len, total_words


In [4]:
def create_model(predictors, label, max_sequence_len, total_words):
    """Build and compile the stacked-LSTM next-word classifier.

    Args:
        predictors: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        label: Not used by this function; kept for the same reason.
        max_sequence_len: Length of the padded n-gram sequences. The network
            input length is ``max_sequence_len - 1``.
        total_words: Vocabulary size; used as the embedding input dimension
            and as the width of the softmax output layer.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model


In [5]:
def generator(sentence_list, next_word_list, batch_size):
    """Yield training batches forever, cycling through the samples in order.

    Each batch holds ``batch_size`` consecutive samples; when the end of
    ``sentence_list`` is reached the cursor wraps back to the first sample, so
    a batch may span the wrap-around point.

    Args:
        sentence_list: Indexable collection of input sequences.
        next_word_list: Indexable collection of integer target ids, aligned
            with ``sentence_list``.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``(x, y)`` where ``x`` is an array of the input sequences and ``y`` is
        an array of their one-hot targets. The one-hot width is read from the
        module-level ``total_words`` at the time each batch is built.
    """
    cursor = 0
    while True:
        batch_inputs, batch_targets = [], []
        for _ in range(batch_size):
            batch_inputs.append(sentence_list[cursor])
            one_hot_target = ku.to_categorical(next_word_list[cursor], num_classes=total_words)
            batch_targets.append(one_hot_target)
            # Advance and wrap around once the last sample has been consumed.
            cursor = (cursor + 1) % len(sentence_list)
        yield np.array(batch_inputs), np.array(batch_targets)

In [6]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    On every step the current text is encoded with the module-level
    ``tokenizer``, left-padded to the model's input length, and the class
    predicted by ``model`` is appended as the next word.

    Args:
        seed_text: Starting text.
        next_words: Number of words to append.
        max_sequence_len: Padded sequence length used during training; the
            model input length is ``max_sequence_len - 1``.
        model: Trained model exposing ``predict_classes``.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        If a predicted id has no entry in the vocabulary, an empty word is
        appended (leaving just the separating space).
    """
    # Build the id -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)

        # predict_classes returns one id per input row; a single row is passed.
        output_word = index_to_word.get(int(predicted[0]), "")
        seed_text += " " + output_word
    return seed_text

In [7]:
# Read the whole training corpus into memory; the context manager closes the
# file handle as soon as the text has been read.
with open('en_US_3.txt', encoding="utf8") as corpus_file:
    data = corpus_file.read()

In [8]:
# Samples per training batch; the training cell passes it to generator() and
# uses it to derive steps_per_epoch.
BATCH_SIZE = 64

In [17]:
# Build the training set and the network. Rebinding the module-level
# total_words here is what gives generator() the real vocabulary size.
predictors, label, max_sequence_len, total_words = dataset_preparation(data)
print(predictors.shape, label.shape)
model = create_model(predictors, label, max_sequence_len, total_words)

# Resume from a previous checkpoint when one exists. On a fresh run the file
# has not been written yet, so training simply starts from scratch.
if os.path.exists('model_weights.h5'):
    model.load_weights('model_weights.h5')

earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')

# Ceiling division: one step per full batch plus one for a partial batch,
# without the extra step that int(n / b) + 1 adds when n is a multiple of b.
steps_per_epoch = -(-len(predictors) // BATCH_SIZE)
model.fit_generator(generator(predictors, label, BATCH_SIZE), steps_per_epoch=steps_per_epoch, epochs=50, verbose=1, callbacks=[earlystop])

39
(697219, 38) (697219,)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 38, 10)            442800    
_________________________________________________________________
lstm_3 (LSTM)                (None, 38, 150)           96600     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               100400    
_________________________________________________________________
dense_2 (Dense)              (None, 44280)             4472280   
Total params: 5,112,080
Trainable params: 5,112,080
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoc

<keras.callbacks.callbacks.History at 0x14cb6a5624a8>

In [23]:
# Sample 30 words from the trained model, starting from a short seed.
print(
    generate_text(
        seed_text="oh man",
        next_words=30,
        max_sequence_len=max_sequence_len,
        model=model,
    )
)

oh man i hate cheesecake i don't know what i don't know what i was a dancer and i have a 300 store and i have a crush for adding in the


In [20]:
# Checkpoint the trained weights to the same path the training cell passes to
# model.load_weights().
model.save_weights('model_weights.h5')

In [21]:
# Store the network topology as JSON next to the weights checkpoint.
with open('model_architecture.json', 'w') as architecture_file:
    architecture_json = model.to_json()
    architecture_file.write(architecture_json)