In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
     
# set seeds for reproducibility
import tensorflow as tf

from numpy.random import seed
tf.random.set_seed(2) 
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import json

In [3]:
# Parse the scraped threads from disk.
with open('combined_2000.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten every thread's 'comments' list into one flat list holding only the
# text stored under each entry's 'comment' key.
comms = [
    entry['comment']
    for thread in data
    for entry in thread['comments']
]

In [4]:
from cleaner_funcs import clean_list

In [5]:
# Pass the collected comment strings through clean_list (imported from
# cleaner_funcs; its cleaning rules are not visible in this notebook).
corpus = clean_list(comms)

In [6]:
# Module-level Tokenizer instance: get_sequence_of_tokens fits it and reads its
# word_index, so the fitted vocabulary stays available after that call.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand it into n-gram prefixes.

    Each text is converted to its list of token ids. A text with k tokens
    contributes its prefixes of length 2, 3, ..., k, in that order; texts with
    fewer than two tokens contribute nothing.

    Args:
        corpus: iterable of texts; it is traversed twice (fit, then convert).

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        the list of prefix lists and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    # Learn the vocabulary on the module-level tokenizer.
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Convert one text at a time, then emit every prefix of at least 2 tokens.
    prefixes = [
        token_ids[:end]
        for token_ids in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
        for end in range(2, len(token_ids) + 1)
    ]
    return prefixes, vocab_size

# Fit the tokenizer on the corpus and build the n-gram prefix sequences;
# total_words is the size of the tokenizer's word_index plus one.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
# Preview the first ten prefixes as a sanity check.
inp_sequences[:10]

[[395, 210],
 [395, 210, 2],
 [395, 210, 2, 68],
 [395, 210, 2, 68, 1272],
 [395, 210, 2, 68, 1272, 2749],
 [395, 210, 2, 68, 1272, 2749, 4],
 [395, 210, 2, 68, 1272, 2749, 4, 8530],
 [395, 210, 2, 68, 1272, 2749, 4, 8530, 3],
 [395, 210, 2, 68, 1272, 2749, 4, 8530, 3, 159],
 [395, 210, 2, 68, 1272, 2749, 4, 8530, 3, 159, 2]]

In [7]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into predictors and one-hot labels.

    All sequences are left-padded ('pre') to the length of the longest one.
    The last column of the padded matrix becomes the target token and the
    remaining columns become the model input.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot labels. When omitted, the
            module-level ``total_words`` is used, which keeps the original
            one-argument call working.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: the padded inputs
        without their last column, the one-hot encoded last column, and the
        padded sequence length (including the label position).

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array() copy.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the n-gram sequences and split them into model inputs (predictors),
# one-hot targets (label) and the padded sequence length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [8]:
from tensorflow.keras.layers import SimpleRNN


In [9]:
def create_model(max_sequence_len, total_words, embedding_dim=10,
                 lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word model: Embedding -> LSTM -> Dropout -> Dense.

    Args:
        max_sequence_len: padded sequence length including the label position.
        total_words: vocabulary size; used as the embedding input dimension and
            as the width of the softmax output layer.
        embedding_dim: size of each token embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout fraction applied to the LSTM output (default 0.1).

    Returns:
        A compiled Sequential model using categorical cross-entropy loss and
        the Adam optimizer.
    """
    # The last token of every padded sequence is split off as the label, so
    # the network sees one token fewer than max_sequence_len.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden recurrent layer followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network for the padded sequence length and vocabulary size, then
# print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1338, 10)          215630    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 21563)             2177863   
Total params: 2,437,893
Trainable params: 2,437,893
Non-trainable params: 0
_________________________________________________________________


In [10]:
from datetime import datetime

In [11]:
# Training is expensive (the recorded run took about 2.5 days), so it is
# guarded by the saved weights file instead of being commented out. With the
# fit call disabled, a fresh Run All would leave the model untrained and the
# save cell would then overwrite model.h5 with random weights.
# Delete model.h5 to force retraining, e.g. after the corpus or vocabulary
# changes, or if the file was written by a run that never trained.
start = datetime.now()
if os.path.exists("model.h5"):
    model.load_weights("model.h5")
    print("Loaded weights from model.h5; training skipped")
else:
    # verbose=2 prints one line per epoch. use_multiprocessing is omitted
    # because it only applies to generator / Sequence inputs, not to arrays.
    model.fit(predictors, label, epochs=5, verbose=2)
print(datetime.now() - start)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2 days, 12:10:56.279885


In [12]:
# Persist the model in two parts: the architecture as JSON and the weights as
# HDF5. The JSON file is written with an explicit encoding so the result does
# not depend on the platform default (matching how the input JSON is read).
model_json = model.to_json()
with open("model.json", "w", encoding="utf-8") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk
