## Importing data

In [1]:
import json
import os

# Filtering thresholds: comments shorter than 150 characters are too short for
# training, and only comments whose score is above 200 are kept.
MIN_COMMENT_LENGTH = 150
MIN_COMMENT_SCORE = 200

# JSON text is UTF-8; passing the encoding explicitly keeps the load from
# depending on the platform's default locale encoding.
with open("data/askreddit.json", encoding="utf-8") as json_data:
    data_raw = json.load(json_data)

# we create a list of comments, where each comment is stored as list of characters
data = [
    list(item["body"])
    for item in data_raw
    if len(item["body"]) >= MIN_COMMENT_LENGTH and item["score"] > MIN_COMMENT_SCORE
]

## Prepare data for neural network

In [2]:
# Vocabulary: every distinct character that occurs in any comment, sorted
characters = sorted({char for comment in data for char in comment})

# Lookup tables between a character and its integer id, where the id is the
# character's position in the sorted vocabulary
n_to_char = dict(enumerate(characters))
char_to_n = {char: n for n, char in n_to_char.items()}

In [5]:
import numpy as np
from keras.utils import np_utils

seq_length = 100
X = []
Y = []
# Each comment is used as a single piece of text: a window of seq_length
# characters slides over it one step at a time (windows never span two
# comments) and the character right after the window is the label.
for comment in data:
    # encode the comment once instead of re-encoding every overlapping window
    encoded = [char_to_n[char] for char in comment]
    for start in range(len(encoded) - seq_length):
        X.append(encoded[start:start + seq_length])
        Y.append(encoded[start + seq_length])

# lstm requires data in the form of (number_of_sequences, length_of_sequence, number_of_features)
X_modified = np.reshape(X, (len(X), seq_length, 1))
# scale the integer ids into [0, 1) by dividing by the vocabulary size
X_modified = X_modified / float(len(characters))
# One-hot encoding of the labels. The width is pinned to the vocabulary size:
# left to itself, to_categorical infers it from max(Y) + 1, which is narrower
# than the vocabulary when the highest-id character never occurs as a label,
# and it raises on an empty Y.
Y_modified = np_utils.to_categorical(Y, num_classes=len(characters))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
# Contiguous split into training, validation and test data (in that order)
valid_split = 0.2
test_split = 0.1
sample_size = X_modified.shape[0]

# index where the training part ends / where the validation part ends
train_end = int(sample_size * (1 - valid_split - test_split))
valid_end = int(sample_size * (1 - test_split))

X_train, Y_train = X_modified[:train_end], Y_modified[:train_end]
X_valid, Y_valid = X_modified[train_end:valid_end], Y_modified[train_end:valid_end]
X_test, Y_test = X_modified[valid_end:], Y_modified[valid_end:]

## The LSTM network with 2 hidden layers

In [7]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Character-sequence model: two stacked LSTM layers of 400 units, each followed
# by 20% dropout, and a softmax layer as wide as the one-hot labels.
timesteps, n_features = X_modified.shape[1], X_modified.shape[2]
n_classes = Y_modified.shape[1]

model = Sequential([
    # return_sequences=True hands the full output sequence to the next LSTM
    LSTM(400, input_shape=(timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(400),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Training the model for 100 epochs

In [None]:
# Create the output directory before training so that the save at the end
# cannot fail on a missing folder after all epochs have already been run.
os.makedirs('models', exist_ok=True)

# training the model for a fixed 100 epochs, reporting validation loss each epoch
model.fit(X_train, Y_train,
        batch_size = 200,
        epochs = 100,
        verbose = 2,
        validation_data = (X_valid, Y_valid),
        shuffle=True)
# persist the trained network so the generation cell can reload it from disk
model.save('models/char_based_initial.hdf5')

#### Create text with the network

In [9]:
model = load_model('models/char_based_initial.hdf5')
# Seed text: a 100-character piece of a reddit comment; the generated comment grows from it
full_text = list("Pineapples do not grow on palm trees. I always thought there were certain types of palm trees that a")
string_mapped = [char_to_n[c] for c in full_text]
for _ in range(300):
    # current window as a (1, window_length, 1) array, divided by the vocabulary size
    window = np.array(string_mapped).reshape(1, -1, 1) / float(len(characters))

    # greedy decoding: always take the most probable next character
    pred_index = np.argmax(model.predict(window, verbose=0))
    full_text.append(n_to_char[pred_index])

    # slide the window: drop the oldest id and add the newly predicted one
    string_mapped = string_mapped[1:] + [pred_index]

# the predicted comment
print(''.join(full_text))

Pineapples do not grow on palm trees. I always thought there were certain types of palm trees that a terily ileck.  Ho was the quietest I've ever seen him watching it. Following the movie he just says "Damn... I'll just stick with the zombies."

The way they capture the struggle of survival in the worst situation. The hopelessness and helplessness. How long it takes to die and the drive to survive


### Training the model with early stopping

In [None]:
# The checkpoint callback writes into models/ during training; create the
# directory first so the first checkpoint write cannot fail on a missing folder.
os.makedirs('models', exist_ok=True)

# NOTE(review): `model` is whatever is bound when this cell runs. On a
# top-to-bottom run that is the network loaded from
# models/char_based_initial.hdf5 by the preceding generation cell, so training
# continues from those weights - rebuild the model first if a fresh start is
# intended.

# early stopping with saving best model weights: training stops after 10 epochs
# without improvement, and the checkpoint file is only overwritten when the
# monitored quantity improves (both callbacks use their default monitor)
early_stopping = EarlyStopping(patience = 10, verbose = 1)
checkpointer = ModelCheckpoint(filepath = 'models/char_based_early_stopping.hdf5', save_best_only = True, verbose = 1)
# training the model; 1000 epochs is only an upper bound, early stopping ends it sooner
model.fit(X_train, Y_train,
        batch_size = 100,
        epochs = 1000,
        verbose = 2,
        callbacks=[checkpointer, early_stopping],
        validation_data = (X_valid, Y_valid),
        shuffle=True)

#### Create text with the network

In [10]:
model = load_model('models/char_based_early_stopping.hdf5')
# Seed text: a 100-character piece of a reddit comment; the generated comment grows from it
full_text = list("Pineapples do not grow on palm trees. I always thought there were certain types of palm trees that a")
string_mapped = [char_to_n[c] for c in full_text]
for _ in range(300):
    # current window as a (1, window_length, 1) array, divided by the vocabulary size
    window = np.array(string_mapped).reshape(1, -1, 1) / float(len(characters))

    # greedy decoding: always take the most probable next character
    pred_index = np.argmax(model.predict(window, verbose=0))
    full_text.append(n_to_char[pred_index])

    # slide the window: drop the oldest id and add the newly predicted one
    string_mapped = string_mapped[1:] + [pred_index]

# the predicted comment
print(''.join(full_text))

Pineapples do not grow on palm trees. I always thought there were certain types of palm trees that a coua th the mode oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the mole oe the
