In [2]:
import json

In [3]:
# Load the AskReddit data set. The encoding is stated explicitly because JSON
# text is UTF-8, while open() would otherwise fall back to the platform's
# default encoding and could fail (or mis-decode) on non-UTF-8 locales.
with open('data/askreddit.json', encoding='utf-8') as f:
    data = json.load(f)

Format
-----------

Each element of `data` is a dict with the following keys:

`{author_fullname, score, fullname, parent_id, body, is_submitter}`

In [4]:
# Show the first record to check that it has the keys listed above.
print(data[0])

{'author_fullname': 't2_5m13p', 'body': "The 1963 version of Shirley Jackson's creepy book The Haunting. No special effects, no CGI, no rubber monster, just good, scary writing and acting. All alone. In the night. In the dark...\n", 'fullname': 't1_e75tb7l', 'is_submitter': False, 'parent_id': 't3_9lcjo3', 'score': 3443}


In [11]:
# now let's tokenize the body part
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

import pandas as pd

In [6]:
# Shared tokenizer instance: dataset_preparation() fits it on the text it is
# given and then uses it to convert that text into integer token ids.
tokenizer = Tokenizer()

def dataset_preparation(data):
    """Turn a text into left-padded n-gram sequences of token ids.

    The text is lower-cased and split into lines. The module-level
    ``tokenizer`` is fitted on those lines (a side effect: its vocabulary is
    updated) and each line is converted to a list of token ids. For every
    line, each prefix of at least two tokens becomes one sequence, so the last
    id of a sequence can be used as the word to predict and the ids before it
    as the context.
    See n-grams: https://medium.com/@shivambansal36/language-modelling-text-generation-using-lstms-deep-learning-for-nlp-ed36b224b275

    Parameters
    ----------
    data : str
        The raw text; lines are separated by ``"\\n"``.

    Returns
    -------
    input_sequences : numpy.ndarray
        One row per n-gram sequence, padded at the front to the length of the
        longest sequence.
    total_words : int
        ``len(tokenizer.word_index) + 1``.

    Raises
    ------
    ValueError
        If no line contains at least two tokens, so no sequence can be built.
    """
    # Split the text into lines; sequences never span a line break.
    corpus = data.lower().split("\n")
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    # texts_to_sequences returns one list of token ids per input line.
    for token_list in tokenizer.texts_to_sequences(corpus):
        # Every prefix of length >= 2: predict a word from all words before it.
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i + 1])

    if not input_sequences:
        # Previously this case failed with an UnboundLocalError on
        # max_sequence_len; fail with an explicit message instead.
        raise ValueError(
            "dataset_preparation: no line contains at least two tokens, "
            "so no n-gram sequence can be built"
        )

    # Computed once after the loops (it used to be recomputed over the whole
    # list after every append, which made the function quadratic).
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_len,
                                             padding='pre'))
    return input_sequences, total_words

In [12]:
# Build the n-gram sequences from the body text of the first record.
input_seq, total_words = dataset_preparation(data[0]["body"])

# The last id of every sequence is the word to predict; the ids before it
# are the context it is predicted from.
predictors = input_seq[:, :-1]
label = input_seq[:, -1]
print(len(predictors), len(label))

# Example frame: column 0 holds the token id of the word to predict, the
# remaining columns hold the (front-padded) ids of the words that precede it
# on the same line.
df = pd.DataFrame(np.hstack((label.reshape(-1, 1), predictors)))

31 31


In [13]:
# Plain-text dump of the example frame (column 0 is the label, the remaining
# columns are the padded predictor ids).
print(df)

    0   1   2   3   4   5   6   7   8   9  ...  22  23  24  25  26  27  28  \
0    4   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   0   
1    5   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   0   
2    6   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   0   
3    7   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   1   
4    8   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   1   4   
5    9   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   1   4   5   
6   10   0   0   0   0   0   0   0   0   0 ...   0   0   0   1   4   5   6   
7    1   0   0   0   0   0   0   0   0   0 ...   0   0   1   4   5   6   7   
8   11   0   0   0   0   0   0   0   0   0 ...   0   1   4   5   6   7   8   
9    2   0   0   0   0   0   0   0   0   0 ...   1   4   5   6   7   8   9   
10  12   0   0   0   0   0   0   0   0   0 ...   4   5   6   7   8   9  10   
11  13   0   0   0   0   0   0   0   0   0 ...   5   6   7   8  

In [14]:
# The label must be one-hot encoded for training.
import keras.utils as k_utils
# Encode from the last column of input_seq (the integer label) instead of from
# `label` itself: the result is the same on the first run, and re-running this
# cell no longer one-hot encodes an array that is already one-hot encoded.
label = k_utils.to_categorical(input_seq[:, -1], num_classes=total_words)