# Learning and Generating text from Blog Data

## Load data, remove punctuations & tokenize

In [4]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load document
in_filename = 'blogdata_nonextline.txt'
doc = load_doc(in_filename)
print(doc[:200])

def clean_doc(doc):
  #print(doc)
  doc = doc.replace('.', ' ')
  print(doc[:200])
  tokens = doc.split(' ') 
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [word.lower() for word in tokens]
  return tokens
  
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


i took out reading from my list of hobbies a couple of years ago. i remember going to public library to borrow algorithms texts last year. the year before that i browsed the shelves for books on devel
i took out reading from my list of hobbies a couple of years ago  i remember going to public library to borrow algorithms texts last year  the year before that i browsed the shelves for books on devel
['i', 'took', 'out', 'reading', 'from', 'my', 'list', 'of', 'hobbies', 'a', 'couple', 'of', 'years', 'ago', 'i', 'remember', 'going', 'to', 'public', 'library', 'to', 'borrow', 'algorithms', 'texts', 'last', 'year', 'the', 'year', 'before', 'that', 'i', 'browsed', 'the', 'shelves', 'for', 'books', 'on', 'development', 'studies', 'and', 'sociology', 'this', 'year', 'ive', 'visited', 'only', 'on', 'behalf', 'of', 'my', 'mother', 'to', 'get', 'her', 'yoga', 'books', 'from', 'the', 'ground', 'floor', 'main', 'section', 'it', 'is', 'quite', 'probable', 'that', 'i', 'was', 'never', 'an', 'avid', '

#### Aand as you can see here, my vocabulary isn't extensive. 
A total of 5921 unique tokens of which at least 150-200 must be Malayalam (my mother tongue) words :D So much room to (and must) improve my writing.

## Generating sequences from the text data

Sequences of (20+1) words built from the entire text data. Since all punctuations have been removed, it has become a single piece of text, and sequences are generated from it. Sample sequences are displayed further below.

In [5]:
length = 20 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 32502


In [0]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [0]:
# save sequences to file

out_filename = 'blogdata_nonl_sequences.txt'
save_doc(sequences, out_filename)

### Sample Sequences

In [8]:
print(sequences[:10])

['i took out reading from my list of hobbies a couple of years ago i remember going to public library to', 'took out reading from my list of hobbies a couple of years ago i remember going to public library to borrow', 'out reading from my list of hobbies a couple of years ago i remember going to public library to borrow algorithms', 'reading from my list of hobbies a couple of years ago i remember going to public library to borrow algorithms texts', 'from my list of hobbies a couple of years ago i remember going to public library to borrow algorithms texts last', 'my list of hobbies a couple of years ago i remember going to public library to borrow algorithms texts last year', 'list of hobbies a couple of years ago i remember going to public library to borrow algorithms texts last year the', 'of hobbies a couple of years ago i remember going to public library to borrow algorithms texts last year the year', 'hobbies a couple of years ago i remember going to public library to borrow algo

#  Building the Language Model

In [0]:

from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding



# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# load
in_filename = 'blogdata_nonl_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [12]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]


# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())


# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model

model.fit(X, y, batch_size=128,epochs=200)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 50)            296100    
_________________________________________________________________
lstm_5 (LSTM)                (None, 20, 100)           60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 5922)              598122    
Total params: 1,045,122
Trainable params: 1,045,122
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 

<keras.callbacks.History at 0x7f9b104fc950>

In [0]:
# save the model to file
model.save('./model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))


# Loading Language Model to Generate Text

In [22]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# load cleaned text sequences
in_filename = 'blogdata_nonl_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

# select a seed text
# seed_text = lines[randint(0,len(lines))]
seed_text = "i have always wondered why i like to observe people "
print("SEED:"+seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 40)
print("GENERATED:"+generated)


SEED:i have always wondered why i like to observe people 

GENERATED:i just meant stunned full adorable theyre smug at the window directly facing my villagers from the chain library there couldnt start their burn all at the morning trying in chinese corner at my laundry enthayi and beneath their backdrop


# TADA!