# Introduction
#### she changlue
24th May 2017

This project uses an LSTM model to handle text classification problems.


This notebook proceeds as follows:
1. load the libraries and the raw corpus data
2. cut the corpus into a list format
3. encode the tokens and the corpus
4. construct the model and train it
5. use k-means to cluster the tokens and the documents
6. use t-SNE for visualization
7. save the outcomes

### 1)   load library and raw corpus data

In [1]:
import collections
import io
import os
import pickle
import random
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import requests
import tensorflow as tf

In [2]:
# Open the TensorFlow session used by the rest of the notebook.
sess = tf.Session()
# NOTE(review): the original bound the session to `parameterssess`, which looks
# like a heading word fused onto `sess`. The old name is kept as an alias so any
# cell that still refers to it keeps working.
parameterssess = sess

In [5]:
# Set RNN Parameters

# Vocabulary threshold (presumably words rarer than this are dropped -- confirm
# against the vocabulary-building cell).
min_word_freq = 5

# Network width; the embedding is deliberately tied to the same size.
rnn_size = 128
embedding_size = rnn_size

# Optimisation settings.
epochs, batch_size = 10, 100
learning_rate = 0.001
training_seq_len = 50

# Checkpoint / evaluation cadence.
save_every, eval_every = 500, 50

# Seed phrases used when sampling text from the model.
prime_texts = [
    'thou art more',
    'to be or not to',
    'wherefore art thou',
]

In [6]:
# Where the cached corpus and the saved model live.
data_dir = 'temp'
data_file = 'shakespeare.txt'
model_path = 'shakespeare_model'
full_model_dir = os.path.join(data_dir, model_path)

# Punctuation to remove: every ASCII punctuation mark except the hyphen and the
# apostrophe, which are kept.
punctuation = ''.join(mark for mark in string.punctuation if mark not in "-'")

In [8]:
# Make sure the model directory and the data directory exist before any file
# is read or written (checked in that order).
for required_dir in (full_model_dir, data_dir):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)
print('Loading Shakespeare Data')

Loading Shakespeare Data


In [9]:
# Load the corpus into `s_text`: download it once and cache it under data_dir,
# otherwise read the cached copy.
corpus_path = os.path.join(data_dir, data_file)

# Check if file is downloaded.
if not os.path.isfile(corpus_path):
    print('Not found, downloading Shakespeare texts from www.gutenberg.org')
    shakespeare_url = 'http://www.gutenberg.org/cache/epub/100/pg100.txt'
    # Get Shakespeare text. The timeout stops a stalled connection from
    # hanging the notebook forever.
    response = requests.get(shakespeare_url, timeout=60)
    # Fail loudly on an HTTP error instead of caching an error page as the corpus.
    response.raise_for_status()
    shakespeare_file = response.content
    # Decode binary into string
    s_text = shakespeare_file.decode('utf-8')
    # Drop first few descriptive paragraphs.
    # NOTE(review): 7675 is a fixed character offset; it presumably matches the
    # header length of the file as it was when this was written -- verify if
    # the upstream file changes.
    s_text = s_text[7675:]
    # Replace line breaks with a space. Deleting them outright would glue the
    # last word of a line onto the first word of the next one; the extra
    # spaces are collapsed by the whitespace clean-up in the next cell.
    s_text = s_text.replace('\r\n', ' ')
    s_text = s_text.replace('\n', ' ')
    # Write to file, using the same encoding the text was decoded with so the
    # cache does not depend on the platform default.
    with io.open(corpus_path, 'w', encoding='utf-8') as out_conn:
        out_conn.write(s_text)
else:
    # If file has been saved, load from that file
    with io.open(corpus_path, 'r', encoding='utf-8') as file_conn:
        s_text = file_conn.read().replace('\n', ' ')

Not found, downloading Shakespeare texts from www.gutenberg.org


In [10]:
# Replace every character listed in `punctuation` (defined in an earlier cell)
# with a space. re.escape keeps characters such as '\', ']' and '^' literal
# inside the character class; without it the '\]' pair is read as an escaped
# bracket and the backslash itself is never removed.
s_text = re.sub(r'[{}]'.format(re.escape(punctuation)), ' ', s_text)
# Collapse runs of whitespace into single spaces, trim both ends, lower-case.
# The raw string avoids the invalid '\s' escape in a normal string literal.
s_text = re.sub(r'\s+', ' ', s_text).strip().lower()

In [11]:
class LSTM_Model():
    """LSTM sequence model over vocabulary indices, built with the TensorFlow graph API.

    The constructor builds the whole graph: an embedding lookup, a single
    BasicLSTMCell unrolled through ``rnn_decoder``, a softmax output layer, a
    sequence loss against ``y_output`` and an Adam training op with clipped
    gradients. ``sample`` generates text word by word from a model built with
    ``infer=True``.

    NOTE(review): several calls (``tf.split(1, ...)``, ``tf.concat(1, ...)``,
    ``tf.nn.seq2seq``) use the argument order / module layout of pre-1.0
    TensorFlow -- confirm against the installed version.
    """

    def __init__(self, rnn_size, batch_size, learning_rate, training_seq_len, vocab_size, infer=False):
        """Build the model graph.

        Args:
            rnn_size: number of units of the LSTM cell; also used as the
                embedding width.
            batch_size: rows per batch (forced to 1 when ``infer`` is true).
            learning_rate: learning rate handed to the Adam optimizer.
            training_seq_len: time steps per row (forced to 1 when ``infer``
                is true).
            vocab_size: number of rows of the embedding matrix and width of
                the output layer.
            infer: when true, build the one-step generation variant whose
                decoder feeds each prediction back in as the next input.
        """
        self.rnn_size = rnn_size
        self.vocab_size = vocab_size
        self.infer = infer
        self.learning_rate = learning_rate

        # The generation graph consumes exactly one word per run.
        if infer:
            self.batch_size = 1
            self.training_seq_len = 1
        else:
            self.batch_size = batch_size
            self.training_seq_len = training_seq_len

        self.lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
        self.initial_state = self.lstm_cell.zero_state(self.batch_size, tf.float32)
        self.x_data = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_len])
        self.y_output = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_len])

        with tf.variable_scope('lstm_vars'):
            # Softmax Output Weights
            W = tf.get_variable('W', [self.rnn_size, self.vocab_size],
                                tf.float32, tf.random_normal_initializer())
            b = tf.get_variable('b', [self.vocab_size],
                                tf.float32, tf.constant_initializer(0.0))

        # Define Embedding
        # NOTE(review): the embedding is created outside the 'lstm_vars' scope.
        # Left as-is because moving it would change the variable's name --
        # confirm whether that placement is intended.
        embedding_mat = tf.get_variable('embedding_mat', [self.vocab_size, self.rnn_size],
                                        tf.float32, tf.random_normal_initializer())
        embedding_output = tf.nn.embedding_lookup(embedding_mat, self.x_data)
        # One tensor per time step, with the length-1 time axis squeezed out.
        rnn_inputs = tf.split(1, self.training_seq_len, embedding_output)
        rnn_inputs_trimmed = [tf.squeeze(x, [1]) for x in rnn_inputs]

        # If we are inferring (generating text), we add a 'loop' function
        # Define how to get the i+1 th input from the i th output
        def inferred_loop(prev, count):
            prev_transformed = tf.matmul(prev, W) + b
            prev_symbol = tf.stop_gradient(tf.argmax(prev_transformed, 1))
            output = tf.nn.embedding_lookup(embedding_mat, prev_symbol)
            return output

        decoder = tf.nn.seq2seq.rnn_decoder
        outputs, last_state = decoder(rnn_inputs_trimmed,
                                      self.initial_state,
                                      self.lstm_cell,
                                      loop_function=inferred_loop if infer else None)
        # Non inferred outputs
        output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
        # Logits and output
        self.logit_output = tf.matmul(output, W) + b
        self.model_output = tf.nn.softmax(self.logit_output)

        loss_fun = tf.nn.seq2seq.sequence_loss_by_example
        # NOTE(review): vocab_size is passed as the fourth positional argument
        # exactly as in the original; what that slot means depends on the
        # TensorFlow version -- confirm.
        loss = loss_fun([self.logit_output],
                        [tf.reshape(self.y_output, [-1])],
                        [tf.ones([self.batch_size * self.training_seq_len])],
                        self.vocab_size)

        self.cost = tf.reduce_sum(loss) / (self.batch_size * self.training_seq_len)
        self.final_state = last_state
        # Clip the global gradient norm at 4.5 before applying the update.
        gradients, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tf.trainable_variables()), 4.5)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(gradients, tf.trainable_variables()))

    def sample(self, sess, words=None, vocab=None, num=10, prime_text='thou art'):
        """Generate up to ``num`` words that continue ``prime_text``.

        Intended for an instance whose placeholders take a single word per
        run (a 1x1 array is fed each step), i.e. one built with ``infer=True``.

        Args:
            sess: session used to run the graph.
            words: mapping from vocabulary index to word. Defaults to the
                notebook-level ``ix2vocab``, looked up when the method is
                called.
            vocab: mapping from word to vocabulary index. Defaults to the
                notebook-level ``vocab2ix``, looked up when the method is
                called.
            num: maximum number of words to append.
            prime_text: whitespace-separated seed text; every word must be a
                key of ``vocab``.

        Returns:
            ``prime_text`` followed by the generated words, space separated.
            Generation stops early when index 0 is predicted (presumably the
            unknown/rare-word id -- confirm against the vocabulary builder).

        Raises:
            ValueError: if ``prime_text`` contains no words.
            KeyError: if a word is missing from ``vocab``.
        """
        # Resolve the default lookup tables lazily so the class can be defined
        # before the vocabulary exists.
        if words is None:
            words = ix2vocab
        if vocab is None:
            vocab = vocab2ix

        word_list = prime_text.split()
        if not word_list:
            raise ValueError('prime_text must contain at least one word')

        state = sess.run(self.lstm_cell.zero_state(1, tf.float32))

        # Warm up the recurrent state on every seed word except the last one.
        for word in word_list[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[word]
            feed_dict = {self.x_data: x, self.initial_state: state}
            [state] = sess.run([self.final_state], feed_dict=feed_dict)

        # These must be set outside the warm-up loop: a one-word seed never
        # enters it.
        out_sentence = prime_text
        word = word_list[-1]

        for _ in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[word]
            feed_dict = {self.x_data: x, self.initial_state: state}
            [model_output, state] = sess.run([self.model_output, self.final_state],
                                             feed_dict=feed_dict)
            # Greedy choice: take the most probable vocabulary index.
            next_ix = np.argmax(model_output[0])
            if next_ix == 0:
                break
            word = words[next_ix]
            out_sentence = out_sentence + ' ' + word
        return out_sentence