# Introduction
#### she changlue
20th April 2017

This project uses an LSTM model to handle text classification problems.


This notebook proceeds as follows:
1. load library and raw corpus data
2. cut the corpus into a list format
3. encode the tokens and corpus
4. construct model and train
5. use k-means to cluster the tokens and the documents
6. use t-SNE for visualization
7. save the outcomes

### 1)   load library and raw corpus data

In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import matplotlib

from sklearn.cluster import KMeans#cluster
from random import shuffle
import seaborn as sns
# Global plot styling applied to every figure drawn later in the notebook.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context(
    "notebook",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5},
)

from tensorflow.contrib.layers import fully_connected

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import jieba.posseg as pseg # cut the documents with token and tags
import jieba
import tensorflow as tf

from sklearn.manifold import TSNE#cluster
# Any results you write to the current directory are saved as output.

In [2]:
from nltk.corpus import brown

# Materialise the Brown corpus sentences (no category filter) into a plain
# list; later cells iterate over it several times.
tokenCorpus = list(brown.sents(categories=None))

# 1) encode the tokens and corpus

In [3]:
# Frequency threshold: only tokens seen more often than this get their own code.
HP_miniTokenFreq = 50

# Vocabulary bookkeeping, filled in by the encoding step.
tokenCount = {}                 # token -> number of occurrences in the corpus
token2code = {}                 # token -> integer code
code2token = ['inFreqTokens']   # code -> token; slot 0 is the shared rare-token bucket
code = 1                        # next unused code (0 is taken by the bucket above)

##### convert tokens into codes

In [4]:
# Rebuild the vocabulary tables from scratch so that re-running this cell is
# idempotent. Without the reset, a second run would double every frequency
# and append duplicate entries to code2token (inflating the vocabulary size).
tokenCount = dict()            # token -> number of occurrences
token2code = dict()            # token -> integer code
code2token = ['inFreqTokens']  # code -> token; code 0 is the rare-token bucket
code       = 1                 # next free code

# count how often every token occurs in the corpus
for tokens in tokenCorpus:
    for token in tokens:
        tokenCount[token] = tokenCount.get(token, 0) + 1

# tokens seen more than HP_miniTokenFreq times get their own code,
# every other token is mapped to the shared code 0
for token, freq in tokenCount.items():
    if freq > HP_miniTokenFreq:
        token2code[token] = code
        code2token.append(token)
        code += 1
    else:
        token2code[token] = 0

# translate the raw token corpus into the encoded corpus
codeCorpus = [[token2code[token] for token in tokens] for tokens in tokenCorpus]

# 2) construct model and train

In [5]:
# Corpus statistics reused by the model cells: number of encoded documents
# and vocabulary size (including the rare-token bucket at code 0).
tokenNums = len(code2token)
documNums = len(codeCorpus)
for label, count in (("the training documents num:", documNums),
                     ("the training tokens    num:", tokenNums)):
    print(label, count)

the training documents num: 57340
the training tokens    num: 2241


##### variable-length code lists to zero-padded numpy arrays

In [6]:
# Every document is truncated / zero-padded to this many time steps.
HP_max_steps = 30

# Inputs: tokens[0 .. len-2]; the last token is never fed because it has no successor.
tokenMat   = np.array([[tokens[idx] if idx < len(tokens)-1 else 0 for idx in range(HP_max_steps)]
                       for tokens in codeCorpus], dtype=np.int32)
# Targets: the inputs shifted by one, i.e. tokens[1 .. len-1].
labelMat   = np.array([[tokens[idx] if idx < len(tokens) else 0 for idx in range(1, HP_max_steps+1)]
                       for tokens in codeCorpus], dtype=np.int32)
# Number of real (non-padding) input steps per document. A document with n
# tokens yields n-1 (input, target) pairs, and the matrices above never hold
# more than HP_max_steps columns, so the length is clamped to [0, HP_max_steps].
seq_length = np.array([min(max(len(tokens)-1, 0), HP_max_steps) for tokens in codeCorpus],
                      dtype=np.int32)
# Row indices of the matrices, later shuffled and split into train / test.
all_index  = np.arange(len(tokenMat))

In [7]:
# Seeded permutation of the row indices so the train / test split is the same
# on every run. Rebinding (instead of shuffling in place) keeps the cell
# idempotent: re-running it yields the identical order.
HP_shuffle_seed = 0
all_index = np.random.RandomState(HP_shuffle_seed).permutation(len(all_index))

In [8]:
# 70 / 30 split over the (already shuffled) row indices.
training_num = int(len(tokenMat)*0.7)
train_index, test_index = all_index[:training_num], all_index[training_num:]

# Select inputs, targets and sequence lengths for each side of the split.
train_x, test_x = tokenMat[train_index], tokenMat[test_index]
train_y, test_y = labelMat[train_index], labelMat[test_index]
train_seq_length, test_seq_length = seq_length[train_index], seq_length[test_index]

##### construct the graph

In [9]:
#hyperparameters
HP_embed_size  = 128
HP_n_neurons   = 100
learning_rate  = 0.01
n_outputs      = tokenNums

In [10]:
# Graph inputs and the embedding table.
# tokenEmbed is a trainable [tokenNums, HP_embed_size] matrix, one row per
# token code, initialised with uniform random values between -1.0 and 1.0.
tokenEmbed     = tf.Variable(tf.random_uniform([tokenNums,HP_embed_size], -1.0, 1.0))
# Batch of token codes, one row of HP_max_steps codes per document.
token_code_ph  = tf.placeholder(tf.int32, [None,HP_max_steps])
# One length per document, passed to dynamic_rnn as sequence_length.
seq_length_ph  = tf.placeholder(tf.int32, [None])
# Target token codes, same layout as token_code_ph.
y_ph           = tf.placeholder(tf.int32, [None,HP_max_steps])

# Replace every token code by its embedding row.
token_embed    = tf.nn.embedding_lookup(tokenEmbed, token_code_ph)

In [11]:
# Display the embedded-input tensor to check its static shape and dtype.
token_embed

<tf.Tensor 'embedding_lookup:0' shape=(?, 30, 128) dtype=float32>

In [12]:
# Forward pass: LSTM over the embedded tokens, then one dense layer that maps
# every time step's LSTM output to n_outputs logits.
with tf.name_scope("lstm"):   
    basic_cell = tf.contrib.rnn.BasicLSTMCell(num_units=HP_n_neurons)
    # sequence_length tells dynamic_rnn how many steps of each row to process.
    outputs, states      = tf.nn.dynamic_rnn(basic_cell, token_embed, dtype=tf.float32,sequence_length=seq_length_ph) 
    # Flatten to rows of HP_n_neurons so the same dense layer is applied to every step.
    stacked_lstm_outputs = tf.reshape(outputs, [-1, HP_n_neurons])
    # activation_fn=None: the layer emits raw logits.
    stacked_outputs      = fully_connected(stacked_lstm_outputs, n_outputs,activation_fn=None)
    # Restore the per-document layout: [-1, HP_max_steps, n_outputs].
    logits = tf.reshape(stacked_outputs, [-1, HP_max_steps, n_outputs])

In [13]:
with tf.name_scope("loss"):
    # Cross-entropy for every (document, step) position.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_ph, logits=logits)
    # Padded positions carry label 0, which is also the code of the rare-token
    # bucket. Averaging over them would dilute the loss and reward predicting
    # code 0, so only the first seq_length_ph steps of each row are counted.
    step_mask = tf.sequence_mask(seq_length_ph, maxlen=HP_max_steps, dtype=tf.float32)
    # Mean over the unmasked positions; the maximum() guards against a batch
    # whose sequence lengths are all zero.
    loss = tf.reduce_sum(xentropy * step_mask) / tf.maximum(tf.reduce_sum(step_mask), 1.0)

In [14]:
# Optimisation step: Adam with the configured learning rate, minimising `loss`.
with tf.name_scope("training"):   
    optimizer   = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # Running training_op performs one parameter update.
    training_op = optimizer.minimize(loss)

In [None]:
# Op that initialises all graph variables, and a Saver used to write the
# checkpoint once training has finished.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

#### training the tensorflow graph

In [None]:
# Training configuration.
n_epochs     = 1000
batch_size   = 100
report_every = 1    # evaluate and print the losses every this many epochs
# Batches are sliced from the training split, so the batch count must be
# derived from its size (ceil division). Using the size of the whole corpus
# would produce empty batches once the training rows are exhausted.
batch_num    = (len(train_x) + batch_size - 1)//batch_size

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch in range(batch_num):
            batch_rows       = slice(batch*batch_size, (batch+1)*batch_size)
            train_x_batch    = train_x[batch_rows]
            seq_length_batch = train_seq_length[batch_rows]
            train_y_batch    = train_y[batch_rows]

            # one optimisation step on the current mini-batch
            sess.run(training_op, feed_dict={token_code_ph  : train_x_batch,
                                             seq_length_ph  : seq_length_batch,
                                             y_ph           : train_y_batch})
        if epoch % report_every == 0:
            # Despite their names, train_acc / test_acc hold the value of
            # `loss` (lower is better), not an accuracy.
            # NOTE(review): each evaluation feeds the whole split in a single
            # run; this may need batching if memory becomes a problem.
            train_acc = loss.eval(feed_dict={ token_code_ph : train_x,
                                              seq_length_ph : train_seq_length,
                                              y_ph          : train_y})

            test_acc  = loss.eval(feed_dict={ token_code_ph : test_x,
                                              seq_length_ph : test_seq_length,
                                              y_ph          : test_y})
            print(train_acc,test_acc)
    # Make sure the checkpoint directory exists before saving into it.
    tf.gfile.MakeDirs("tfsave")
    save_path = saver.save(sess,"tfsave/topic_model.ckpt")
 