# Introduction
#### she changlue
2nd May 2017

This project uses a word2vec model to train word embeddings.


This notebook proceeds as follows:
1. load the libraries and the raw corpus data
2. cut the corpus into lists of tokens
3. encode the tokens and the corpus
4. construct the model and train it
5. use k-means to cluster the tokens
6. use t-SNE for visualization
7. save the outcomes

### 1)   load library and raw corpus data

In [45]:
% matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import matplotlib
import pickle
from sklearn.cluster import KMeans#cluster
from random import shuffle
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

from tensorflow.contrib.layers import fully_connected

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import jieba.posseg as pseg # cut the documents with token and tags
import jieba
import tensorflow as tf

from sklearn.manifold import TSNE#cluster

 
# Any results you write to the current directory are saved as output.

In [12]:
# Load the user dictionary into jieba before any text is segmented.
jieba.load_userdict('corpus/custormTokens.txt')  
# Read only the first 10,000 data rows of the CSV; row 0 of the file is the header.
rawdata = pd.read_csv('corpus/opr_rem.csv', header=0,nrows=10000)

#### corpus briefing

In [13]:
# Preview the first rows of the loaded frame.
rawdata.head()

Unnamed: 0,opr_rem
0,今天下午5点472
1,\t告知客户电话15226088432
2,说是叔侄关系 转告
3,客户承诺今天下午五点存入184
4,已提醒征信影响。已告知从今天凌晨开始又会多增加75元的滞纳金。客户敷衍明天去解决


In [14]:
# (rows, columns) of the loaded frame.
rawdata.shape

(10000, 1)

In [15]:
# Summary statistics of the loaded frame.
rawdata.describe()

Unnamed: 0,opr_rem
count,10000
unique,10000
top,客户主动来电称要还全款，告知要打客服咨询，告知了客服热线
freq,1


### 2) cut the corpus into a list format

In [16]:
tokenCorpus  = []# one list of segmented tokens per kept document
rawSentences = []# raw text of each kept document, parallel to tokenCorpus
documents    = list(rawdata['opr_rem'])# every value of the 'opr_rem' column, in row order

In [17]:
# Construct the token corpus from the raw documents.
# Both result lists are reset first, so re-running this cell does not append
# every document a second time.
tokenCorpus  = []
rawSentences = []
# Part-of-speech flags whose words are kept; built once instead of per token.
# NOTE(review): 'vs' and 'nv' may come from the user dictionary -- confirm they
# are not a typo for another tag such as 'vn'.
keptFlags = frozenset(['n','ns','vs','nv','v'])
for sentence in documents:
    # Skip non-text values (e.g. NaN for an empty CSV field) and very short
    # remarks; only real strings can be cleaned and segmented below.
    if not isinstance(sentence, str) or len(sentence) <= 4:
        continue
    sentence = sentence.replace('\t','')
    words = [pair.word for pair in pseg.lcut(sentence) if pair.flag in keptFlags]
    # A document needs at least two kept words to be retained.
    if len(words)>1:
        tokenCorpus.append(words)
        rawSentences.append(sentence)

In [21]:
# First three tokenized documents.
tokenCorpus[:3]

[['告知', '客户', '电话'], ['说', '是', '叔侄', '关系', '转告'], ['客户', '承诺', '存入']]

In [20]:
# First five kept raw sentences.
rawSentences[:5]

['告知客户电话15226088432',
 '说是叔侄关系  转告 ',
 '客户承诺今天下午五点存入184',
 '已提醒征信影响。已告知从今天凌晨开始又会多增加75元的滞纳金。客户敷衍明天去解决',
 '13606375572销售来电扣款 ']

### 3) encode the tokens and corpus

In [62]:
HP_miniTokenFreq = 3 # frequency threshold; the encoding step keeps tokens counted strictly more often than this
tokenCount = dict()  # token -> number of occurrences in the corpus
token2code = dict()  # token -> integer code
code2token = ['inFreqTokens'] # code -> token; index 0 is the placeholder for tokens that get no code of their own
code       = 1       # next code to assign (0 is taken by the placeholder)

##### convert tokens into codes

In [63]:
# Count token frequencies, assign integer codes to the frequent tokens and
# encode the whole corpus.
# Every structure is rebuilt from scratch inside this cell, so re-running it
# neither double-counts frequencies nor appends duplicate vocabulary entries.

# token -> number of occurrences
tokenCount = dict()
for tokens in tokenCorpus:
    for token in tokens:
        tokenCount[token] = tokenCount.get(token, 0) + 1

# Code 0 is reserved for the 'inFreqTokens' placeholder; real codes start at 1.
token2code = dict()
code2token = ['inFreqTokens']
code       = 1
for token, freq in tokenCount.items():
    # Strictly greater than the threshold: a token seen exactly
    # HP_miniTokenFreq times is NOT given its own code.
    if freq > HP_miniTokenFreq:
        token2code[token] = code
        code2token.append(token)
        code += 1

# Replace every token by its code; tokens without a code map to 0.
codeCorpus = [[token2code.get(token,0) for token in tokens] for tokens in tokenCorpus]

###### save the dict

In [65]:
# Persist the token -> code mapping with pickle.
# open() does not create directories, so 'save/wordEmbed/' must already exist.
with open('save/wordEmbed/word2codeDict.pkl', 'wb') as f:  
    pickle.dump(token2code, f)   

### 4) construct model and train

In [71]:
documNums = len(codeCorpus) # number of encoded documents
tokenNums = len(code2token) # vocabulary size, counting the placeholder entry at index 0
print("the training documents num:",documNums)
print("the training tokens    num:",tokenNums)

the training documents num: 8094
the training tokens    num: 509


##### ragged token lists to a padded NumPy array

In [13]:
HP_max_steps = 30  # every document is truncated / zero-padded to this many token codes

# Zero-padded (documents x HP_max_steps) matrix of token codes.
tokenMat     = np.zeros((len(codeCorpus), HP_max_steps), dtype=np.int32)
for row, tokens in enumerate(codeCorpus):
    kept = tokens[:HP_max_steps]
    tokenMat[row, :len(kept)] = kept

# NOTE(review): `labels` is not created anywhere in the visible notebook; it
# must already exist in the kernel. The "-1" shift is applied on every run of
# this cell, so run it exactly once per freshly loaded `labels`.
# Presumably the raw labels are 1-based class ids -- TODO confirm.
labels       = np.array(labels,dtype=np.int32)-1
assert len(labels) == len(tokenMat), "labels must hold exactly one entry per encoded document"

# Lengths are clipped to HP_max_steps so that they never exceed the number of
# columns actually present in tokenMat.
seq_length   = np.array([min(len(tokens), HP_max_steps) for tokens in codeCorpus],dtype=np.int32)
all_index    = np.arange(len(tokenMat))

In [14]:
# Draw the shuffled document order from a dedicated, seeded NumPy generator so
# the train/test split is identical on every run. The permutation is rebuilt
# from scratch (rather than shuffled in place), so re-running this cell gives
# the same order instead of shuffling an already shuffled array.
all_index = np.random.RandomState(42).permutation(len(tokenMat))

In [15]:
# Split the index order 70 / 30: the leading part is used for training,
# the remainder for testing.
training_num = int(len(tokenMat)*0.7)
train_index, test_index = all_index[:training_num], all_index[training_num:]

# Slice every array with the same two index sets so rows stay aligned.
train_x, test_x = tokenMat[train_index], tokenMat[test_index]
train_y, test_y = labels[train_index], labels[test_index]
train_seq_length, test_seq_length = seq_length[train_index], seq_length[test_index]

##### construct the graph

In [16]:
#hyperparameters
HP_embed_size  = 100  # width of each row of the embedding matrix
HP_n_neurons   = 100  # number of units in the LSTM cell
learning_rate  = 0.1  # handed to the first AdamOptimizer; a later cell rebinds this name to 0.0001
n_outputs      = 7    # width of the logits layer

In [17]:
# Trainable embedding matrix plus the placeholders that are fed at run time.
# tokenEmbed: (tokenNums, HP_embed_size) variable, initialised uniformly in [-1, 1).
tokenEmbed     = tf.Variable(tf.random_uniform([tokenNums,HP_embed_size], -1.0, 1.0))
# token_code_ph: integer token codes, (batch, HP_max_steps).
token_code_ph  = tf.placeholder(tf.int32, [None,HP_max_steps])
# seq_length_ph: one length per document, passed to dynamic_rnn as sequence_length.
seq_length_ph  = tf.placeholder(tf.int32, [None])
# token_embed: the embedding row looked up for every fed token code.
token_embed    = tf.nn.embedding_lookup(tokenEmbed, token_code_ph)
# y_ph: one integer class label per document.
y_ph           = tf.placeholder(tf.int32, [None])

In [18]:
# Forward pass: run an LSTM over the embedded tokens and map its final hidden
# state to n_outputs logits.
with tf.name_scope("lstm"):   
    basic_cell = tf.contrib.rnn.BasicLSTMCell(num_units=HP_n_neurons)
    # sequence_length tells dynamic_rnn how many steps of each row are real data.
    outputs, states = tf.nn.dynamic_rnn(basic_cell, token_embed, dtype=tf.float32,sequence_length=seq_length_ph)   
    # Linear layer (no activation) on the final hidden state states.h.
    logits = fully_connected(states.h, n_outputs, activation_fn=None)

In [19]:
# Loss: softmax cross-entropy against the integer labels, averaged over the batch.
with tf.name_scope("loss"):   
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_ph, logits=logits)
    loss = tf.reduce_mean(xentropy)

In [20]:
# Adam optimizer minimising `loss`, using the value of `learning_rate` at the
# time this cell runs.
# NOTE(review): a later cell (name scope "train") rebinds both `optimizer` and
# `training_op` with learning_rate = 0.0001, so the op created here is not the
# one executed by the training loop; it only adds unused nodes to the graph.
with tf.name_scope("training"):   
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [21]:
# Accuracy: a prediction counts as correct when the true label is the top-1 logit.
with tf.name_scope("eval"):   
    correct = tf.nn.in_top_k(logits, y_ph, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [22]:
# Rebind the learning rate and build a second Adam optimizer with it.
# `optimizer` and `training_op` now refer to the objects created here, which
# replaces the ones built earlier under the "training" name scope.
learning_rate = 0.0001
with tf.name_scope("train"):
    optimizer   = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [None]:
# Op that initialises every global variable, and a Saver for writing checkpoints.
# Both are created after the graph is complete so they cover all variables defined above.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

#### training the tensorflow graph

In [None]:
#start training
n_epochs   = 1000
batch_size = 100
# Mini-batches per epoch, derived from the TRAINING set (ceiling division).
# Using the size of the whole corpus here would make the trailing batches of
# every epoch empty slices, because train_x only holds the training share.
batch_num  = (len(train_x) + batch_size - 1) // batch_size

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch in range(batch_num):
            # Row range of the current mini-batch; the last one may be shorter.
            start = batch*batch_size
            stop  = (batch+1)*batch_size
            train_x_batch    = train_x[start:stop]
            seq_length_batch = train_seq_length[start:stop]
            train_y_batch    = train_y[start:stop]

            #training
            sess.run(training_op, feed_dict={token_code_ph  : train_x_batch,
                                             seq_length_ph  : seq_length_batch,
                                             y_ph           : train_y_batch})
        # Report train / test accuracy on the full sets every 10th epoch.
        if epoch%10==0:
            train_acc = accuracy.eval(feed_dict={ token_code_ph : train_x,
                                                  seq_length_ph : train_seq_length,
                                                  y_ph          : train_y})

            test_acc  = accuracy.eval(feed_dict={ token_code_ph : test_x,
                                                  seq_length_ph : test_seq_length,
                                                  y_ph          : test_y})
            print(train_acc,test_acc)
    # Write the trained variables to a checkpoint once all epochs are done.
    save_path = saver.save(sess,"tfsave/topic_model.ckpt")
 

0.317782 0.332649
0.575704 0.593429
0.658451 0.648871
0.712148 0.669405
0.741197 0.683778
0.772007 0.694045
0.793134 0.706366
0.810739 0.708419
