# Introduction
#### she changlue
20th April 2017

This project is built to handle document topic classification.
I use a self-designed algorithm to efficiently embed the documents' topic vectors and the tokens' topic vectors. It has been shown empirically that this algorithm can classify the topics of different documents and tokens quickly.

This notebook proceeds as follows:
1. load the libraries and the raw corpus data
2. cut the corpus into a list format
3. encode the tokens and the corpus
4. construct the model and train it
5. use k-means to cluster the tokens and the documents
6. use t-SNE for visualization
7. save the outcomes

### 1)   load library and raw corpus data

In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import matplotlib
from random import shuffle
from sklearn.cluster import KMeans#cluster

import seaborn as sns
# Global plot styling applied to every figure drawn later in this notebook.
sns.set_style('darkgrid')
sns.set_palette('muted')
# "notebook" context with enlarged fonts and thicker lines.
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import jieba.posseg as pseg # cut the documents with token and tags
import jieba
import tensorflow as tf

from sklearn.manifold import TSNE#cluster

from subprocess import check_output
# Show which files are available in the local "corpus" directory by running
# the external `ls` command (so this line needs a system that provides `ls`).
print(check_output(["ls", "corpus"]).decode("utf8"))
# Any results you write to the current directory are saved as output.

custormTokens.txt
opr_rem.csv
催收sample.csv
马上消费金融_jiuhui.wu_2017-04-05.csv



In [3]:
# Register the custom token dictionary with jieba before any text is segmented.
jieba.load_userdict('corpus/custormTokens.txt')  
# Load the raw remarks; the first CSV row is used as the header.
rawdata = pd.read_csv('corpus/opr_rem.csv', header=0)

#### corpus briefing

In [4]:
# Preview the first rows of the raw corpus.
rawdata.head()

Unnamed: 0,opr_rem
0,今天下午5点472
1,\t告知客户电话15226088432
2,说是叔侄关系 转告
3,客户承诺今天下午五点存入184
4,已提醒征信影响。已告知从今天凌晨开始又会多增加75元的滞纳金。客户敷衍明天去解决


In [5]:
# (rows, columns) of the raw corpus.
rawdata.shape

(999988, 1)

In [6]:
# Summary statistics; a count below the row count indicates missing entries.
rawdata.describe()

Unnamed: 0,opr_rem
count,999986
unique,999986
top,称昨天有把父亲电话告诉，就是13187358666
freq,1


### 2) cut the corpus into a list format

In [7]:
# Raw remark texts, one entry per row of the 'opr_rem' column.
# Alternative source kept from an earlier dataset (customer-sent messages):
#documents    = list(rawdata[rawdata['消息目标']=='机器人']['消息内容'])
documents = rawdata['opr_rem'].tolist()

# Filled by the tokenisation step: the two lists stay index-aligned.
rawSentences = []  # cleaned raw text of every document that is kept
tokenCorpus = []   # list of kept tokens for the same document

In [8]:
# cstruct the corpus
# Build the tokenised corpus.
#
# For every document that is real text longer than 4 characters:
#   * strip tab characters,
#   * segment it with jieba's part-of-speech tagger,
#   * keep the words whose tag is in keptFlags,
#   * replace numerals (tag 'm') by a placeholder: 'NUMB' when the numeral is
#     11 characters long (presumably a mobile phone number - confirm),
#     'MON' otherwise,
#   * keep the document only if at least two tokens survive.
# tokenCorpus and rawSentences are appended to in lockstep, so they stay
# index-aligned.
keptFlags = {'t', 'n', 'ns', 'vs', 'nv', 'v'}
for sentence in documents:
    # Empty CSV cells are read by pandas as float NaN. Skip anything that is
    # not text explicitly instead of relying on len('nan') being short, and
    # avoid calling .replace on a non-string value.
    if not isinstance(sentence, str) or len(sentence) <= 4:
        continue
    sentence = sentence.replace('\t', '')
    tokens = []
    for pair in pseg.lcut(sentence):
        if pair.flag in keptFlags:
            tokens.append(pair.word)
        elif pair.flag == 'm':
            tokens.append('NUMB' if len(str(pair.word)) == 11 else 'MON')
    if len(tokens) > 1:
        tokenCorpus.append(tokens)
        rawSentences.append(sentence)

In [9]:
# Peek at the first tokenised documents.
tokenCorpus[:3]

[['MON', 'MON', 'MON'],
 ['告知', '客户', '电话', 'NUMB'],
 ['说', '是', '叔侄', '关系', '转告']]

In [10]:
# Peek at the first kept raw texts.
rawSentences[:5]

['今天下午5点472',
 '告知客户电话15226088432',
 '说是叔侄关系  转告 ',
 '客户承诺今天下午五点存入184',
 '已提醒征信影响。已告知从今天凌晨开始又会多增加75元的滞纳金。客户敷衍明天去解决']

### 3) encode the tokens and corpus

In [11]:
HP_miniTokenFreq = 3 # frequency threshold: a token needs MORE than this many occurrences to get its own code
tokenCount = dict()  # token -> number of occurrences in the whole corpus
token2code = dict()  # token -> integer code (0 for every infrequent token)
code2token = ['IFRQ'] # code -> token; slot 0 is the shared placeholder for infrequent tokens
code       = 1       # next unused code (0 is reserved, see code2token)

##### transfer token into codes

In [12]:
#get token frequency
for tokens in tokenCorpus:
    for token in tokens:
        tokenCount.setdefault(token,0)
        tokenCount[token]+=1  
#encode those tokens which have minumal frequency
for token in tokenCount:
    if tokenCount[token] > HP_miniTokenFreq:       
        token2code[token] = code
        code += 1
        code2token.append(token)
    else:
        token2code[token] = 0
#transfer the raw token corpus into encoded corpus
codeCorpus = []
for tokens in tokenCorpus:          
    codeCorpus.append([token2code[token]for token in tokens])

### 4) construct model and train

In [None]:
# Model dimensions derived from the encoded corpus.
HP_topicNums = 8  # number of topics (also reused later as the k-means cluster count)
documNums = len(codeCorpus)  # number of documents
tokenNums = len(code2token)  # vocabulary size, including the code-0 placeholder
print("the training documents num:",documNums)
print("the training tokens    num:",tokenNums)

the training documents num: 979931
the training tokens    num: 5904


##### token-code lists to a dense multi-hot numpy matrix

In [None]:
# Multi-hot document/token matrix: entry (d, c) is 1 when code c occurs in
# document d, else 0. Repeated codes within a document still give a single 1.
# NOTE(review): this is a dense documNums x tokenNums float32 array; at the
# sizes printed above (979931 x 5904) that is roughly 23 GB of memory.
tokenDocMat = np.zeros((documNums, tokenNums), dtype=np.float32)
for docIdx, docCodes in enumerate(codeCorpus):
    tokenDocMat[docIdx, docCodes] = 1
# Column vector holding, per document, the number of distinct codes it contains.
docTokenNum = tokenDocMat.sum(axis=1, keepdims=True)

##### get doc index and shuffle it

In [None]:
# Document indices 0..documNums-1, put into random order in place.
# After this line docIndxSpan[r] is NOT r, so anything evaluated with
# docIndxSpan as the index batch comes back in shuffled document order.
docIndxSpan = np.arange(documNums, dtype=np.int32)
shuffle(docIndxSpan)

##### construct the graph

In [None]:
#embedding initial
docum2topicEmbed  = tf.Variable(tf.random_uniform([documNums,HP_topicNums], -1.0, 1.0),name='docEmbed')
token2topicWeight = tf.Variable(tf.random_uniform([HP_topicNums,tokenNums], -1.0, 1.0),name='tokenEmbed')
multiplierEmbed   = tf.constant(docTokenNum,dtype=tf.float32)

In [None]:
#placeholder
document_idx   = tf.placeholder(tf.int32, shape=[None])
document_embed = tf.nn.embedding_lookup(docum2topicEmbed,document_idx) 
multiplier     = tf.nn.embedding_lookup(multiplierEmbed ,document_idx) 
tokensInDoc    = tf.placeholder(tf.float32,shape=[None,tokenNums])

In [None]:
#forward 
with tf.name_scope("TM"):
    documentembed_Sfmx     = tf.nn.softmax(document_embed)
    token2topicWeight_Sfmx = tf.nn.softmax(token2topicWeight,dim=0)
    tokensInDoc_pred       = tf.nn.softmax(tf.matmul(documentembed_Sfmx,token2topicWeight_Sfmx))*multiplier                           

In [None]:
# use least squre loss to train model
with tf.name_scope("loss"):   
    loss = tf.reduce_mean(tf.square(tokensInDoc - tokensInDoc_pred))

In [None]:
# Two optimisation ops sharing one Adam optimizer.
learning_rate = 0.1
with tf.name_scope("train"):    
    optimizer   = tf.train.AdamOptimizer(learning_rate)
    # Training op: no var_list is given, so it minimises over all trainable
    # variables (document embeddings and token weights).
    training_op = optimizer.minimize(loss)
    # "Predict" op: only the trainable variables whose name matches the scope
    # "docEmbed" are updated, so the token weights stay fixed while it runs.
    train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope="docEmbed")
    predict_op = optimizer.minimize(loss, var_list=train_vars)

In [None]:
#initial and save
init = tf.global_variables_initializer()
saver = tf.train.Saver()

#### training the tensorflow graph

In [None]:
#hyperparameters
n_epochs   = 1000
printLoop  = n_epochs/10
traningNum = 20000
batchNums  = documNums//traningNum+1
with tf.Session() as sess:    
    init.run()
    for epoch in range(n_epochs):
        #training
        trainIdx = docIndxSpan[:traningNum]
        top,train_loss =sess.run([training_op,loss], feed_dict={document_idx : trainIdx,                                                          
                                                                tokensInDoc  : tokenDocMat[trainIdx]})  
        if epoch%printLoop==0:
            print(train_loss)
    save_path = saver.save(sess,"tfsave/topic_model.ckpt")
    print("finished training,now begin predict")
    for epoch in range(n_epochs//10):
        for batch in range(batchNums):
        #training
            trainIdx = docIndxSpan[batch*traningNum:(batch+1)*traningNum]
            top,train_loss =sess.run([predict_op,loss], feed_dict={document_idx : trainIdx,                                                          
                                                                   tokensInDoc  : tokenDocMat[trainIdx]})  
        if epoch%printLoop==0:
            print(train_loss)
    # get embedding
    documMat = documentembed_Sfmx.eval(feed_dict={document_idx : docIndxSpan})
    tokenMat = token2topicWeight_Sfmx.eval()

0.00084257


In [None]:
# Sanity check: rows of documMat and columns of tokenMat are softmax outputs,
# so each of these sums should be (approximately) 1.
documMat.sum(axis=1)[:5],tokenMat.sum(axis=0)[:5]

#### show the rough topic

In [None]:
# Build a two-column summary and write it to 'save/topic.csv':
#   column 0 - per topic, a header cell then the topK highest-weighted tokens
#   column 1 - per topic, a separator cell then the topK highest-weighted documents
topK = 10
keyWords = []
keySentences = []
for topic in range(HP_topicNums):
    # Negating the weights makes argsort return the largest entries first.
    topTokens = np.argsort(-tokenMat[topic])[:topK]
    keyWords.append('主题' + str(topic + 1))
    keyWords.extend(code2token[i] for i in topTokens)

    topDocs = np.argsort(-documMat[:, topic])[:topK]
    keySentences.append('------------------------------')
    keySentences.extend(rawSentences[i] for i in topDocs)

# Two rows transposed into two columns of equal length.
outDF = pd.DataFrame([keyWords, keySentences]).T
outDF.to_csv('save/topic.csv', encoding='gbk', index=False)

In [None]:
# Preview the first rows of the topic summary.
outDF.head(10)

### 5)  use k-means to cluster the tokens and the documents

In [None]:
# Cluster tokens and documents separately into HP_topicNums groups, using
# their topic weights as features. tokenMat is topics x tokens, hence the
# transpose so that each row is one token.
tokenMat_KM = KMeans(n_clusters=HP_topicNums, random_state=0).fit(tokenMat.T)
documMat_KM = KMeans(n_clusters=HP_topicNums, random_state=0).fit(documMat)

In [None]:
# Cluster index assigned to every token / every document.
tokenMat_Label = tokenMat_KM.labels_
documMat_Label = documMat_KM.labels_

### 6)  use t-SNE for visualization

In [None]:
# Project the topic-weight vectors of tokens and documents with t-SNE for
# plotting (TSNE default number of output dimensions).
# NOTE(review): documMat has one row per document (979931 in the printed run);
# t-SNE on that many points may be very slow - consider a subsample.
tokenMat_tsne_embed = TSNE(random_state=1).fit_transform(tokenMat.T)  
documMat_tsne_embed = TSNE(random_state=1).fit_transform(documMat)  

##### the visualization of word embeddings

In [None]:
# Tokens in t-SNE space, coloured by their k-means cluster label
# (10-level "jet" colormap).
plt.scatter(tokenMat_tsne_embed[:,0],tokenMat_tsne_embed[:,1],c=tokenMat_Label, cmap=plt.cm.get_cmap("jet", 10))

##### the visualization of document embeddings

In [None]:
# Documents in t-SNE space, coloured by their k-means cluster label
# (10-level "jet" colormap).
plt.scatter(documMat_tsne_embed[:,0],documMat_tsne_embed[:,1],c=documMat_Label, cmap=plt.cm.get_cmap("jet", 10))

### 7)  save the outcomes

In [None]:
# Pair every kept raw text with the cluster label of its document.
# The pairing is positional: entry i of documMat_Label is attached to
# rawSentences[i].
outDF = pd.DataFrame({'text': rawSentences, 'label': documMat_Label})

In [None]:
# Preview the text/label table.
outDF.head()

In [None]:
# Write the text/label table as a GBK-encoded CSV without the index column.
# NOTE(review): the 'save' directory is not created anywhere in this notebook,
# so it presumably has to exist already - confirm.
outDF.to_csv('save/docLabel.csv',encoding='gbk',index=False)