# 循环神经网络语言模型
## 自然语言编码
神经网络无法直接处理汉字，需要将汉字编号。下面这段代码就是利用Python的字典，对一句话中的每个字进行编号。


In [None]:
def encode_sentence(s,chars):
    """Turn a sentence into a list of integer ids, one per character.

    The result always starts with 0 and ends with 1, the ids that the
    vocabularies built in this file give to '<BOS>' and '<EOS>'.  A character
    that is not yet in ``chars`` is added to it in place and receives the
    next free id, i.e. the size of the dict at that moment.

    Args:
        s: iterable of characters to encode.
        chars: dict mapping character -> id; extended in place.

    Returns:
        list of ids of the form [0, id(c1), ..., id(cn), 1].
    """
    # setdefault returns the existing id, or stores and returns the new one.
    body=[chars.setdefault(ch,len(chars)) for ch in s]
    return [0]+body+[1]

In [None]:
# Demo: number every character of one sentence, starting from a vocabulary
# that already holds the three special tokens.
chars={'<BOS>':0,'<EOS>':1,'<UNK>':2}
# The u prefix makes this a text string under Python 2 as well, so that
# encode_sentence iterates over characters instead of single UTF-8 bytes.
sen=u"巴黎是法国的首都及最大都市，同时是法兰西岛大区首府，为法国的政治与文化中心，隶属法兰西岛大区之下的巴黎省"
encode_sentence(sen,chars)


## 读取数据
读取语料的同时将汉字转换成上述的编号，并记录汉字和编号的对应表（保存到 chars.pkl，供后面测试时使用）。

In [None]:
import os
import json
import pickle

def prepare_data(dir_):
    """Read every corpus file in ``dir_`` and encode its sentences as id lists.

    Each file is read line by line.  Every non-blank line is parsed as a JSON
    object whose 'text' field is split on newlines; each non-blank piece is
    kept as one sentence and encoded with ``encode_sentence``, which also
    grows the character table.  The finished table is pickled to 'chars.pkl'
    in the current working directory.

    Args:
        dir_: path of a directory containing the corpus files (UTF-8 text,
            one JSON object per line).

    Returns:
        (sentences, sids, chars): the raw sentence strings, their encoded id
        lists in the same order, and the dict mapping character -> id.
    """
    import io  # local import: only needed here, for an explicit file encoding

    chars={'<BOS>':0,'<EOS>':1,'<UNK>':2}
    sentences=[]
    sids=[]
    # os.listdir returns entries in arbitrary order; sorting makes the id
    # assigned to each character reproducible from one run to the next.
    for file_ in sorted(os.listdir(dir_)):
        al=os.path.join(dir_,file_)
        print(al)
        with io.open(al,'r',encoding='utf-8') as f:
            # Iterate lazily instead of loading the whole file with readlines().
            for line in f:
                # A blank (e.g. trailing) line is not a JSON document; skip it.
                if not line.strip():
                    continue
                data=json.loads(line)
                for s in data['text'].split('\n'):
                    if s.strip():
                        sentences.append(s)
                        sids.append(encode_sentence(s,chars))
    n_char=len(chars)
    print('vocabulary_size=%d data_size=%d'%(n_char,len(sids)))
    # Use a context manager so the pickle file is flushed and closed.
    with open('chars.pkl','wb') as f:
        pickle.dump(chars,f)

    return sentences,sids,chars

In [None]:
# Build the training data from the "corpus" directory; prepare_data also
# writes the character table to chars.pkl.
sentences,sids,chars=prepare_data("corpus")

## 训练神经网络
首先设置一些超参数

In [None]:
class Args(object):
    """Hyper-parameters handed to LSTMLM as a single object.

    How LSTMLM interprets each field is not visible in this file; the notes
    marked "presumably" are inferred from the names and should be confirmed
    against lstm.py.
    """
    # Number of sentences drawn for each training step (see train).
    batch_size=16
    # Number of ids the model's output distribution covers; the generation
    # code samples from range(vocab_size).
    vocab_size=12000
    # Presumably the longest sequence the model handles -- TODO confirm.
    max_length=256
    # Presumably the character embedding size -- TODO confirm.
    n_emb=80
    # Presumably the LSTM hidden state size -- TODO confirm.
    n_hidden=512

## 开始训练流程

In [None]:
from lstm import LSTMLM
import numpy as np
import copy

def train(sids):
    """Train an LSTMLM on the encoded sentences and checkpoint it to 'model'.

    Runs 40000 steps.  Each step draws ``Args.batch_size`` sentences uniformly
    at random (with replacement) and passes copies of them to ``lstm.train``.
    The loss is printed every 10 steps; the model is saved every 1000 steps
    and once more after the last step.

    Args:
        sids: non-empty list of encoded sentences (lists of ids), as returned
            by prepare_data.

    Raises:
        ValueError: if ``sids`` is empty.
    """
    if len(sids)==0:
        raise ValueError('sids must contain at least one sentence')
    args=Args()
    lstm=LSTMLM(args)
    lstm.build_model()

    n_steps=40000
    for i in range(n_steps):
        # Sample positions rather than passing sids to np.random.choice:
        # choice needs a 1-D array, but a list of id lists becomes 2-D when
        # all sentences share a length, and recent NumPy refuses to convert
        # it at all when they do not.
        picks=np.random.randint(len(sids),size=args.batch_size)
        # Hand over copies so the stored sentences stay untouched in case
        # lstm.train modifies its input.
        batch_sen=[copy.copy(sids[j]) for j in picks]
        loss=lstm.train(batch_sen)
        if i%10==0:
            print('step=%d, loss=%.3f'%(i,loss))
        if i%1000==0 and i!=0:
            lstm.save_model('model')
    # The last periodic save happens 999 steps before the end; save again so
    # the final updates are not lost.
    lstm.save_model('model')

# Start training on the encoded sentences returned by prepare_data.
train(sids)

## 测试和使用
语言模型可以判断任意字符串是自然语言的概率，有非常多的用处。
### 判断几句话中哪句更通顺

In [None]:
from lstm import LSTMLM
import numpy as np

def get_prob(sen):
    """Score a sentence with the trained language model.

    Loads the model from 'model' and the character table from 'chars.pkl',
    wraps the sentence in <BOS>/<EOS>, and for every position accumulates the
    log-probability the model assigns to the actual next token given the
    tokens before it.  A progress line (text so far, running average,
    running sum) is printed after each token.

    Args:
        sen: the sentence, either UTF-8 encoded bytes or a text string.

    Returns:
        The average negative log-probability (natural log) per predicted
        token; lower values mean the model finds the sentence more likely.
    """
    # Accept both encoded bytes (Python 2 str literals) and text strings.
    if isinstance(sen,bytes):
        sen=sen.decode('utf-8')
    args=Args()
    lstm=LSTMLM(args)
    lstm.build_model()
    lstm.load_model('model')
    with open('chars.pkl','rb') as f:
        chars=pickle.load(f)

    segments=['<BOS>']+list(sen)+['<EOS>']
    # Characters missing from the table fall back to id 2 (<UNK>).
    sid=[chars.get(c,2) for c in segments]
    eprob=0.
    epp=0.
    for i in range(1,len(sid)):
        # Distribution over the next token given the first i tokens.
        dist=lstm.next_char([sid[:i]])[0]
        # Use the probability predicted for the token that actually follows;
        # taking the log of the token id itself would be meaningless.
        eprob+=np.log(dist[sid[i]])
        epp=-eprob/i
        print('%s %s %s'%(sen[:i].encode('utf-8'),epp,eprob))
    return epp


In [None]:
# Score a jumble of characters and a well-formed sentence for comparison.
sen1="分哈啊词腌可"
print get_prob(sen1)
sen2="数学是一门历史悠久的学科。"
print get_prob(sen2)

### 将句子补齐

In [None]:
from lstm import LSTMLM
import numpy as np
import copy

def maximum_generate(prefix,max_len=64):
    """Continue ``prefix`` by sampling characters from the trained model.

    Loads the model from 'model' and the character table from 'chars.pkl',
    then repeatedly asks the model for the next-token distribution and draws
    the next token from it.  Despite the function name, the token is sampled
    at random according to the distribution, not chosen as the most probable
    one.  The text generated so far is printed after every step.

    Generation stops once id 1 (<EOS>) has been drawn or the sequence holds
    ``max_len`` tokens.  Every drawn token's text is appended to the result,
    so a drawn <EOS> shows up as the literal marker at the end.

    Args:
        prefix: starting text, either UTF-8 encoded bytes or a text string.
        max_len: maximum number of tokens in the sequence, counting <BOS>
            and the prefix characters.

    Returns:
        The prefix followed by the generated text, as a text string.
    """
    # Accept both encoded bytes (Python 2 str literals) and text strings.
    if isinstance(prefix,bytes):
        prefix=prefix.decode('utf-8')
    args=Args()
    lstm=LSTMLM(args)
    lstm.build_model()
    lstm.load_model('model')
    with open('chars.pkl','rb') as f:
        chars=pickle.load(f)
    # Reverse table: id -> character.
    rchars={idx:c for c,idx in chars.items()}

    segments=['<BOS>']+list(prefix)
    # Characters missing from the table fall back to id 2 (<UNK>).
    sid=[chars.get(c,2) for c in segments]
    str_=prefix
    while sid[-1]!=1 and len(sid)<max_len:
        dist=lstm.next_char([copy.copy(sid)])[0]
        # An int first argument samples from arange(vocab_size) without
        # building a fresh list on every step.
        nxt=int(np.random.choice(args.vocab_size,p=dist))
        sid.append(nxt)
        # vocab_size may exceed the number of known characters, so a drawn id
        # can have no entry in the table; show it as <UNK> instead of failing.
        str_+=rchars.get(nxt,u'<UNK>')
        print(str_.encode('utf-8'))
    return str_

In [None]:
# Generate a continuation of the prefix "数学" and print it as UTF-8.
print maximum_generate("数学").encode('utf-8')