    从头实现rnn
    rnn循环结构

In [7]:
import csv
import nltk
import itertools
import numpy as np
import datetime

In [8]:
#定义一些基本的常量
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
start_token = "SENTENCE_START"
end_token = "SENTENCE_END"

In [9]:
#处理数据，这里的数据是从reddit上面下载的英文的评论，预先使用nltk工具处理后，
#得到原始的训练数据集
file_path = "reddit-comments-2015-08.csv"
tokenized = []
original = []
with open(file_path, 'r') as f:
    #读取内容
    reader = csv.reader(f, skipinitialspace=True)
    for x in reader:
        #每个句子前后都加上开始和结束标记，然后再使用nltk分词
        sentences = nltk.sent_tokenize(x[0].lower())
        for sentence in sentences:
            original.append(sentence)
            sentence = start_token + " " + sentence + " " + end_token
            tokenized.append(nltk.word_tokenize(sentence))
    f.close()
print(tokenized[0:50])

[['SENTENCE_START', 'body', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END'], ['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'wildly', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'rb/wr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'would', 'i

In [10]:
print(original[0:2])

['body', "i joined a new league this year and they have different scoring rules than i'm used to."]


In [11]:
#统计词频
word_freq = nltk.FreqDist(itertools.chain(*tokenized))
print("Found {} unique words tokens".format(len(word_freq)))

Found 65499 unique words tokens


In [12]:
#获取常用词，然后构建词频向量
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([w, i] for i, w in enumerate(index_to_word))
print(index_to_word)



In [13]:
print(word_to_index)



In [14]:
print("vacabulary size {}".format(vocabulary_size))
print("The least frequent word in our vocabulary is '{}' and appeared times {}".format(vocab[-1][0], vocab[-1][1]))

vacabulary size 8000
The least frequent word in our vocabulary is 'documentary' and appeared times 10


In [15]:
#将所有不在词汇表中的词替换为unknown token
for i, sent in enumerate(tokenized):
    tokenized[i] = [w if w in word_to_index else unknown_token for w in sent]
print("Example sentence:{}".format(original[1]))
print("after pre-processint:{}".format(tokenized[1]))

Example sentence:i joined a new league this year and they have different scoring rules than i'm used to.
after pre-processint:['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']


In [16]:
#创建训练数据
#训练的主要目的是预测下一个词，所以x的数据是对应y数据的前一个词
x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized])
print(x_train[1])
print(y_train[1])

[0, 6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2]
[6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2, 1]


    数据准备好后开始构建RNN网络
    首先设置RNN网络的默认参数
    网址<a href="https://songhuiming.github.io/pages/2017/08/20/build-recurrent-neural-network-from-scratch"/>

In [17]:
#隐层神经元个数
hidden_dim = 100
#梯度下降批数
bptt_truncate = 1000
#s = tanh(ux + w(s-1))
#o = softmax(vs)
u = np.random.uniform(-np.sqrt(1/vocabulary_size), np.sqrt(1/vocabulary_size), (hidden_dim, vocabulary_size))
v = np.random.uniform(-np.sqrt(1/hidden_dim), np.sqrt(1/hidden_dim), (vocabulary_size, hidden_dim))
w = np.random.uniform(-np.sqrt(1/hidden_dim), np.sqrt(1/hidden_dim), (hidden_dim, hidden_dim))

In [18]:
#前向传播计算结果
#softmax函数是计算x矩阵每一个x的softmax函数值
def softmax(x):
    xt = np.exp(x - np.sum(np.fromiter(x, "float")))
    return xt / np.sum(np.fromiter(xt, "float"))

def forward_propagation(x):
    global u, v, w
    T = len(x)
    s = np.zeros((T + 1, hidden_dim))
    s[-1] = np.zeros(hidden_dim)
    o = np.zeros((T, vocabulary_size))
    
    for t in np.arange(T):
        s[t] = np.tanh(u[:,x[t]] + w.dot(s[t - 1]))
        o[t] = softmax(v.dot(s[t]))
    return (o, s)

In [19]:
#预测函数
#根据o输出值中最大概率的判断结果
def prediction(x):
    o, s = forward_propagation(x)
    return np.argmax(o, axis=1)

In [20]:
#损失函数
#定义交叉熵损失函数
#单个token的预测结果熵，对于一个x预测有y类结果，y大小为词库大小
def calculate_loss(x, y):
    L = 0
    for t in np.arange(len(y)):
        o, s = forward_propagation(x[t])
        correct_word_predictions = o[np.arange(len(y[t])), x[t]]
        L += -1 * np.sum(np.fromiter((np.log(correct_word_predictions)), "float"))
    return L 

#计算总的，就是把每一个token的熵都加起来
def total_loss(x, y):
    n = np.sum(np.fromiter((len(y_i) for y_i in y), "float"))
    return calculate_loss(x, y) / n
# print("expected loss : {},actual loss : {}".format(np.log(vocabulary_size), total_loss(x_train[:1000], y_train[:1000])))

In [21]:
#反向传播训练模型
#使用梯度下降法
#因为是循环神经网络，网络中的参数记录有时序信息，所以递推公式中包含有前面的数据信息
def bptt(x, y):
    global u, v, w
    T = len(y)
    o, s = forward_propagation(x)
    #初始化参数
    dLdu = np.zeros(u.shape)
    dLdv = np.zeros(v.shape)
    dLdw = np.zeros(w.shape)
    #初始化参数
    delta_o = o 
    #y^ = y^ - y
    delta_o[np.arange(len(y)), y] -= 1
    for t in np.arange(T):
        dLdv += np.outer(delta_o[t], s[t].T)
        delta_t = v.T.dot(delta_o[t]) * (1 - (s[t] * s[t]))
        #根据公式从后往前推
        for bptt_step in np.arange(max(0, t - bptt_truncate), t + 1)[::-1]:
            dLdw += np.outer(delta_t, s[bptt_step - 1])
            dLdu[:, x[bptt_step]] += delta_t
            delta_t = w.T.dot(delta_t) * (1 - s[bptt_step - 1] * s[bptt_step - 1])
    return (dLdu, dLdv, dLdw)

In [22]:
#梯度更新
def sgd_step(x, y, learning_rate):
    dLdu, dLdv, dLdw = bptt(x, y)
    global u, v, w
    u -= learning_rate * dLdu
    v -= learning_rate * dLdv
    w -= learning_rate * dLdw

In [23]:
#模型训练
def train_with_sgd(x_train, y_train, learning_rate = 0.005, nepoch = 100, evaluate_loss_after = 5):
    losses = []
    num_examples_seen = 0 
    for epoch in range(nepoch):
        if epoch % evaluate_loss_after == 0:
            loss = total_loss(x_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print("{} loss after num_examples_seen={} epoch={}:{}".format(time, num_examples_seen, epoch, loss))
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print("setting learning rate to {}".format(learning_rate))
        for i in range(len(y_train)):
            sgd_step(x_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [24]:
losses = train_with_sgd(x_train[:100], y_train[:100], nepoch = 10, evaluate_loss_after = 1)

2019-09-01 18:25:25 loss after num_examples_seen=0 epoch=0:8.9872068821151
2019-09-01 18:25:35 loss after num_examples_seen=100 epoch=1:8.98552615293417
2019-09-01 18:25:45 loss after num_examples_seen=200 epoch=2:8.983090642311604
2019-09-01 18:25:55 loss after num_examples_seen=300 epoch=3:8.977366760876784
2019-09-01 18:26:05 loss after num_examples_seen=400 epoch=4:7.4244368431676975
2019-09-01 18:26:15 loss after num_examples_seen=500 epoch=5:6.625243664967802
2019-09-01 18:26:25 loss after num_examples_seen=600 epoch=6:6.331072642659455
2019-09-01 18:26:34 loss after num_examples_seen=700 epoch=7:6.168752788451529
2019-09-01 18:26:44 loss after num_examples_seen=800 epoch=8:6.071442979663433
2019-09-01 18:26:54 loss after num_examples_seen=900 epoch=9:6.0117639310193445


In [30]:
#使用训练好的模型生成随机新文本
def generate_text(dict_words, index_of_words):
    #dict_words时type list,index of words 是type dict
    sent = [index_of_words[start_token]]
    #根据start_token预测后面的词，直到句子的结束
    while not sent[-1] == index_of_words[end_token]:
        next_probs, _ = forward_propagation(sent)
        sample_word = index_of_words[unknown_token]
        
        #前向预测概率后，取概率最大的单词
        while sample_word == index_of_words[unknown_token]:
            samples = np.random.multinomial(1, next_probs[-1])
            sample_word = np.argmax(samples)
        sent.append(sample_word)
        
    new_sent = [dict_words[i] for i in sent[1:-1]]
    new_sent_str = ' '.join(new_sent)
    
    return new_sent_str

In [34]:
sent_str = generate_text(index_to_word, word_to_index)
print("generate sentence:{}".format(sent_str))

generate sentence:relations be efficiently think single .
