In [5]:
import numpy as np
import pandas as pd
import re
import jieba
from multiprocessing import cpu_count, Pool
from utils.pickle_io import *
import utils.config as config
import paddle.fluid as fluid

In [14]:
import paddle.fluid as fluid
import numpy as np

class Encoder(fluid.dygraph.Layer):
    """Token-id encoder: embedding lookup followed by a GRUUnit unrolled over time."""

    def __init__(self, 
                 name_scope, 
                 enc_units, 
                 batch_size, 
                 vocab_size=1e5, 
                 word_vector=None,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False):
        '''
        Build the embedding, the optional projection layer and the GRU cell.

        :param name_scope: name scope handed to the base Layer
        :param enc_units: GRU hidden size H (the GRUUnit is built with size 3 * H)
        :param batch_size: batch size used for the default zero hidden state
        :param vocab_size: embedding rows when no ``word_vector`` is supplied
        :param word_vector: optional pre-trained embedding matrix (numpy array)
        :param param_attr: forwarded to GRUUnit
        :param bias_attr: forwarded to GRUUnit
        :param is_reverse: if True, time steps are processed from last to first
        :param gate_activation: forwarded to GRUUnit
        :param candidate_activation: forwarded to GRUUnit as ``activation``
        :param h_0: optional initial hidden state; zeros when omitted
        :param origin_mode: forwarded to GRUUnit
        '''
        super(Encoder, self).__init__(name_scope)
        self.enc_units = int(enc_units)
        self.batch_size = int(batch_size)
        self.vocab_size = int(vocab_size)
        self.word_vector = word_vector
        
        # Embedding
        if word_vector is not None:
            # Initialise the table from the supplied matrix and keep it trainable.
            w_param_attrs = fluid.ParamAttr(
                                name="emb_weight",
                                learning_rate=0.5,
                                initializer=fluid.initializer.NumpyArrayInitializer(self.word_vector),
                                trainable=True)
            self._embedding = fluid.dygraph.Embedding(
                                name_scope='embedding',
                                size=list(self.word_vector.shape),
                                param_attr= w_param_attrs,
                                is_sparse=False)
            # The GRU input must have width 3 * H; project the custom
            # embedding with an FC layer when its width differs.
            if self.word_vector.shape[1] != self.enc_units*3:
                self._fc = fluid.dygraph.FC('fc_for_gru', self.enc_units*3)
        else:
            # No custom vectors: learn an embedding that already has width 3 * H.
            self._embedding = fluid.dygraph.Embedding(
                                name_scope='embedding',
                                size=[self.vocab_size, self.enc_units*3],
                                param_attr='emb.w',
                                is_sparse=False)
        
        # GRU
        self._gru = fluid.dygraph.GRUUnit(
            self.full_name(),
            size=self.enc_units * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.h_0 = h_0 if h_0 is not None else self.initialize_hidden_state()
        self.is_reverse = is_reverse
                                                
    
    def forward(self, inputs, hidden=None):
        '''
        Run the GRU over every time step of ``inputs``.

        :param inputs: Variable indexed as (N, T, D); the smoke test in this
                       notebook feeds int64 ids with D == 1
        :param hidden: initial hidden state; falls back to ``self.h_0``
        :return output, state: output = per-step hidden states concatenated
                               along axis 1; state = ``output[:, -1, :]``
        '''
        hidden = self.h_0 if hidden is None else hidden
        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                i = inputs.shape[1] - 1 - i
            # Slice out a single time step and drop the time axis.
            input_ = inputs[:, i:i + 1, :]
            input_ = fluid.layers.reshape(
                input_, [-1, input_.shape[2]], inplace=False)
            # Embed ONLY the current step. The previous code embedded the whole
            # `inputs` tensor here, so every step saw the full sequence.
            input_ = self._embedding(input_)
            if hasattr(self, '_fc'):
                input_ = self._fc(input_)
            hidden, reset, gate = self._gru(input_, hidden)
            # Re-insert the time axis so the steps can be concatenated.
            hidden_ = fluid.layers.reshape(
                hidden, [-1, 1, hidden.shape[1]], inplace=False)
            res.append(hidden_)
        if self.is_reverse:
            # Restore chronological order of the collected states.
            res = res[::-1]
        res = fluid.layers.concat(res, axis=1)
        # NOTE(review): with is_reverse=True the slice below is the state of the
        # FIRST processed step, not the final recurrent state — confirm intent.
        return res, res[:,-1,:]
    
    def initialize_hidden_state(self):
        """Return a float32 zero tensor of shape (batch_size, enc_units)."""
        return fluid.layers.zeros((self.batch_size, self.enc_units), dtype='float32')

In [26]:
class BahdanauAttention(fluid.dygraph.Layer):
    """Additive attention scored by V(tanh(W1(values) + W2(query)))."""

    def __init__(self, name_scope, units):
        super(BahdanauAttention, self).__init__(name_scope)
        # All three FC layers keep the first two axes (num_flatten_dims=2).
        self.W1 = fluid.dygraph.FC('attention_fc1', units, num_flatten_dims=2)
        self.W2 = fluid.dygraph.FC('attention_fc2', units, num_flatten_dims=2)
        self.V = fluid.dygraph.FC('attention_v', 1, num_flatten_dims=2)

    def forward(self, query, values):
        """
        Score every position of ``values`` against ``query``.

        Expected shapes, per the original author's notes:
          query  -> (batch_size, 1, hidden_size)
          values -> (batch_size, max_length, hidden_size)

        Returns ``(context_vector, attention_weights)``: the weights are a
        softmax over axis 1 of the scores, and the context vector is the
        weighted ``values`` summed over axis 1.
        """
        # Project values first, then the query (same order as before).
        projected_values = self.W1(values)
        projected_query = self.W2(query)
        energies = fluid.layers.tanh(projected_values + projected_query)

        # One scalar score per position: (batch_size, max_length, 1).
        score = self.V(energies)
        attention_weights = fluid.layers.softmax(score, axis=1)

        # Paddle does not broadcast automatically here, hence the explicit
        # elementwise_mul before reducing over the length axis.
        weighted_values = fluid.layers.elementwise_mul(values, attention_weights)
        context_vector = fluid.layers.reduce_sum(weighted_values, dim=1)

        return context_vector, attention_weights

In [8]:
class Decoder(fluid.dygraph.Layer):
    """Single-step GRU decoder: embed ids, mix in the attention context, emit vocab scores."""

    def __init__(self, 
                 name_space, 
                 dec_units,
                 batch_size,
                 vocab_size=1e5, 
                 word_vector=None,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 attention_units=None):
        '''
        Build the embedding, the GRU cell and the two FC layers.

        :param name_space: name scope handed to the base Layer
        :param dec_units: GRU hidden size H (the GRUUnit is built with size 3 * H)
        :param batch_size: batch size (stored only)
        :param vocab_size: output/embedding vocabulary size; overridden by the
                           row count of ``word_vector`` when that is given
        :param word_vector: optional pre-trained embedding matrix (numpy array)
        :param param_attr: forwarded to GRUUnit
        :param bias_attr: forwarded to GRUUnit
        :param is_reverse: accepted for signature parity; not used here
        :param gate_activation: forwarded to GRUUnit
        :param candidate_activation: forwarded to GRUUnit as ``activation``
        :param h_0: accepted for signature parity; not used here
        :param origin_mode: forwarded to GRUUnit
        :param attention_units: stored on the instance only. Added as an
                           optional trailing argument because the constructor
                           previously read this name without defining it,
                           which raised NameError.
        '''
        super(Decoder, self).__init__(name_space)
        self.batch_size = int(batch_size)
        self.dec_units = int(dec_units)
        self.vocab_size = int(vocab_size)
        self.word_vector = word_vector
        self.attention_units = attention_units
        
        # Embedding
        if word_vector is not None:
            # NOTE(review): Encoder uses the same ParamAttr name "emb_weight";
            # confirm whether the two tables are meant to share that name.
            w_param_attrs = fluid.ParamAttr(
                                name="emb_weight",
                                learning_rate=0.5,
                                initializer=fluid.initializer.NumpyArrayInitializer(self.word_vector),
                                trainable=True)
            self._embedding = fluid.dygraph.Embedding(
                                name_scope='embedding',
                                size=list(self.word_vector.shape),
                                param_attr= w_param_attrs,
                                is_sparse=False)
            # Take the vocabulary size from the matrix itself rather than
            # from an attribute of the Embedding layer.
            self.vocab_size = int(self.word_vector.shape[0])
        else:
            self._embedding = fluid.dygraph.Embedding(
                                name_scope='embedding',
                                size=[self.vocab_size, self.dec_units*3],
                                param_attr='emb.w',
                                is_sparse=False)
        
        # GRU
        self._gru = fluid.dygraph.GRUUnit(
            self.full_name(),
            size=self.dec_units * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        
        # The concatenated [context, embedding] vector is projected to
        # width 3 * H, which is what the GRU cell expects as input.
        self._fc4gru = fluid.dygraph.FC('fc_for_gru', self.dec_units*3)
        
        # FC (N, H) ==> (N, V)
        self._fc = fluid.dygraph.FC('fc', self.vocab_size)
        
    def forward(self, inputs, hidden, context_vector):
        '''
        Decode one time step.

        :param inputs: token ids for the current step
        :param hidden: previous decoder hidden state
        :param context_vector: attention context for this step
        :return output, hidden: vocabulary scores from the final FC layer and
                                the updated hidden state — in that order
        '''
        inputs = self._embedding(inputs)
        # After the concat the width is context_size + embedding_size.
        inputs = fluid.layers.concat([context_vector, inputs], axis=-1)
        inputs = self._fc4gru(inputs)
        
        hidden, reset, gate = self._gru(inputs, hidden)
        
        output = self._fc(hidden)
        
        return output, hidden
        
#         res = []
#         gates = []
#         for i in range(inputs.shape[1]):
# #             if self.is_reverse:
# #                 i = inputs.shape[1] - 1 - i
# #             input_ = inputs[:, i:i + 1, :]
            
#             # get Attention context_vector, shape = (batch_size, hidden_size)
#             attention_input = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]])
#             context_vector, _ = self._attention(attention_input, enc_output)
            
#             # emb_vector shape after passing through embedding == (batch_size, 1, embedding_dim)
# #             input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]])
#             emb_vector = self._embedding(inputs)
            
#             # concat context_vector and embedding_vector
#             # input_ shape == (batch_size, 1, context_size + embedding_size)
#             input_ = fluid.layers.concat([emb_vector, context_vector], axis=-1)
#             input_ = fluid.layers.reshape(input_, [-1, 1, input_.shape[1]])
            
#             # According to paddlepaddle API, input_ size need to be 3 * H(dec_units)
#             input_ = self._fc4gru(input_)
#             hidden, reset, gate = self._gru(input_, hidden)
#             res_one_step = self._fc(hidden)
#             res_one_step = fluid.layers.reshape(res_one_step, [-1, 1, res_one_step.shape[1]])
#             res.append(res_one_step)
            
#             gate_ = fluid.layers.reshape(
#                 gate, [-1, 1, gate.shape[1]], inplace=False)
#             gates.append(gate_)
#         if self.is_reverse:
#             res = res[::-1]
#             gates = gates[::-1]
#         res = fluid.layers.concat(res, axis=1)
#         gates = fluid.layers.concat(gates, axis=1)
#         return res, gates

In [7]:
import utils.wv_loader as wv_loader

In [None]:
class Seq2Seq(fluid.dygraph.Layer):
    """Encoder / attention / decoder bundle trained with teacher forcing."""

    def __init__(self, name_scope, params):
        '''
        :param name_scope: name scope handed to the base Layer
        :param params: dict read for "enc_units", "dec_units", "attn_units",
                       "batch_size" and "vocab_size"
        '''
        super(Seq2Seq, self).__init__(name_scope)
        self.embedding_matrix = wv_loader.load_embedding_matrix()
        self.params = params
        self.encoder = Encoder(name_scope  = 'encoder',
                               enc_units   = params["enc_units"],
                               batch_size  = params["batch_size"],
                               word_vector = self.embedding_matrix)

        # BahdanauAttention takes (name_scope, units); the scope was missing.
        self.attention = BahdanauAttention('attention', params["attn_units"])

        # Keyword arguments keep each value on the Decoder parameter it is
        # meant for; the old positional call did not match the signature.
        self.decoder = Decoder(name_space  = 'decoder',
                               dec_units   = params["dec_units"],
                               batch_size  = params["batch_size"],
                               vocab_size  = params["vocab_size"],
                               word_vector = self.embedding_matrix)

    def _attend(self, dec_hidden, enc_output):
        """Call attention, lifting a rank-2 hidden state to (N, 1, H) first."""
        query = dec_hidden
        if len(dec_hidden.shape) == 2:
            query = fluid.layers.reshape(
                dec_hidden, shape=[dec_hidden.shape[0], 1, dec_hidden.shape[1]])
        return self.attention(query, enc_output)

    def call_encoder(self, enc_inp):
        """Encode ``enc_inp`` starting from a zero hidden state."""
        enc_hidden = self.encoder.initialize_hidden_state()
        enc_output, enc_hidden = self.encoder(enc_inp, enc_hidden)
        return enc_output, enc_hidden

    def call_decoder_onestep(self, dec_input, dec_hidden, enc_output):
        """Run attention and one decoder step; returns (pred, dec_hidden, context_vector, attention_weights)."""
        context_vector, attention_weights = self._attend(dec_hidden, enc_output)

        # Pass the real hidden state to the GRU (it used to receive None).
        pred, dec_hidden = self.decoder(dec_input,
                                        dec_hidden,
                                        context_vector)
        return pred, dec_hidden, context_vector, attention_weights

    def call(self, dec_hidden, enc_output, dec_target):
        '''
        Teacher-forced decoding over ``dec_target``.

        :param dec_hidden: initial decoder hidden state
        :param enc_output: encoder outputs attended over at every step
        :param dec_target: target ids; axis 1 is time
                           (assumes (N, T) or (N, T, 1) — TODO confirm)
        :return predictions, attentions: per-step decoder outputs and attention
                 weights, each with the decoding step on axis 1
        '''
        predictions = []
        attentions = []
        batch = dec_target.shape[0]

        context_vector, _ = self._attend(dec_hidden, enc_output)
        # One id column per example, matching what Decoder.forward concatenates
        # with the rank-2 context vector.
        dec_input = fluid.layers.reshape(dec_target[:, 0], [batch, -1])

        for t in range(1, dec_target.shape[1]):
            pred, dec_hidden = self.decoder(dec_input,
                                            dec_hidden,
                                            context_vector)

            context_vector, attn = self._attend(dec_hidden, enc_output)
            # using teacher forcing
            dec_input = fluid.layers.reshape(dec_target[:, t], [batch, -1])

            # Give each step its own axis-1 slot before concatenation.
            # NOTE(review): resulting layouts are (N, T-1, V) and
            # (N, T-1, T_enc) — confirm this is what downstream code expects.
            predictions.append(fluid.layers.reshape(pred, [batch, 1, -1]))
            attentions.append(fluid.layers.reshape(attn, [batch, 1, -1]))

        # Concatenate once, after the loop. Doing it inside the loop replaced
        # the lists with tensors and broke `.append` on the next step.
        predictions = fluid.layers.concat(predictions, axis=1)
        attentions = fluid.layers.concat(attentions, axis=1)

        return predictions, attentions

    def forward(self, dec_hidden, enc_output, dec_target):
        """Dygraph entry point; delegates to ``call``."""
        return self.call(dec_hidden, enc_output, dec_target)

### 测试桩

In [12]:
import utils.wv_loader as wv_loader
emb_matrix = wv_loader.load_embedding_matrix()
emb_matrix.shape

(34252, 300)

In [19]:
dec_hidden.shape, enc_outputs.shape

([100, 64], [100, 10, 64])

In [24]:
fluid.layers.reshape(dec_hidden, shape=[dec_hidden.shape[0],1,-1]).shape

[100, 1, 64]

In [34]:
# Shape smoke test: push random ids through Encoder -> attention -> Decoder.
use_emb = True
with fluid.dygraph.guard():
    # Encoder
    enc_units = 64
    batch_size = 100
    T = 10
    if use_emb:
        encoder = Encoder('encoder', enc_units=enc_units, batch_size=batch_size, word_vector=emb_matrix)
    else:
        encoder = Encoder('encoder', enc_units=enc_units, batch_size=batch_size)
    # Random token ids of shape (batch_size, T, 1).
    X = fluid.dygraph.base.to_variable(np.random.randint(emb_matrix.shape[0], size=(batch_size, T, 1)))
    hidden = encoder.initialize_hidden_state()
    enc_outputs, enc_states = encoder(X, hidden)
    print("Encoder output shape is {}".format(enc_outputs.numpy().shape))
    print("Encoder state shape is {}".format(enc_states.numpy().shape))
    attention = BahdanauAttention('attention', 10)
    
    # Decoder
    dec_units = 64
    batch_size = 100
    if use_emb:
        decoder = Decoder('decoder', dec_units=dec_units, batch_size=batch_size, word_vector=emb_matrix)
    else:
        decoder = Decoder('decoder', dec_units=dec_units, batch_size=batch_size)
    X = fluid.dygraph.base.to_variable(np.random.randint(emb_matrix.shape[0], size=(batch_size, T, 1)))
    # initial dec_hidden with encoder output hidden
    dec_hidden = enc_states
    preds = []
    for t in range(T):
        # Attention expects a (N, 1, H) query, so lift the hidden state first.
        context_vector, _ = attention(fluid.layers.reshape(dec_hidden, shape=[dec_hidden.shape[0],1,dec_hidden.shape[1]]), enc_outputs)
        x = X[:,t,:]
        # Decoder.forward returns (output, hidden). Take the hidden state from
        # the SECOND slot so vocabulary scores are not fed back as the state.
        _, dec_hidden = decoder(x, dec_hidden, context_vector)
        # What is collected here are the per-step hidden states (N, 1, dec_units).
        preds.append(fluid.layers.reshape(dec_hidden, [batch_size, 1, dec_units]))
    preds = fluid.layers.concat(preds, axis=1)
    print("Decoder preds shape is {}".format(preds.numpy().shape))

Encoder output shape is (100, 10, 64)
Encoder state shape is (100, 64)
Decoder preds shape is (100, 10, 64)


In [66]:
from gensim.models import FastText
wv_model = FastText.load('output/fasttext/fasttext.model')

In [15]:
# Optimizer: Adam with L2 weight decay.
# `regularization` needs a regularizer INSTANCE; the class object itself was
# being passed before.
# NOTE(review): the decay coefficient is a placeholder — tune it for this task.
optimizer = fluid.optimizer.AdamOptimizer(
    learning_rate=0.2,
    regularization=fluid.regularizer.L2Decay(regularization_coeff=1e-4))

In [39]:
# 定义loss函数
def loss_function(real, pred):
    """
    Masked cross-entropy between ``pred`` and the integer labels ``real``.

    Positions whose label equals 1 contribute zero loss. The per-position
    losses are summed over the last axis, divided by ``real.shape[0]`` and
    then averaged.
    """
    labels = fluid.layers.cast(real, dtype=np.int64)

    # True wherever the label is NOT equal to 1.
    equals_one = fluid.layers.equal(labels, fluid.layers.ones_like(labels))
    keep = fluid.layers.logical_not(equals_one)

    # Count of kept positions along the last axis.
    # NOTE(review): this value is computed but never used below — the
    # normalisation uses the leading dimension instead; confirm intent.
    dec_lens = fluid.layers.reduce_sum(
        fluid.layers.cast(keep, dtype=np.float32), dim=-1)

    # Per-position cross-entropy.
    token_loss = fluid.layers.cross_entropy(input=pred, label=labels)

    # Bring the mask to the loss dtype and zero out the masked positions.
    keep_weights = fluid.layers.cast(keep, dtype=token_loss.dtype)
    token_loss = token_loss * keep_weights

    # Sum over the last axis, scale by the leading dimension, then average.
    scaled = fluid.layers.reduce_sum(token_loss, dim=-1) / labels.shape[0]
    return fluid.layers.reduce_mean(scaled)

In [40]:
# Smoke-test loss_function on a hand-made three-class example.
with fluid.dygraph.guard():
    # Labels 0, 1, 2 reshaped into a single column.
    real = fluid.dygraph.base.to_variable(np.array([0,1,2]))
    real = fluid.layers.reshape(real, [-1, 1])
    # One probability row per label; each row peaks on its own class.
    pred = fluid.dygraph.base.to_variable(np.array([[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]))
    loss = loss_function(real, pred)
    print(loss)

name tmp_8, dtype: VarType.FP64 shape: [1] 	lod: {}
	dim: 1
	layout: NCHW
	dtype: double
	data: [0.03153]



In [1]:
from utils.data_loader import *

Building prefix dict from the default dictionary ...
2019-12-14 17:08:01,309 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w3/yc8mtbd91vs80rp79zfgk8x00000gn/T/jieba.cache
2019-12-14 17:08:01,311 : DEBUG : Loading model from cache /var/folders/w3/yc8mtbd91vs80rp79zfgk8x00000gn/T/jieba.cache
Loading model cost 0.630 seconds.
2019-12-14 17:08:01,940 : DEBUG : Loading model cost 0.630 seconds.
Prefix dict has been built succesfully.
2019-12-14 17:08:01,941 : DEBUG : Prefix dict has been built succesfully.


In [2]:
X_train,Y_train,X_test = load_dataset()

In [35]:
# Cast all three dataset splits to float.
# NOTE(review): if these arrays hold token ids, an embedding lookup will
# presumably need integer ids again — confirm before relying on the float copy.
X_train = X_train.astype('float')
Y_train = Y_train.astype('float')
X_test = X_test.astype('float')

In [36]:
save_file((X_train,Y_train,X_test),'data/dataset.pkl')

In [3]:
from utils.wv_loader import *
w2v_model = load_w2v_model(config.save_wv_model_path)

2019-12-14 17:03:44,051 : INFO : loading Word2Vec object from /Users/mac/Documents/baidu-ai-course/competition/BaiduAIProject-car/output/word2vec/word2vec.model
2019-12-14 17:03:44,632 : INFO : loading wv recursively from /Users/mac/Documents/baidu-ai-course/competition/BaiduAIProject-car/output/word2vec/word2vec.model.wv.* with mmap=None
2019-12-14 17:03:44,633 : INFO : setting ignored attribute vectors_norm to None
2019-12-14 17:03:44,634 : INFO : loading vocabulary recursively from /Users/mac/Documents/baidu-ai-course/competition/BaiduAIProject-car/output/word2vec/word2vec.model.vocabulary.* with mmap=None
2019-12-14 17:03:44,634 : INFO : loading trainables recursively from /Users/mac/Documents/baidu-ai-course/competition/BaiduAIProject-car/output/word2vec/word2vec.model.trainables.* with mmap=None
2019-12-14 17:03:44,634 : INFO : setting ignored attribute cum_table to None
2019-12-14 17:03:44,636 : INFO : loaded /Users/mac/Documents/baidu-ai-course/competition/BaiduAIProject-car/ou

In [4]:
# Word2vec embedding matrix
print('构建Word2vec Vocab...')
w2v_matrix = w2v_model.wv.vectors
save_file(w2v_matrix, config.embedding_matrix_path)

# Save id2word and word2id.
# Ids come from each vocab entry's `.index` — the row of that word in
# `wv.vectors` — rather than from the dict's iteration order, which is not
# guaranteed to line up with the rows of the matrix saved above.
vocab = w2v_model.wv.vocab
id2word = {entry.index: word for word, entry in vocab.items()}
word2id = {word: idx for idx, word in id2word.items()}
save_file(id2word, config.id2word_path)
save_file(word2id, config.word2id_path)

构建Word2vec Vocab...


In [5]:
vocab['<UNK>']

<gensim.models.keyedvectors.Vocab at 0x1a314434a8>

In [6]:
id2word = load_file(config.id2word_path)

In [8]:
word2id = load_file(config.word2id_path)

In [9]:
word2id['<UNK>']

34251

In [38]:
# Load the raw training CSV in this cell so it also runs on a fresh kernel;
# the load used to be commented out, leaving `df_raw_train` as hidden state.
df_raw_train = pd.read_csv(config.train_data_path)
df_raw_train['Dialogue'][1]

'技师说：你这个有没有电脑检测故障代码。|车主说：有|技师说：发一下|车主说：发动机之前亮故障灯、显示是失火、有点缺缸、现在又没有故障、发动机多少有点抖动、检查先前的故障是报这个故障|车主说：稍等|车主说：显示图片太大传不了|技师说：[语音]|车主说：这个对发动机的抖动、失火、缺缸有直接联系吗？|技师说：[语音]|车主说：还有就是报（左右排气凸轮轴作动电磁铁）对正极短路、对地短路、对导线断路|技师说：[语音]|车主说：这几个电磁阀和问您的第一个故障有直接关系吧|技师说：[语音]|车主说：这个有办法检测它好坏吗？|技师说：[语音]|车主说：谢谢|技师说：不客气'

In [10]:
train_df['X'][0]

NameError: name 'train_df' is not defined

In [48]:
len(train_df['X'][0].split(' '))

302

In [None]:
w2v_model = 