In [142]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import gensim
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding

# Locations of the raw training data and of the persisted model artefacts.
model_path = dict(
    raw_data='raw_data.csv',
    model_w2v='w2v_model_50_V2',
    model_nn='model_V2.json',
    model_nn_h5='model_V2.h5',
    w2v_corpus='train_corpus_V2',
)

# Training hyper-parameters (number of epochs used when fitting the classifier).
para = dict(epoch=300)
class SemanticIntentionClassify:
    """Character-level intent classifier: word2vec embeddings feeding a stacked LSTM.

    The constructor receives two dictionaries:

    model_path -- keys ``raw_data`` (CSV with columns ``X`` and ``Y``),
                  ``model_w2v`` (word2vec model file), ``model_nn`` (Keras
                  architecture JSON), ``model_nn_h5`` (Keras weights) and
                  ``w2v_corpus`` (text corpus used to train word2vec).
    para       -- key ``epoch``: number of epochs for ``train_classifier``.
    """

    # Every sentence is padded/truncated to this many tokens.
    MAX_LEN = 30

    # Label map used by predict() when the caller does not supply one.
    DEFAULT_LABELS = {0: '找职位', 1: '找邮箱', 2: '找部门', 3: '找领导'}

    def __init__(self, model_path, para):
        # The raw data is read eagerly; the model files are only loaded on demand.
        self.raw_data = pd.read_csv(model_path['raw_data'])
        self.model_w2v = model_path['model_w2v']
        self.model_nn = model_path['model_nn']
        self.model_nn_h5 = model_path['model_nn_h5']
        self.w2v_corpus = model_path['w2v_corpus']
        self.epoch = para['epoch']

    def w2v_corpus_generate(self):
        """Append the word2vec training corpus to ``self.w2v_corpus``.

        Every character of every sentence in column ``X`` is written followed
        by a single space. The file is opened in append mode, so calling this
        twice duplicates the corpus.
        """
        # Explicit UTF-8: the corpus is Chinese text and must not depend on
        # the platform's default encoding.
        with open(self.w2v_corpus, 'a', encoding='utf-8') as f:
            for sentence in self.raw_data['X']:
                f.write(''.join(char + ' ' for char in sentence))
        print ('finish!!')

    def _load_word2vec_model(self):
        """Load the word2vec model and build the token -> index dictionary.

        Returns:
        (model_w2v, word_to_index) -- indices start at 0 and follow the
        iteration order of the model vocabulary.
        """
        model_w2v = gensim.models.Word2Vec.load(self.model_w2v)
        word_to_index = {word: i for i, word in enumerate(model_w2v.wv.vocab)}
        return model_w2v, word_to_index

    def _load_nn_model(self):
        """Load the Keras model: architecture from JSON, weights from HDF5."""
        from keras.models import model_from_json
        with open(self.model_nn, 'r') as json_file:
            loaded_model = model_from_json(json_file.read())
        loaded_model.load_weights(self.model_nn_h5)
        return loaded_model

    def _sentences_to_indices(self, x, word_to_index, max_len):
        """Convert a batch of token sequences into a zero-padded index matrix.

        Arguments:
        x -- sequence (list or array) of token sequences
        word_to_index -- dictionary mapping a token to its vocabulary index
        max_len -- number of columns of the result

        Returns:
        float array of shape (len(x), max_len). Spaces and tokens missing from
        ``word_to_index`` are skipped; tokens beyond ``max_len`` are dropped.

        NOTE: padding uses 0, which is also the index of the first vocabulary
        entry. This is kept as-is because changing it would invalidate weights
        trained with this mapping.
        """
        x_indices = np.zeros((len(x), max_len))
        for i, sentence_words in enumerate(x):
            j = 0
            for w in sentence_words:
                if j >= max_len:
                    # Truncate instead of writing past the last column.
                    break
                if w != ' ' and w in word_to_index:
                    x_indices[i, j] = word_to_index[w]
                    j += 1
        return x_indices

    def data_generate(self):
        """Convert the raw data into model inputs for training or evaluation.

        Returns:
        x -- index matrix of shape (number of rows, MAX_LEN)
        y -- integer-encoded labels of column ``Y``
        y_word_to_indice -- dictionary mapping an encoded label back to its
                            original value

        Usage:
        model = SemanticIntentionClassify(model_path, para)
        x, y, y_word_to_indice = model.data_generate()
        """
        raw_data = self.raw_data
        _, word_to_index = self._load_word2vec_model()
        # Iterate the column directly (no label/position mixing) and keep the
        # sentences as plain lists, so no ragged NumPy array is ever built.
        x = [[ch for ch in text if ch != ' '] for text in raw_data['X']]
        x = self._sentences_to_indices(x, word_to_index, max_len=self.MAX_LEN)
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(raw_data['Y'])
        y_word_to_indice = dict(enumerate(le.classes_))
        return x, y, y_word_to_indice

    def error_analysis(self, x, y):
        """Return the indices of the samples the saved model misclassifies.

        Arguments:
        x -- index matrix produced by data_generate
        y -- encoded labels produced by data_generate

        Usage:
        model = SemanticIntentionClassify(model_path, para)
        x, y, _ = model.data_generate()
        fault_indice = model.error_analysis(x, y)
        """
        if len(x) == 0:
            return []
        nn_model = self._load_nn_model()
        # One batched call instead of one predict() call per sample.
        pred_y = np.argmax(nn_model.predict(np.asarray(x)), axis=1)
        return [i for i, (truth, guess) in enumerate(zip(y, pred_y)) if truth != guess]

    def predict(self, x, nn_model, model_w2v, word_to_index, y_word_to_indice=None):
        """Predict the intent of a single sentence.

        Arguments:
        x -- the sentence to classify (string)
        nn_model -- model returned by _load_nn_model
        model_w2v, word_to_index -- values returned by _load_word2vec_model
        y_word_to_indice -- optional dictionary mapping a class index to its
                            label; defaults to DEFAULT_LABELS

        Returns:
        the label of the most probable class

        Usage:
        model = SemanticIntentionClassify(model_path, para)
        nn_model = model._load_nn_model()
        model_w2v, word_to_index = model._load_word2vec_model()
        model.predict("你的库里面有悦阅的部门吗？", nn_model, model_w2v, word_to_index)
        """
        if y_word_to_indice is None:
            y_word_to_indice = self.DEFAULT_LABELS
        vocab = model_w2v.wv.vocab
        # Keep only the characters known to the word2vec vocabulary.
        inputs_list = [each for each in x if each in vocab]
        x_test_indices = self._sentences_to_indices([inputs_list], word_to_index, self.MAX_LEN)
        pred = nn_model.predict(x_test_indices)
        return y_word_to_indice[int(np.argmax(pred))]

    def train_w2v(self):
        """Train the word2vec model on the corpus file and save it.

        Trains 50-dimensional vectors (window 5, min_count 1, 100 epochs) on
        ``self.w2v_corpus`` and saves the result to ``self.model_w2v``.

        Returns:
        the trained gensim Word2Vec model

        Usage:
        model = SemanticIntentionClassify(model_path, para)
        model_w2v = model.train_w2v()
        """
        from gensim.models import word2vec
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)
        sentences = word2vec.Text8Corpus(self.w2v_corpus)
        model = gensim.models.Word2Vec(size=50, window=5, min_count=1)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=100)
        # Save where _load_word2vec_model will look for it.
        model.save(self.model_w2v)
        return model

    def _pretrained_embedding_layer(self, word_to_vec_map, word_to_index):
        """Create a frozen Keras Embedding layer initialised with pre-trained vectors.

        Arguments:
        word_to_vec_map -- dictionary mapping a token to its embedding vector
        word_to_index -- dictionary mapping a token to its vocabulary index

        Returns:
        embedding_layer -- non-trainable Keras Embedding instance

        Raises:
        ValueError -- if word_to_vec_map is empty
        """
        if not word_to_vec_map:
            raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")
        # One extra row beyond the vocabulary; it is never assigned and stays zero.
        vocab_len = len(word_to_index) + 1
        # Take the dimension from any vector rather than from a fixed key that
        # may not exist in the vocabulary.
        emb_dim = next(iter(word_to_vec_map.values())).shape[0]
        emb_matrix = np.zeros((vocab_len, emb_dim))
        for word, index in word_to_index.items():
            emb_matrix[index, :] = word_to_vec_map[word]
        embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
        # The layer must be built before its weights can be set.
        embedding_layer.build((None,))
        embedding_layer.set_weights([emb_matrix])
        return embedding_layer

    def _keras_model(self, input_shape, word_to_vec_map, word_to_index, num_classes=4):
        """Build the classifier graph: embedding -> LSTM -> LSTM -> softmax.

        Arguments:
        input_shape -- shape of the input, usually (max_len,)
        word_to_vec_map -- dictionary mapping a token to its embedding vector
        word_to_index -- dictionary mapping a token to its vocabulary index
        num_classes -- size of the softmax output (default 4)

        Returns:
        keras_model -- an uncompiled Keras Model instance
        """
        sentence_indices = Input(input_shape, dtype='int32')
        embedding_layer = self._pretrained_embedding_layer(word_to_vec_map, word_to_index)
        embeddings = embedding_layer(sentence_indices)
        X = LSTM(128, return_sequences=True)(embeddings)
        X = Dropout(0.5)(X)
        X = LSTM(128, return_sequences=False)(X)
        X = Dropout(0.5)(X)
        X = Dense(num_classes)(X)
        X = Activation('softmax')(X)
        keras_model = Model(inputs=sentence_indices, outputs=X)
        return keras_model

    def _embedding_dict(self, model_w2v):
        """Return a dictionary mapping every vocabulary token to its vector."""
        return {word: model_w2v.wv[word] for word in model_w2v.wv.index2word}

    def _convert_to_one_hot(self, y, c):
        """One-hot encode the integer labels ``y`` into ``c`` columns."""
        return np.eye(c)[np.asarray(y).reshape(-1)]

    def train_classifier(self):
        """Train the intent classifier and save it to the configured paths.

        The architecture is written to ``self.model_nn`` (JSON) and the
        weights to ``self.model_nn_h5``.

        Returns:
        the trained Keras model

        Usage:
        model = SemanticIntentionClassify(model_path, para)
        model.train_classifier()
        """
        x, y, label_map = self.data_generate()
        # Size the one-hot targets and the output layer from the same value.
        num_classes = len(label_map)
        y = self._convert_to_one_hot(y, num_classes)
        model_w2v, word_to_index = self._load_word2vec_model()
        word_to_vec_map = self._embedding_dict(model_w2v)
        model = self._keras_model((self.MAX_LEN,), word_to_vec_map, word_to_index,
                                  num_classes=num_classes)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(x, y, epochs=self.epoch, shuffle=True)
        # Save where _load_nn_model will look for the files.
        with open(self.model_nn, "w") as json_file:
            json_file.write(model.to_json())
        model.save_weights(self.model_nn_h5)
        return model
    

In [143]:
# Build the classifier wrapper; the constructor reads the raw-data CSV named in model_path.
model = SemanticIntentionClassify(model_path,para)
# Training is left disabled here; uncomment the next line to retrain the classifier.
#model.train_classifier()

### 模型实验：
#### 通过人找找领导
1. xxx的上司是谁？  
fault_list：'张信', '曹信福', '谢子勃'

2. 帮我找一下xxx的领导？  
需要深度优化

3. xxx的老大是谁啊？  
fault_list: '王永新', '谢青卓', '谢正南', '王新', '谢毅舜', '谢妍', '张子荣', '邹子轩', '谢小平', '谢智', '贺子芮', '王子涵', '杨子亮', '李子振', '谢茂龙', '谢朋程', '谢丽胜', '王有圆', '卢发展', '谢敏', '谢名辉', '谢元珍', '王震新', '谢晓宇', '张信', '谢广振', '王会新', '谢志华', '要亚刚', '谢辉', '谢华强', '谢闻奇', '谢小霞', '谢红艳', '谢佳鑫', '曹信福', '谢东', '谢吉敬', '谢阳', '谢秋朋', '严润发', '谢子勃', '潘子龙', '郑子阳', '谢昆仑', '谢乔', '纪子寒', '刘子权', '蒲圆圆', '谢杰', '谢海鲸', '赵子斌', '金基勇', '谢婕', '谢国辉', '金子涵', '郭发军', '王子旭', '谢淑兰', '阮子扬', '谢时', '梁微子'

4. 金基勇是{}上级吗?  
fault_list: '王友明', '张子荣', '邹子轩', '贺子芮', '王子涵', '杨子亮', '张信', '曹信福', '谢子勃', '潘子龙', '侯明远', '安明明', '郑子阳', '纪子寒', '刘子权', '蒲圆圆', '赵子斌', '金子涵', '王子旭', '阮子扬'

5. 如果你能找到那谁的部门经理，你就牛逼了  
fault_list: ['王子涵', '张信', '曹信福', '谢子勃', '蒲圆圆', '赵子斌', '金子涵', '王子旭']

6. xxx在谁手底下干活  
fault_list: ['路有人', '张子荣', '邹子轩', '贾月圆', '贺子芮', '王子涵', '杨子亮', '王有圆', '赵士芳', '张信', '要亚刚', '曹信福', '谢子勃', '潘子龙', '郑子阳', '纪子寒', '刘子权', '蒲圆圆', '王天一', '王立亚', '赵子斌', '金基勇', '金子涵', '王子旭', '李宝有', '阮子扬', '梁微子']

7. xxx向谁报告工作？  
fault_list: ['张信', '曹信福', '谢子勃']

8. xxx所在的部门经理能找到吗  
fault_list: ['张信', '曹信福']

9. 找xxx汇报的认  
fault_list: ['张信', '曹信福', '谢子勃']

10. 谁是xxx的直属经理?  

11. '我不想找部门和职位信息，我想查崔月猛的领导是谁，你能行么?'  
深度优化

12. {}的所在部门的经理是谁？  
fault_list: ['张信', '曹信福', '谢子勃']

#### 通过部门找领导
1. xxx是谁负责的？

2. xxx经理是谁？  
fault_list: ['电子发票与税务服务事业部', '电子发票与税务服务事业部-运营部']

3. xxx的老大是谁?  
fault_list: ['职能管理组织审批-杜宇', '电子发票与税务服务事业部', 'NC电商资产产品部', 'NC智能制造实施组', '伙伴学院', 'NC智能制造研发组', '研发共享中心', '电商通', '职能管理组织审批-蔡治国', '大数据研发技术部', '职能管理组织审批-王健', '大数据解决方案创新部', '电子发票与税务服务事业部-运营部', '地产与交通公用事业部', '股份公司软件业务', '股份公司', '职能管理组织', '伙伴招募与发展部']

4. xxx的部门经理是谁?   
fault_list: ['职能管理组织审批-杜宇', '电子发票与税务服务事业部', '职能管理组织审批-蔡治国', '职能管理组织审批-王健', '电子发票与税务服务事业部-运营部', '职能管理组织']

5. xxx总经理是谁？  
fault_list: ['电子发票与税务服务事业部', '电子发票与税务服务事业部-运营部']

#### 通过人找邮箱
1. 能帮我找一下xxx的工作邮箱么？  
深度优化

2. 我想查找{}的电子邮箱？  
fault_list: ['张信', '曹信福']

3. 我要给{}发给邮件，地址是多少？  
fault_list: ['张信', '曹信福']

4. 王立芹的email是什么？  
深度优化

5. 我想发个email给xxx，你能告诉我她的地址吗？？    
fault_list: ['崔月猛']

6. 你有查找xxx电子邮箱这个查询功能吗？  
fault_list: ['李向明', '吴士中', '张信', '曹信福']

#### 找职位
1. xxx在公司是干什么的？  
深度优化

2. {}是JAVA开发工程师吗？  
深度优化

3. xxx的工作职位是什么？  

4. xxx在公司承担什么样的工作？  
深度优化

#### 找部门
1. 我想知道{}在用友的那个部门工作

2. xxx在那个组？
深度优化

3. 你的库里友xxx的部门吗？
fault_list: ['吴士中', '崔月猛']

4. 请帮我找一下{}所属的部门？

In [147]:
# Load the persisted models once and reuse them for every prediction below.
model = SemanticIntentionClassify(model_path,para)
nn_model = model._load_nn_model()
model_w2v, word_to_index = model._load_word2vec_model()
# Sanity check on a single sentence; predict() returns the intent label.
x = "你能告诉我王立芹的工作邮箱吗？"
model.predict(x,nn_model,model_w2v, word_to_index)


2018-06-22 15:36:15,860 : INFO : loading Word2Vec object from w2v_model_50_V2
2018-06-22 15:36:15,863 : INFO : loading wv recursively from w2v_model_50_V2.wv.* with mmap=None
2018-06-22 15:36:15,864 : INFO : setting ignored attribute vectors_norm to None
2018-06-22 15:36:15,865 : INFO : loading vocabulary recursively from w2v_model_50_V2.vocabulary.* with mmap=None
2018-06-22 15:36:15,866 : INFO : loading trainables recursively from w2v_model_50_V2.trainables.* with mmap=None
2018-06-22 15:36:15,868 : INFO : setting ignored attribute cum_table to None
2018-06-22 15:36:15,868 : INFO : loaded w2v_model_50_V2


'找邮箱'

In [67]:
# Load the department and person dictionaries as DataFrames.
# NOTE(review): `dptm` is not used in the cells visible here; the person file
# is addressed by a machine-specific absolute path.
dptm = pd.read_csv('usr_dict_department')
person = pd.read_csv('/home/cuiym/桌面/chat_bot/chatbot_test/chatbot_predict/src/classifier/KG_contact/templates/model/usr_dict_person')

In [176]:
# Parse the CSV once and take both columns from the same frame:
# sentence templates from column X, expected intent labels from column Y.
_raw_df = pd.read_csv('raw_data.csv')
sentence = list(_raw_df['X'])
label = list(_raw_df['Y'])

In [180]:
# Prediction statistics: for every sentence template, collect the person names
# whose filled-in sentence is classified with an intent other than the label.
total_fault = {}
people = list(person['person'])
for i, (template, y) in enumerate(zip(sentence, label)):
    print (i)
    fault_list = []
    for p in people:
        x = template.format(p)
        pred_y = model.predict(x,nn_model,model_w2v, word_to_index)
        if pred_y != y:
            fault_list.append(p)
    total_fault[i] = fault_list

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118


In [173]:
# Spot check of one sentence with the models loaded earlier.
x = '崔月猛的职位？'
model.predict(x,nn_model,model_w2v, word_to_index)

'找职位'

In [184]:
# Print, for every template index, how many names were misclassified.
for key,value in total_fault.items():
    print (key,len(value))

0 3
1 23
2 234
3 0
4 0
5 162
6 0
7 8
8 6
9 2
10 16
11 0
12 20
13 0
14 0
15 0
16 5
17 0
18 0
19 0
20 0
21 0
22 0
23 6
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 1
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 16
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 11
63 28
64 0
65 0
66 0
67 0
68 2
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 176
81 17
82 13
83 0
84 0
85 0
86 0
87 62
88 0
89 184
90 0
91 0
92 0
93 0
94 0
95 0
96 0
97 0
98 0
99 0
100 0
101 0
102 0
103 0
104 0
105 0
106 0
107 0
108 0
109 0
110 0
111 0
112 0
113 312
114 16
115 0
116 0
117 3
118 0
