In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.word2vec import LineSentence
import logging
from utils.utils import * # 自己的工具类

## 加载Vocab

In [2]:
merger_data_path = 'data/merged_train_test_seg_data.csv'
merged_df = pd.read_csv(merger_data_path, header=None)
print('词典数量：{}'.format(len(merged_df)))
merged_df.head()

词典数量：102871


Unnamed: 0,0
0,方向机 重 助力泵 方向机 都 换 新 都 换 助力泵 方向机 换 方向机 带 助力 重 这...
1,奔驰 ML500 排气凸轮轴 调节 错误 有没有 电脑 检测 故障 代码 有发 一下 发动机...
2,2010 款 宝马 X12011 年 出厂 20 排量 通用 6L45 变速箱 原地 换挡 ...
3,30V6 发动机号 位置 照片 最好 右侧 排气管 上方 缸体 上 靠近 变速箱 是不是 号...
4,2012 款 奔驰 c180 维修保养 动力 值得 拥有 家庭 用车 入手 维修保养 费用 ...


In [3]:
# 生成gensim训练需要的list，并保存
w2v_data = merged_df[0].apply(lambda x : x.split(' '))
w2v_data = w2v_data.to_list()
print('词典数量：{}'.format(len(w2v_data)))
save_file(w2v_data, 'output/w2v_data')

词典数量：102871


## 2. 训练模型

In [4]:
# 引入日志配置
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

line_gen = LineSentence(merger_data_path) # generator，节省内存
model = Word2Vec(line_gen, size=300, window=5, min_count=5, workers=8)
model.save("word2vec.model")

2019-11-23 18:33:10,295 : INFO : collecting all words and their counts
2019-11-23 18:33:10,297 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-23 18:33:10,546 : INFO : PROGRESS: at sentence #10000, processed 932051 words, keeping 37237 word types
2019-11-23 18:33:10,799 : INFO : PROGRESS: at sentence #20000, processed 1877470 words, keeping 54830 word types
2019-11-23 18:33:11,042 : INFO : PROGRESS: at sentence #30000, processed 2811696 words, keeping 67964 word types
2019-11-23 18:33:11,276 : INFO : PROGRESS: at sentence #40000, processed 3718866 words, keeping 79130 word types
2019-11-23 18:33:11,525 : INFO : PROGRESS: at sentence #50000, processed 4686367 words, keeping 89249 word types
2019-11-23 18:33:11,790 : INFO : PROGRESS: at sentence #60000, processed 5713987 words, keeping 99387 word types
2019-11-23 18:33:12,061 : INFO : PROGRESS: at sentence #70000, processed 6764560 words, keeping 109127 word types
2019-11-23 18:33:12,313 : INFO : PROGRE

In [18]:
model = Word2Vec.load('word2vec.model')
model.wv.most_similar('本田')

2019-11-23 19:25:00,699 : INFO : loading Word2Vec object from word2vec.model
2019-11-23 19:25:01,182 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2019-11-23 19:25:01,182 : INFO : setting ignored attribute vectors_norm to None
2019-11-23 19:25:01,183 : INFO : loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
2019-11-23 19:25:01,184 : INFO : loading trainables recursively from word2vec.model.trainables.* with mmap=None
2019-11-23 19:25:01,184 : INFO : setting ignored attribute cum_table to None
2019-11-23 19:25:01,185 : INFO : loaded word2vec.model
2019-11-23 19:25:01,249 : INFO : precomputing L2-norms of word weight vectors


[('凌派', 0.7981020212173462),
 ('crv', 0.7812198400497437),
 ('缤智', 0.7617830038070679),
 ('思域', 0.756554901599884),
 ('十代', 0.7484124898910522),
 ('东风本田', 0.7482534646987915),
 ('冠道', 0.7459962964057922),
 ('广汽', 0.7427908182144165),
 ('飞度', 0.7418052554130554),
 ('CRV', 0.7385894060134888)]

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

line_gen = LineSentence(merger_data_path) # generator，节省内存
model2 = FastText(line_gen, size=300, min_count=5, workers=8)
model2.save('fasttext.model')

2019-11-23 18:33:57,937 : INFO : resetting layer weights
2019-11-23 18:34:28,511 : INFO : collecting all words and their counts
2019-11-23 18:34:28,591 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-23 18:34:28,916 : INFO : PROGRESS: at sentence #10000, processed 932051 words, keeping 37237 word types
2019-11-23 18:34:29,211 : INFO : PROGRESS: at sentence #20000, processed 1877470 words, keeping 54830 word types
2019-11-23 18:34:29,486 : INFO : PROGRESS: at sentence #30000, processed 2811696 words, keeping 67964 word types
2019-11-23 18:34:29,734 : INFO : PROGRESS: at sentence #40000, processed 3718866 words, keeping 79130 word types
2019-11-23 18:34:29,998 : INFO : PROGRESS: at sentence #50000, processed 4686367 words, keeping 89249 word types
2019-11-23 18:34:30,288 : INFO : PROGRESS: at sentence #60000, processed 5713987 words, keeping 99387 word types
2019-11-23 18:34:30,575 : INFO : PROGRESS: at sentence #70000, processed 6764560 words, keeping 

## 3.构建vocab的Embedding Matrix

In [17]:
vocab = load_file('output/id2word.pkl')
type(vocab)

dict

In [22]:
embedding_matrix = np.zeros((len(vocab), model.wv.vector_size))
embedding_matrix.shape

(29533, 300)

In [24]:
vocab = {i-1:x for i,x in vocab.items()}
vocab

{0: '<UNK>',
 1: '<PAD>',
 2: '<EOS>',
 3: '<BOS>',
 4: '问题',
 5: '不',
 6: '没有',
 7: '好',
 8: '都',
 9: '需要',
 10: '故障',
 11: '一下',
 12: '检查',
 13: '发动机',
 14: '换',
 15: '更换',
 16: '会',
 17: '正常',
 18: '车',
 19: '情况',
 20: '没',
 21: '说',
 22: '机油',
 23: '去',
 24: '谢谢',
 25: '看',
 26: '电脑',
 27: '这种',
 28: '现在',
 29: '建议',
 30: '不会',
 31: '可能',
 32: '声音',
 33: '应该',
 34: '变速箱',
 35: '清洗',
 36: '有没有',
 37: '下',
 38: '出现',
 39: '检查一下',
 40: '上',
 41: '是否',
 42: '后',
 43: '是不是',
 44: '节气门',
 45: '影响',
 46: '不是',
 47: '启动',
 48: '还',
 49: '车子',
 50: '看看',
 51: '跑',
 52: '码',
 53: '公里',
 54: '左右',
 55: '很',
 56: '大',
 57: '做',
 58: '车辆',
 59: '火花塞',
 60: '油',
 61: '师傅',
 62: '再',
 63: '请问',
 64: '位置',
 65: '传感器',
 66: '原因',
 67: '开',
 68: '年',
 69: '维修',
 70: '保养',
 71: '比较',
 72: '感觉',
 73: '行驶',
 74: '知道',
 75: '线路',
 76: '才',
 77: '里面',
 78: '时',
 79: '导致',
 80: '先',
 81: '轮胎',
 82: '4s店',
 83: '不好',
 84: '客气',
 85: '坏',
 86: '怠速',
 87: '拆',
 88: '款',
 89: '修理厂',
 90: '不能',
 91: '使用',
 92:

In [None]:
for i,w in vocab.items():
    embedding_matrix[i] = 