In [1]:
import sys
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.word2vec import LineSentence
import utils.config as config
from utils.multi_proc_utils import parallelize
from utils.wv_loader import get_vocab
from utils.pickle_io import *
import re
import jieba
jieba.load_userdict(config.user_dict)
import logging

Building prefix dict from the default dictionary ...
2019-12-29 01:42:30,042 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w3/yc8mtbd91vs80rp79zfgk8x00000gn/T/jieba.cache
2019-12-29 01:42:30,044 : DEBUG : Loading model from cache /var/folders/w3/yc8mtbd91vs80rp79zfgk8x00000gn/T/jieba.cache
Loading model cost 0.647 seconds.
2019-12-29 01:42:30,690 : DEBUG : Loading model cost 0.647 seconds.
Prefix dict has been built succesfully.
2019-12-29 01:42:30,691 : DEBUG : Prefix dict has been built succesfully.


In [2]:
def seq2seq_pre(sentence):
    '''
    Replace speaker tags, media placeholders and URLs with TOKEN markers,
    strip '|' characters, then segment the text with jieba.

    :param sentence: raw text of one field (str)
    :return: the jieba tokens joined by single spaces
    '''
    # Ordered (pattern, replacement) pairs. The patterns must be raw strings:
    # in a plain literal '\b' is the backspace character rather than a regex
    # word boundary, which would stop the URL rule from matching normal text.
    # Replacements are padded with spaces, otherwise the marker would fuse
    # with the neighbouring characters.
    substitutions = (
        (r'车主说', ' TOKEN1 '),
        (r'技师说', ' TOKEN2 '),
        (r'\[图片\]', ' TOKEN3 '),
        (r'\[语音\]', ' TOKEN4 '),
        # Optional http/https scheme, then '://' and a run of URL characters
        # that must end on a word boundary.
        (r'(?:https?)?://[\w./?=&%#-]*\b', ' TOKEN5 '),
        (r'\|', ''),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    # The user dictionary is loaded into jieba when the notebook imports it,
    # so jieba can be called directly here.
    words = jieba.cut(sentence)
    return ' '.join(words)

def process_seq2seq(df):
    '''
    Batch seq2seq preprocessing: run seq2seq_pre over every text column.

    'Question' and 'Dialogue' are always processed. 'Report' is processed
    only when the frame has that column (per the original note, that is the
    training set). The columns are overwritten on the frame passed in.

    :param df: dataset to process
    :return: the same frame with its text columns preprocessed
    '''
    text_columns = ['Question', 'Dialogue']
    if 'Report' in df.columns:
        text_columns.append('Report')

    for column in text_columns:
        df[column] = df[column].apply(seq2seq_pre)
    return df

def mark_proc(sentence, max_len, vocab, update=False):
    '''
    Truncate a space-separated sentence and, unless update is set, add the
    special markers <START>, <STOP>, <PAD> and <UNK>.

    :param sentence: text whose tokens are separated by spaces
    :param max_len: maximum number of tokens kept from the sentence
    :param vocab: container supporting `in`; tokens not in it become <UNK>
    :param update: when True, only clean and truncate (vocab is not consulted)
    :return: the resulting tokens joined by single spaces
    '''
    # Split on spaces, drop the empty strings produced by repeated spaces,
    # and keep at most max_len tokens.
    tokens = [tok for tok in sentence.strip().split(' ') if tok][:max_len]

    if update:
        return ' '.join(tokens)

    marked = ['<START>']
    # Out-of-vocabulary tokens are replaced by <UNK>.
    marked.extend(tok if tok in vocab else '<UNK>' for tok in tokens)
    marked.append('<STOP>')
    # Pad up to max_len real-token slots; <START>/<STOP> are not counted.
    marked.extend(['<PAD>'] * (max_len - len(tokens)))
    return ' '.join(marked)

def get_max_len(data):
    """
    Pick a maximum sequence length: mean length plus two standard deviations.

    Length is measured as the number of space characters plus one, so each
    space in a run of consecutive spaces adds to the count.

    :param data: texts to measure, e.g. train_df['Question']
    :return: the length threshold, truncated to an int
    """
    def _length_of(text):
        return text.count(' ') + 1

    lengths = data.apply(_length_of)
    average = np.mean(lengths)
    spread = np.std(lengths)
    return int(average + 2 * spread)

In [3]:
# Load both splits and drop every row that is missing one of the text fields
# used later in the notebook ('Report' is only required for the training split).
train_df = pd.read_csv(config.train_data_path).dropna(
    subset=['Question', 'Dialogue', 'Report'], how='any')
test_df = pd.read_csv(config.test_data_path).dropna(
    subset=['Question', 'Dialogue'], how='any')

In [4]:
# Run process_seq2seq over each split through the project's parallelize helper
# (presumably it fans the frame out across worker processes - confirm in
# utils.multi_proc_utils). The training split is processed first, then the test split.
train_df, test_df = (
    parallelize(split, process_seq2seq) for split in (train_df, test_df)
)

In [None]:
# Build the dataset: the model input X is each row's Question followed by its
# Dialogue, separated by one space.
# Series.str.cat joins the two columns element-wise, which avoids building a
# row Series and calling a Python lambda per row as DataFrame.apply(axis=1)
# does, and also yields an empty column cleanly for an empty frame.
# Rows with a missing Question/Dialogue were dropped when the CSVs were loaded;
# a missing value here would give NaN in X rather than raising.
train_df['X'] = train_df['Question'].str.cat(train_df['Dialogue'], sep=' ')
test_df['X'] = test_df['Question'].str.cat(test_df['Dialogue'], sep=' ')