In [149]:
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
import re
from sklearn.externals import joblib # 结构化数据以二进制形式丢出来
from sklearn.svm import SVC
import sys
import os
from tqdm import tqdm

In [152]:
"""载入数据，清理数据，做分词处理，切分训练集与测试集"""
# 正则清洗句子中的各种符号
def regular_clean(sen):
    sen = re.sub('…{2,100}', '…', sen)
    sen = re.sub(',{2,100}', ',', sen)
    sen = re.sub('，{2,100}', ',', sen)
    sen = re.sub('\.{3,100}', '...', sen)
    sen = re.sub('。{2,100}', '。', sen)
    sen = re.sub('\?{3,100}', '?', sen)
    sen = re.sub('？{2,100}', '？', sen)
    sen = re.sub('!{2,100}', '!', sen)
    sen = re.sub('！{2,100}', '!', sen)
    sen = re.sub('、{2,100}', '、', sen)
    sen = re.sub('-{2,100}', '-', sen)
    sen = re.sub('[ ]{2,100}', ' ', sen)
    return sen

# 创建停用词列表
def stopwordslist():
    stopwords_file = os.path.join('.','datas','chinesestopwords.txt')
    stopwords = [line.strip() for line in open(stopwords_file,encoding='UTF-8').readlines()]
    return stopwords

# 返回分词结果
def cutwords(sten):
    sten = regular_clean(sten)
    tokens = jieba.cut(sten,cut_all=True)
    stopwords = stopwordslist()
    stopwords.extend([' ','\t','\n'])
    return [x for x in tokens if x not in stopwords]
#     return [x for x in tokens]

def load_file_and_preprocessing():
    neg_words,pos_words = [],[]
    datafile = os.path.join('.','datas','goods_zh.tsv')
    with open(datafile,encoding="utf-8") as f:
        for line in tqdm(f):
            parts = line.split(",,")
            if parts[-1].strip() == "0":
                neg_words.append(cutwords(parts[0].strip()))
            elif parts[-1].strip() == "1":
                pos_words.append(cutwords(parts[0].strip()))
            else:
                continue
    return pos_words,neg_words

In [153]:
print("加载文件中...")
pos_words,neg_words = load_file_and_preprocessing()
tags = np.concatenate((np.ones(len(pos_words)),np.zeros(len(neg_words))))
x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_words,neg_words)),
                                                    tags,
                                                    test_size=0.2)
np.save('./datas/svm_data/x_train.npy',x_train) # x就是数据，y就是标签
np.save('./datas/svm_data/x_test.npy',x_test)
np.save('./datas/svm_data/y_train.npy',y_train)
np.save('./datas/svm_data/y_test.npy',y_test)
print("x_train, x_test, y_train, y_test保存完成!")

1it [00:00,  9.91it/s]

加载文件中...


101058it [04:45, 353.76it/s]


x_train, x_test, y_train, y_test保存完成!


In [154]:
for i in range(10):
    print("/".join(x_train[i]),y_train[i])

睡衣/收到/男朋友/朋友/喜欢/款式/简单/大方/面料/软/舒服/睡衣/他家/买 1.0
鞋子/不错/味道/不错/透气/透气性/气性/棒/晒/几张/分享 1.0
没/发/耳机/评价/耳机/客服/回应 0.0
说/懒得/退换/星星/配送/小哥/态度/服务 0.0
质量 0.0
喜欢/图片/一模一样 1.0
黑色/手机/配/白色/充电/充电器/电器/耳机/转换/绝/! 0.0
用户/未填写/填写/评价/价内/内容 1.0
不好/降价/保价/太/气人 0.0
老/顾客/这款/店家/搞活/活动/拍下/划算 1.0


In [155]:
"""对每个句子的所有词向量取均值，生成一个句子的vector"""
def build_sentence_vector(text,size,imdb_w2v):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1,size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count 
    return vec

In [156]:
"""计算词向量"""
def get_train_vecs(x,x_train,x_test):
    n_dim = 300
    # 初始化模型和词表
    imdb_w2v = Word2Vec(size=n_dim,min_count=10,seed=1)
    imdb_w2v.build_vocab(x)# 建模
    imdb_w2v.save("./datas/svm_data/w2v_model.pkl")
    # 在训练集上
    imdb_w2v.train(x_train,total_examples=imdb_w2v.corpus_count, epochs=50)
    train_vecs = np.concatenate([build_sentence_vector(z,n_dim,imdb_w2v) for z in x_train])
#     train_vecs = scale(train_vecs)
    np.save("./datas/svm_data/train_vecs.npy",train_vecs)
    print("[train_vecs.shape]:",train_vecs.shape)
    
    # 在测试集上训练
    imdb_w2v.train(x_test,total_examples=imdb_w2v.corpus_count, epochs=50)
    test_vecs = np.concatenate([build_sentence_vector(z,n_dim,imdb_w2v) for z in x_test])
#     test_vecs = scale(test_vecs)
    np.save("./datas/svm_data/test_vecs.npy",test_vecs)
    print("[test_vecs.shape]:",test_vecs.shape)
get_train_vecs(np.concatenate((pos_words, neg_words)),x_train,x_test)

  import sys


[train_vecs.shape]: (80846, 300)
[test_vecs.shape]: (20212, 300)


In [157]:
# 加载保存的文件数据
def get_data():
    train_vecs = np.load('./datas/svm_data/train_vecs.npy')
    y_train = np.load('./datas/svm_data/y_train.npy')
    test_vecs = np.load('./datas/svm_data/test_vecs.npy')
    y_test = np.load('./datas/svm_data/y_test.npy')
    return train_vecs,y_train,test_vecs,y_test

# 训练svm模型train svm model with sklearn
def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, './datas/svm_data/model.pkl')
    print(clf.score(test_vecs, y_test))
train_vecs,y_train,test_vecs,y_test = get_data()
svm_train(train_vecs,y_train,test_vecs,y_test)

[LibSVM]0.9309321195329507


In [166]:
# 预测句子
# load word2vec and smv model and use them to predict
print("use svm model to predict...")
def svm_predict(str):
    clf = joblib.load('./datas/svm_data/model.pkl')
    model = Word2Vec.load('./datas/svm_data/w2v_model.pkl')
    str_sege = cutwords(str)
    words = [x for x in str_sege]
    n_dim = 300
    words_vecs = build_sentence_vector(words, n_dim, model)
    result = clf.predict(words_vecs)
    print(result,result[0])
pre_str = "手机刚买来设个密码，结果手机出了问题，输正确的密码却打不开，过了两三个小时才恢复正常"
svm_predict(pre_str)
pre_str = "一颗星都是给多了，简直是麻烦死了！！！！！必须要激活，不然不能使用，不是试用就是购买！！！！！！！！！！！"
svm_predict(pre_str)

use svm model to predict...


  import sys


[1.] 1.0
[1.] 1.0


  import sys
