In [15]:
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from snownlp import SnowNLP
from sklearn.externals import joblib
from snownlp import sentiment

# 生成模型，测试意见内容

In [16]:
# 导入训练数据
data = pd.read_excel('feedback.xlsx')

data = np.array(data)

# 获取分类集和数据集
classification = np.array(data[:, 0], dtype='int')
comment = np.array(data[:, 1], dtype='object')

# 对数据集（文本）进行分词
commentStack = [];
for number in range(len(comment)):
    commentWord = jieba.cut(str(comment[number]))
    commentWord = " ".join(commentWord)
    commentStack.append(commentWord)

#转化成数组，格式是["词语1 词语2 词语3"， ...] 一个元素代表一个文本
vectorizer = CountVectorizer()    # 计算文本的词频矩阵
arr = vectorizer.fit_transform(commentStack)  # 矩阵元素a[i][j] 表示j词在i个文本下的词频

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(arr) # 传入词频向量计算tf-idf

print(tfidf)

  (1, 94)	1.0
  (2, 94)	1.0
  (3, 95)	1.0
  (4, 97)	1.0
  (5, 97)	1.0
  (6, 98)	1.0
  (7, 99)	1.0
  (8, 100)	1.0
  (9, 102)	1.0
  (10, 103)	1.0
  (11, 104)	1.0
  (12, 105)	1.0
  (13, 106)	1.0
  (15, 119)	1.0
  (16, 128)	0.802818814511
  (16, 2601)	0.596223071566
  (17, 142)	1.0
  (18, 142)	1.0
  (19, 142)	1.0
  (20, 142)	1.0
  (21, 94)	0.661883286007
  (21, 147)	0.749606907455
  (22, 94)	0.681481731797
  (22, 165)	0.731835124346
  (23, 177)	1.0
  :	:
  (1844, 882)	0.214542817705
  (1844, 335)	0.230192063101
  (1844, 669)	0.247200518022
  (1844, 814)	0.237621852949
  (1844, 1280)	0.260700888193
  (1844, 250)	0.260700888193
  (1844, 850)	0.260700888193
  (1844, 2745)	0.521401776385
  (1845, 2601)	0.38031490946
  (1845, 2616)	0.394909305752
  (1845, 1681)	0.394909305752
  (1845, 1130)	0.364854227376
  (1845, 1072)	0.384827293089
  (1845, 1856)	0.512096863262
  (1846, 2502)	0.361443985539
  (1846, 648)	0.687895647228
  (1846, 726)	0.629410695684
  (1847, 1499)	0.366196875458
  (1847, 336)	

In [17]:
# 朴素贝叶斯统计

mnb = MultinomialNB()
mnb.fit(tfidf, classification)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
sentiment.train('feedback_negative.txt', 'feedback_positive.txt')
sentiment.save('sentiment.marshal')

In [19]:
test_comment = u'最满意的炒股软件'

test_comment = jieba.cut(test_comment)
test_comment = " ".join(test_comment)

test_comment = [test_comment]

test_vectorizer_wordFrequency = vectorizer.transform(test_comment)

test_tfidf = transformer.transform(test_vectorizer_wordFrequency).toarray()

predict_classification = mnb.predict(test_tfidf)
predict_classification_proba = mnb.predict_proba(test_tfidf)

print(predict_classification_proba)

snownlp = SnowNLP(str(test_comment))
snow_score = snownlp.sentiments

print(snow_score)

# 如果snow判断大于0.8或朴素贝叶斯预测值大于0.8，则选择作为分类
# snow_tag_stack = {0:0.25, 1:0.5, 2:0.75, 3:1}

naive_score = predict_classification_proba[0][predict_classification[0]]     # 朴素贝叶斯得到的最大分类概率

if (snow_score >= 0.5):
    if (snow_score >= naive_score):      # 如果情感分析得分大于朴素贝叶斯最大概率，则选择该标签
        final_score = snow_score
        if (snow_score > 0.25):
            if (snow_score > 0.5):
                if (snow_score > 0.75):
                    tag = 3              # 标记为3分满意
                else:
                    tag = 2              # 标记为中肯
            else:
                tag = 1
        else:
            tag = 0
    else:
        final_score = naive_score
        tag = predict_classification[0]
else:
    snow_score = 1-snow_score
    if (snow_score >= naive_score):      # 如果情感分析得分大于朴素贝叶斯最大概率，则选择该标签
        final_score = snow_score
        if (snow_score > 0.5):
            if (snow_score > 0.75):
                tag = 0
            else:
                tag = 1
    else:
        final_score = naive_score
        tag = predict_classification[0]
    
print(tag)

[[ 0.05073435  0.32995416  0.04215937  0.57715211]]
0.9808587058313183
3


In [68]:
joblib.dump(mnb, '10jqka_mnb')
joblib.dump(transformer, '10jqka_transformer')
joblib.dump(vectorizer, '10jqka_vectorizer')

['10jqka_vectorizer']