In [1]:
'''
计算情感得分，基于kansei情感词。调整计算规则，防止情感得分过大。
'''
import jieba
import pandas as pd

sentences = pd.read_csv('../data/sentence_sentiment_2.csv', sep='\t')
sentences_zero = sentences[sentences['sentiment'] == 0]
sentences_not_zero = sentences[sentences['sentiment'] != 0]
sentences_zero.shape, sentences_not_zero.shape

((957357, 2), (439194, 2))

In [2]:
# 读取文件，文件读取函数
def read_file(filename):
    # with open(filename, 'rb')as f:
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
        # 返回list类型数据
        text = text.split('\n')
    return text


# 读取所需文件
most = read_file("../data/lexicons/most.txt")
very = read_file("../data/lexicons/very.txt")
more = read_file("../data/lexicons/more.txt")
ish = read_file("../data/lexicons/ish.txt")
insufficiently = read_file("../data/lexicons/insufficiently.txt")
inverse = read_file("../data/lexicons/inverse.txt")

# 读取停用词表
stop_words = read_file(r"../data/baidu_stopwords.txt")
print('origin stop length: ' + str(len(stop_words)))

# 去掉停用词中的情感词
# 情感词与停用词有重合导致一些文本分数为0
stop_df = pd.DataFrame(stop_words)
senti_df = pd.read_excel('../data/8_BosonNLP_sentiment_lexicon.xlsx')
stop_df.columns = ['word']
duplicated = pd.merge(stop_df, senti_df, on='word')['word'].tolist()
stop_words = list(filter(lambda x: x not in duplicated, stop_words))
print('remove sentiment stop length: ' + str(len(stop_words)))

# 去掉停用词中的程度词
# 合并程度词
degree_word = most + very + more + ish + insufficiently + inverse
stop_words = list(filter(lambda x: x not in degree_word, stop_words))
print('remove degree stop length: ' + str(len(stop_words)))


# 读取情感词及分数
def get_senti_word():
    sentiment_dict = senti_df.set_index(keys='word')['sentiment'].to_dict()
    return sentiment_dict


# 去停用词函数
def del_stopwords(words):
    # 去除停用词后的句子
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words


# 获取六种权值的词，根据要求返回list，这个函数是为了配合Django的views下的函数使用
def weighted_value(request):
    result_dict = []
    if request == "most":
        result_dict = most
    elif request == "very":
        result_dict = very
    elif request == "more":
        result_dict = more
    elif request == "ish":
        result_dict = ish
    elif request == "insufficiently":
        result_dict = insufficiently
    elif request == "inverse":
        result_dict = inverse
    elif request == 'senti':
        result_dict = get_senti_word()
    # elif request == 'pos_dict':
    #     result_dict = get_senti_word(polar='pos')
    # elif request == 'neg_dict':
    #     result_dict = get_senti_word(polar='neg')
    else:
        pass
    return result_dict


print("reading sentiment dict .......")
# 读取情感词典
senti_dict = weighted_value('senti')

# 读取程度副词词典
# 权值为2
most_dict = weighted_value('most')
# 权值为1.75
very_dict = weighted_value('very')
# 权值为1.50
more_dict = weighted_value('more')
# 权值为1.25
ish_dict = weighted_value('ish')
# 权值为0.25
insufficient_dict = weighted_value('insufficiently')
# 权值为-1
inverse_dict = weighted_value('inverse')


# 程度副词处理，对不同的程度副词给予不同的权重
def match_adverb(word, sentiment_value):
    # 最高级权重为
    if word in most_dict:
        sentiment_value *= 2
    # 比较级权重
    elif word in very_dict:
        sentiment_value *= 1.75
    # 比较级权重
    elif word in more_dict:
        sentiment_value *= 1.5
    # 轻微程度词权重
    elif word in ish_dict:
        sentiment_value *= 1.25
    # 相对程度词权重
    elif word in insufficient_dict:
        sentiment_value *= 0.25
    # 否定词权重
    elif word in inverse_dict:
        sentiment_value *= -1
    else:
        sentiment_value *= 1
    return sentiment_value


# 每个句子打分
def single_sentiment_score(sent):
    if pd.isna(sent):
        return -2
    # 预处理
    words = list(jieba.cut(sent))
    seg_words = del_stopwords(words)
    senti_pos = []
    score = []
    # 记录情感词位置
    for i, word in enumerate(seg_words):
        if word in senti_dict.keys():
            senti_pos.append(i)

    # 计算情感分数
    for i in range(len(senti_pos)):
        pos = senti_pos[i]
        senti_word = seg_words[pos]
        word_score = senti_dict.get(senti_word)
        # 每个情感词的程度词范围为此情感词与上个情感词之间
        if i == 0:
            last_pos = 0
        else:
            last_pos = senti_pos[i - 1]

        # 程度词范围
        degree_range = seg_words[last_pos + 1: pos]
        # 对程度词范围去重，出现多个相同程度词时只计算一次
        degree_range = set(degree_range)
        for w in degree_range:
            word_score = match_adverb(w, word_score)
        score.append(word_score)

    sentiment_score = sum(score)
    return sentiment_score

origin stop length: 1395
remove sentiment stop length: 243
remove degree stop length: 243
reading sentiment dict .......


In [3]:
from tqdm import tqdm

tqdm.pandas()
sentences_zero['sentiment'] = sentences_zero['sentence'].progress_apply(single_sentiment_score)
sentences_zero[sentences_zero['sentiment'] == 0].shape

  0%|          | 0/957357 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\62774\AppData\Local\Temp\jieba.cache
Loading model cost 0.747 seconds.
Prefix dict has been built successfully.
100%|██████████| 957357/957357 [00:54<00:00, 17703.19it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences_zero['sentiment'] = sentences_zero['sentence'].progress_apply(single_sentiment_score)


(31203, 2)

In [4]:
boson_zero = sentences_zero[sentences_zero['sentiment'] == 0]
boson_not_zero = sentences_zero[sentences_zero['sentiment'] != 0]
boson_zero.shape, boson_not_zero.shape

((31203, 2), (926154, 2))

In [5]:
sentences_not_zero['dict'] = 'kansei'
boson_not_zero['dict'] = 'boson'
sentence_score = pd.concat([sentences_not_zero, boson_not_zero])
sentence_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentences_not_zero['dict'] = 'kansei'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boson_not_zero['dict'] = 'boson'


Unnamed: 0,sentence,sentiment,dict
0,很不错,1.000000,kansei
3,很方便,0.913670,kansei
4,交通很方便,1.598922,kansei
5,房间很干净,1.750000,kansei
6,虽然房间面积普遍都不大,-0.818779,kansei
...,...,...,...
1396543,适合孩子,0.233304,boson
1396544,就是房间漏水,-0.264336,boson
1396545,没有热水,-0.113943,boson
1396546,巴适的板,0.450991,boson


In [6]:
sentence_score.to_csv('../data/sentiment_score_2.csv', sep='\t', index=False)
boson_zero.to_csv('../data/sentiment_score_zero_2.csv', sep='\t', index=False)