In [20]:
'''
计算情感得分，基于kansei情感词。
'''
import jieba
import pandas as pd

sentences = pd.read_csv('../data/only_sentence.txt', sep='\t')
sentences

Unnamed: 0,sentence
0,很不错
1,服务很好
2,很好很方便
3,交通很方便
4,房间很干净
...,...
1304779,没有热水
1304780,巴适的板
1304781,非常干净整洁
1304782,离三里屯近位置不错


In [1]:
# 读取文件，文件读取函数
def read_file(filename):
    # with open(filename, 'rb')as f:
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
        # 返回list类型数据
        text = text.split('\n')
    return text


# 读取所需文件
# 读取停用词表
stop_words = read_file(r"../data/baidu_stopwords.txt")
# 去掉停用词中的情感词
# 情感词与停用词有重合导致一些文本分数为0
stop_df = pd.DataFrame(stop_words)
senti_df = pd.read_excel('../data/5_Kansei_word_sentiment_lexicon.xlsx')
stop_df.columns = ['word']
duplicated = pd.merge(stop_df, senti_df, on='word')['word'].tolist()
stop_words = list(filter(lambda x: x not in duplicated, stop_words))

most = read_file("../data/lexicons/most.txt")
very = read_file("../data/lexicons/very.txt")
more = read_file("../data/lexicons/more.txt")
ish = read_file("../data/lexicons/ish.txt")
insufficiently = read_file("../data/lexicons/insufficiently.txt")
inverse = read_file("../data/lexicons/inverse.txt")


# 读取情感词及分数
def get_senti_word(polar):
    """
    读取情感词，Boson或Kansei
    :param polar: pos or neg
    :return: {sentiment word: score}
    """

    if polar == 'pos':
        pos_senti = senti_df[senti_df['sentiment'] > 0]
        senti_dict = pos_senti.set_index(keys='word')['sentiment'].to_dict()
        return senti_dict
    elif polar == 'neg':
        neg_senti = senti_df[senti_df['sentiment'] < 0]
        senti_dict = neg_senti.set_index(keys='word')['sentiment'].to_dict()
        return senti_dict


# 去停用词函数
def del_stopwords(words):
    # 去除停用词后的句子
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words


# 获取六种权值的词，根据要求返回list，这个函数是为了配合Django的views下的函数使用
def weighted_value(request):
    result_dict = []
    if request == "most":
        result_dict = most
    elif request == "very":
        result_dict = very
    elif request == "more":
        result_dict = more
    elif request == "ish":
        result_dict = ish
    elif request == "insufficiently":
        result_dict = insufficiently
    elif request == "inverse":
        result_dict = inverse
    elif request == 'pos_dict':
        result_dict = get_senti_word(polar='pos')
    elif request == 'neg_dict':
        result_dict = get_senti_word(polar='neg')
    else:
        pass
    return result_dict


print("reading sentiment dict .......")
# 读取情感词典
pos_dict = weighted_value('pos_dict')
neg_dict = weighted_value('neg_dict')
# 读取程度副词词典
# 权值为2
most_dict = weighted_value('most')
# 权值为1.75
very_dict = weighted_value('very')
# 权值为1.50
more_dict = weighted_value('more')
# 权值为1.25
ish_dict = weighted_value('ish')
# 权值为0.25
insufficient_dict = weighted_value('insufficiently')
# 权值为-1
inverse_dict = weighted_value('inverse')


# 程度副词处理，对不同的程度副词给予不同的权重
def match_adverb(word, sentiment_value):
    # 最高级权重为
    if word in most_dict:
        sentiment_value *= 8
    # 比较级权重
    elif word in very_dict:
        sentiment_value *= 6
    # 比较级权重
    elif word in more_dict:
        sentiment_value *= 4
    # 轻微程度词权重
    elif word in ish_dict:
        sentiment_value *= 2
    # 相对程度词权重
    elif word in insufficient_dict:
        sentiment_value *= 0.5
    # 否定词权重
    elif word in inverse_dict:
        sentiment_value *= -1
    else:
        sentiment_value *= 1
    return sentiment_value


# 对每一条微博打分
def single_sentiment_score(sent):
    if pd.isna(sent):
        return -2
    # 分词
    words = list(jieba.cut(sent))
    seg_words = del_stopwords(words)
    # i，s 记录情感词和程度词出现的位置
    i = 0  # 记录扫描到的词位置
    s = 0  # 记录情感词的位置
    pos_score = []  # 记录正向情感分数
    neg_score = []  # 记录负向情感分数

    # 逐个查找情感词
    for word in seg_words:
        # 如果为积极词汇
        if word in pos_dict.keys():
            pos_word_score = pos_dict.get(word)
            # 在情感词前面寻找程度副词
            for w in seg_words[s:i]:
                pos_word_score = match_adverb(w, pos_word_score)
            pos_score.append(pos_word_score)
            s = i + 1  # 记录情感词位置
            # 如果是消极情感词
        elif word in neg_dict.keys():
            neg_word_score = neg_dict.get(word)
            for w in seg_words[s:i]:
                neg_word_score = match_adverb(w, neg_word_score)
            neg_score.append(neg_word_score)
            s = i + 1
        i += 1  # 定位情感词的位置
    # 计算情感值
    sentiment_score = sum(pos_score) + sum(neg_score)

    return sentiment_score

NameError: name 'pd' is not defined

In [26]:
sentences['sentiment'] = sentences['sentence'].apply(single_sentiment_score)
sentences

Unnamed: 0,sentence,sentiment
0,很不错,3.273251
1,服务很好,0.000000
2,很好很方便,53.832072
3,交通很方便,1.495335
4,房间很干净,4.909877
...,...,...
1304779,没有热水,0.000000
1304780,巴适的板,0.000000
1304781,非常干净整洁,1.473783
1304782,离三里屯近位置不错,1.002088


In [27]:
zeros = sentences[sentences['sentiment'] == 0]
zeros['word'] = zeros['sentence'].apply(lambda x: list(jieba.cut(x)))
zeros

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zeros['word'] = zeros['sentence'].apply(lambda x: list(jieba.cut(x)))


Unnamed: 0,sentence,sentiment,word
1,服务很好,0.0,"[服务, 很, 好]"
6,但是布局合理,0.0,"[但是, 布局合理]"
7,没有太拥挤的感觉,0.0,"[没有, 太, 拥挤, 的, 感觉]"
9,很喜欢这个花洒,0.0,"[很, 喜欢, 这个, 花洒]"
11,必须五星好评,0.0,"[必须, 五星, 好评]"
...,...,...,...
1304777,适合孩子,0.0,"[适合, 孩子]"
1304778,就是房间漏水,0.0,"[就是, 房间, 漏水]"
1304779,没有热水,0.0,"[没有, 热水]"
1304780,巴适的板,0.0,"[巴适, 的, 板]"


In [28]:
sentences.to_csv('../data/sentence_sentiment.csv', sep='\t', index=False)

In [30]:
sentences.to_csv('../data/sentence_sentiment.txt', sep='\t', index=False)