In [2]:
# 基于情感词典做情感分析
# BosonNLP是基于微博、新闻、论坛等数据来源构建的情感词典
# （应该比较适合）

from time import sleep
from tqdm import tqdm


In [3]:
# !pip install tqdm
from collections import defaultdict
import os
import re
import jieba
import codecs
import pandas as pd
from tqdm import tqdm


In [4]:
# 去除停用词
# 要注意将停用词表中的否定词或是程度副词的词典过滤掉

#生成stopword表，需要去除一些否定词和程度词汇
stopwords = set()
fr = open('stopwords2.txt','r',encoding='utf-8')
for word in fr:
	stopwords.add(word.strip())#Python strip() 方法用于移除字符串头尾指定的字符（默认为空格或换行符）或字符序列。
#读取否定词文件
not_word_file = open('否定词.txt','r+',encoding='utf-8')
not_word_list = not_word_file.readlines()
not_word_list = [w.strip() for w in not_word_list]
#读取程度副词文件
degree_file = open('程度级别词语_带程度值.txt','r+')
degree_list = degree_file.readlines()
degree_list = [item.split(',')[0] for item in degree_list]
#生成新的停用词表
with open('stopwords_emotion.txt','w',encoding='utf-8') as f:
    for word in stopwords:
        if(word not in not_word_list) and (word not in degree_list):
            f.write(word+'\n')


In [5]:
# 正式使用jieba分词并去除新的停用词

def seg_word(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for i in seg_list:
        seg_result.append(i)
    stopwords = set()
    with open('stopwords_emotion.txt','r') as fr:
        for i in fr:
            stopwords.add(i.strip())
    return list(filter(lambda x :x not in stopwords,seg_result))


In [6]:
#找出文本中的情感词、否定词和程度副词

def classify_words(word_list):
    #读取情感词典文件
    sen_file = open('BosonNLP_sentiment_score.txt','r+',encoding='utf-8')
    #获取词典文件内容
    sen_list = sen_file.readlines()
    #创建情感字典
    sen_dict = defaultdict()
    #读取词典每一行的内容，将其转换成字典对象，key为情感词，value为其对应的权重
    for i in sen_list:
        if len(i.split(' '))==2:
            sen_dict[i.split(' ')[0]] = i.split(' ')[1]

#读取否定词文件
    not_word_file = open('否定词.txt','r+',encoding='utf-8')
    not_word_list = not_word_file.readlines()
    #读取程度副词文件
    degree_file = open('程度级别词语_带程度值.txt','r+')
    degree_list = degree_file.readlines()
    degree_dict = defaultdict()
    for i in degree_list:
        degree_dict[i.split(',')[0]] = i.split(',')[1]

    sen_word = dict()
    not_word = dict()
    degree_word = dict()
    #分类
    for i in range(len(word_list)):
        word = word_list[i]
        if word in sen_dict.keys() and word not in not_word_list and word not in degree_dict.keys():
            # 找出分词结果中在情感字典中的词
            sen_word[i] = sen_dict[word]
        elif word in not_word_list and word not in degree_dict.keys():
            # 分词结果中在否定词列表中的词
            not_word[i] = -1
        elif word in degree_dict.keys():
            # 分词结果中在程度副词中的词
            degree_word[i]  = degree_dict[word]
        
    #关闭打开的文件
    sen_file.close()
    not_word_file.close()
    degree_file.close()
    #返回分类结果
    return sen_word,not_word,degree_word


In [7]:
#计算情感词的分数
def score_sentiment(sen_word,not_word,degree_word,seg_result):
    #权重初始化为1
    W = 1
    score = 0
    #情感词下标初始化
    sentiment_index = -1
    #情感词的位置下标集合
    sentiment_index_list = list(sen_word.keys())
    #遍历分词结果
    for i in range(0,len(seg_result)):
        #如果是情感词
        if i in sen_word.keys():
            #权重*情感词得分
            score += W*float(sen_word[i])
            #情感词下标加一，获取下一个情感词的位置
            sentiment_index += 1
            if sentiment_index < len(sentiment_index_list)-1:
                #判断当前的情感词与下一个情感词之间是否有程度副词或否定词
                for j in range(sentiment_index_list[sentiment_index],sentiment_index_list[sentiment_index+1]):
                    #更新权重，如果有否定词，权重取反
                    if j in not_word.keys():
                        W *= -1
                    elif j in degree_word.keys():
                        W *= float(degree_word[j])	
        #定位到下一个情感词
        if sentiment_index < len(sentiment_index_list)-1:
            i = sentiment_index_list[sentiment_index+1]
    return score


In [8]:
#计算得分
def sentiment_score(sentence): #注意和score_sentiment区分开
    #1.对文档分词
    seg_list = seg_word(sentence)
    #2.将分词结果转换成字典，找出情感词、否定词和程度副词
    sen_word,not_word,degree_word = classify_words(seg_list)
    #3.计算得分
    score = score_sentiment(sen_word,not_word,degree_word,seg_list)
    return score


In [9]:
# 设置循环，把情感得分加入到dataframe中去
df = pd.read_excel('./BiliBiliComments.xlsx')

#面对报错'int' object has no attribute 'decode'，查到的解决方式是直接强制转换成str类型
df['content'] = df['content'].astype(str)


# 数据清洗版content列
# 删除前面的“回复 @ :”
df['content'].replace(regex=True,inplace=True,to_replace=r'\回.*\:',value=r'')
# 要不要删除表情符号？
df['content'].replace(regex=True,inplace=True,to_replace=r'\[.*\]',value=r'')

df

# df['content'] = df.content.apply(sentiment_score)

# print("我今天很高兴也非常开心    ",sentiment_score("我今天很高兴也非常开心"))
# print('天灰蒙蒙的，路上有只流浪狗，旁边是破旧不堪的老房子   ',sentiment_score('天灰蒙蒙的，路上有只流浪狗，旁边是破旧不堪的老房子'))
# print('愤怒、悲伤和埋怨解决不了问题    ',sentiment_score('愤怒、悲伤和埋怨解决不了问题'))
# print('要每天都开心快乐    ',sentiment_score('要每天都开心快乐'))
# print('我不喜欢这个世界，我只喜欢你    ',sentiment_score('我不喜欢这个世界，我只喜欢你'))


Unnamed: 0.1,Unnamed: 0,root,uid,uname,usex,ulevel,rpid,like,time,content
0,0,0,3907,奔跑呀,保密,6,4720941640,4550,2021-06-13 17:36:28,全靠自来水给你们流量，那给你们的宣传经费都弄哪了？都花在制作上了啊，哦，那没事了。
1,1,4720941640,29041209,朝朝暮暮长,女,5,4722257822,220,2021-06-13 21:17:50,大概是保密局宣传的吧
2,2,4720941640,11969756,Tiger123456,保密,6,4722793998,16,2021-06-13 22:42:34,卧槽，你要笑死我
3,3,4720941640,392092219,花四怂不怂,保密,5,4726184812,2,2021-06-14 14:15:42,宣发不给力啊
4,4,4720941640,237243184,追星星的鱼yu,女,5,4728204580,75,2021-06-14 20:17:02,老艺术家是白嫖的，青年演员是兼职的，连热度都是观众贡献的（不愧是我河能省）
...,...,...,...,...,...,...,...,...,...,...
52275,126,4713408025,1152150487,河南春晚官方,保密,6,4713428860,0,2021-06-12 12:19:41,时刻准备着
52276,127,0,51045380,今天一定不修仙,保密,5,4713407767,0,2021-06-12 12:16:23,粽子哥哥大可爱了
52277,128,0,67363233,MekkyZY,女,6,4713407693,0,2021-06-12 12:16:21,期待期待！
52278,129,0,39756445,幽古剑,女,5,4713407130,0,2021-06-12 12:16:02,可爱！！！


In [23]:
# # for i in tqdm(range(1000)):  
# #      df["content_sentiment"] = df.content.apply(sentiment_score)
# df["content_sentiment"] = df.content.apply(sentiment_score)
# df


Unnamed: 0.1,Unnamed: 0,root,uid,uname,usex,ulevel,rpid,like,time,content,content_sentiment
0,0,0,3907,奔跑呀,保密,6,4720941640,4550,2021-06-13 17:36:28,全靠自来水给你们流量，那给你们的宣传经费都弄哪了？都花在制作上了啊，哦，那没事了。,-0.007265
1,1,4720941640,29041209,朝朝暮暮长,女,5,4722257822,220,2021-06-13 21:17:50,大概是保密局宣传的吧,1.234155
2,2,4720941640,11969756,Tiger123456,保密,6,4722793998,16,2021-06-13 22:42:34,卧槽，你要笑死我,-1.599903
3,3,4720941640,392092219,花四怂不怂,保密,5,4726184812,2,2021-06-14 14:15:42,宣发不给力啊,0.539706


In [10]:
df["content_sentiment"] = df.content.apply(sentiment_score)


df.head(10)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/ll/xx5jng3s4fs5dqmstx6z2_hw0000gn/T/jieba.cache
Loading model cost 0.301 seconds.
Prefix dict has been built successfully.


Unnamed: 0.1,Unnamed: 0,root,uid,uname,usex,ulevel,rpid,like,time,content,content_sentiment
0,0,0,3907,奔跑呀,保密,6,4720941640,4550,2021-06-13 17:36:28,全靠自来水给你们流量，那给你们的宣传经费都弄哪了？都花在制作上了啊，哦，那没事了。,-0.007265
1,1,4720941640,29041209,朝朝暮暮长,女,5,4722257822,220,2021-06-13 21:17:50,大概是保密局宣传的吧,1.234155
2,2,4720941640,11969756,Tiger123456,保密,6,4722793998,16,2021-06-13 22:42:34,卧槽，你要笑死我,-1.599903
3,3,4720941640,392092219,花四怂不怂,保密,5,4726184812,2,2021-06-14 14:15:42,宣发不给力啊,0.539706
4,4,4720941640,237243184,追星星的鱼yu,女,5,4728204580,75,2021-06-14 20:17:02,老艺术家是白嫖的，青年演员是兼职的，连热度都是观众贡献的（不愧是我河能省）,6.920905
5,5,4720941640,262510697,姝诺,保密,5,4728204671,10,2021-06-14 20:17:05,俺们大卫说俺们没钱,-2.135467
6,6,4720941640,31866046,我的被被呢,女,6,4728296102,10,2021-06-14 20:31:31,自从首页刷到《祈》之后，瞬间首页推荐都是端午奇了,4.059373
7,7,4720941640,501700348,肚饿真君Gold,保密,3,4729181195,1,2021-06-14 22:51:33,没钱才能有好作品？,0.830405
8,8,4720941640,5244342,y阳洋,保密,6,4729208339,30,2021-06-14 22:56:29,快给个门路，让我们给你们众筹！！！各种热搜霸屏搞起来！！不然生气了！！,-4.00334
9,9,4720941640,607075,我是散兵大人唯一的狗,女,6,4729954243,186,2021-06-15 01:22:10,绝对是保密局。我今天才被人安利2006年河南的晚会。超震撼。\n终于知道河南不缺资金的时候能...,11.681042


In [11]:
df.to_csv('sentimentscore.csv',index = False)