# 红楼梦——数据分析与可视化
#### 河南城建学院——信息管理与信息系统
#### masonsxu: masonsxu@qq.com

In [2]:
# 导入需要的包
import re
import jieba
import numpy as np
import pandas as pd

In [45]:
# Load the chapter dataset from its JSON file and preview the first rows.
dataset_path = './data/RedMansion_Data.json'
RedMansion_df = pd.read_json(dataset_path)
RedMansion_df.head()

Unnamed: 0,Chapter,LeftName,RightName,Chapter_new,ChapterName,StartIndex,EndIndex,ChapterLength,Artical,CountWord,CutWord
0,《红楼梦》第一回,甄士隐梦幻识通灵,贾雨村风尘怀闺秀,1,"甄士隐梦幻识通灵,贾雨村风尘怀闺秀",0,22,22,――此开卷第一回也。作者自云：曾历过一番梦幻之后，故将真事隐去，而借通灵说此《石头记》一书也...,6336,"[开卷, 第一, 第一回, 一回, 作者, 一番, 梦幻, 之后, 真事, 隐去, 石头, ..."
1,《红楼梦》第二回,贾夫人仙逝扬州城,冷子兴演说荣国府,2,"贾夫人仙逝扬州城,冷子兴演说荣国府",23,38,15,却说封肃听见公差传唤，忙出来陪笑启问，那些人只嚷：“快请出甄爷来。”封肃忙陪笑道：“小人姓封...,5637,"[却说, 封肃, 听见, 公差, 传唤, 陪笑, 请出, 封肃, 陪笑, 小人, 当日, 小..."
2,《红楼梦》第三回,托内兄如海荐西宾,接外孙贾母惜孤女,3,"托内兄如海荐西宾,接外孙贾母惜孤女",39,61,22,却说雨村忙回头看时，不是别人，乃是当日同僚一案参革的张如圭。他系此地人，革后家居，今打听得都...,7856,"[却说, 回头, 乃是, 当日, 同僚, 一案, 张如圭, 家居, 打听, 起复, 复旧, ..."
3,《红楼梦》第四回,薄命女偏逢薄命郎,葫芦僧判断葫芦案,4,"薄命女偏逢薄命郎,葫芦僧判断葫芦案",62,77,15,却说黛玉同姐妹们至王夫人处，见王夫人正和兄嫂处的来使计议家务，又说姨母家遭人命官司等语。因见...,5336,"[却说, 黛玉, 姐妹, 王夫人, 夫人, 王夫人, 夫人, 兄嫂, 计议, 家务, 姨母,..."
4,《红楼梦》第五回,贾宝玉神游太虚境,警幻仙曲演红楼梦,5,"贾宝玉神游太虚境,警幻仙曲演红楼梦",78,110,32,第四回中既将薛家母子在荣府中寄居等事略已表明，此回暂可不写了。如今且说林黛玉自在荣府，一来贾...,7245,"[第四, 第四回, 四回, 回中, 家母, 母子, 荣府, 寄居, 事略, 表明, 可不, ..."


### 对每一回进行可视化词云

In [46]:
# Build the word-frequency table for a single chapter, selected by its row label.
index = 1
# Chart title in the form "<chapter>:<left title>,<right title>".
name = (
    RedMansion_df.loc[index, 'Chapter'] + ':'
    + RedMansion_df.loc[index, 'LeftName'] + ','
    + RedMansion_df.loc[index, 'RightName']
)
words = RedMansion_df.loc[index, 'CutWord']

# Count word frequencies, most frequent first.
# size() counts the rows of each group directly, so no aggregation callable
# or follow-up column rename is needed; reset_index(name=...) names the count column.
CountWord_df = pd.DataFrame({'Word': words})
CountWord_stat = (
    CountWord_df.groupby('Word')
    .size()
    .reset_index(name='number')
    .sort_values(by='number', ascending=False)
)
# Length of each word in characters.
CountWord_stat['wordlen'] = CountWord_stat['Word'].str.len()
CountWord_stat.head()

Unnamed: 0,Word,number,wordlen
4,一个,14,2
428,女儿,11,2
827,老爷,11,2
438,如今,10,2
99,两个,10,2


In [5]:
from pyecharts.globals import ThemeType
import pyecharts.options as opts

from pyecharts.charts import WordCloud

# Word cloud for the chapter whose frequency table is currently in CountWord_stat.
# NOTE(review): CountWord_stat and name are not defined in this cell; they are
# expected to come from the word-frequency cell above.
word_count_pairs = [
    [word, count]
    for word, count in zip(CountWord_stat.Word, CountWord_stat.number)
]

wordcloud = WordCloud(init_opts=opts.InitOpts(theme=ThemeType.DARK))
wordcloud.add("", word_count_pairs)
wordcloud.set_global_opts(title_opts=opts.TitleOpts(title=name))
wordcloud.render_notebook()

### 绘制所有章节（回）的词云图

In [44]:
from pyecharts.charts import Tab

tab_wordcloud = Tab(page_title='红楼梦所有章节词云图')

# NOTE(review): the page title says "all chapters", but only the first
# n_chapters rows are rendered; raise it (e.g. to len(RedMansion_df)) to cover
# the whole book.
n_chapters = 3
# Slicing the frame's own index avoids a KeyError when it holds fewer rows.
for index in RedMansion_df.index[:n_chapters]:
    # Tab label in the form "<chapter>:<left title>,<right title>".
    name = (
        RedMansion_df.loc[index, 'Chapter'] + ':'
        + RedMansion_df.loc[index, 'LeftName'] + ','
        + RedMansion_df.loc[index, 'RightName']
    )
    words = RedMansion_df.loc[index, 'CutWord']
    # Count word frequencies, most frequent first.
    CountWord_df = pd.DataFrame({'Word': words})
    CountWord_stat = (
        CountWord_df.groupby('Word')
        .size()
        .reset_index(name='number')
        .sort_values(by='number', ascending=False)
    )
    # Draw this chapter's word cloud from [word, count] pairs.
    chapter_wordcloud = (
        WordCloud(init_opts=opts.InitOpts(theme=ThemeType.DARK))
        .add("", [list(z) for z in zip(CountWord_stat.Word, CountWord_stat.number)])
        .set_global_opts(title_opts=opts.TitleOpts(title=name, pos_left='center'))
    )
    # Register the chart as one tab of the combined page.
    tab_wordcloud.add(chapter_wordcloud, name)
# Write the standalone HTML report, then show the tabs inline.
tab_wordcloud.render('RedMansion_Data_Ana_WordCloud.html')
tab_wordcloud.render_notebook()

## 分析每一章的人物出场次数（频率）

In [47]:
# Load the character-name list (the file has no header row; the single column
# is labelled 'name') and preview the first entries.
character_file = './data/RedMansion_Character.txt'
RedMansion_Character = pd.read_csv(character_file, names=['name'], header=None)
RedMansion_Character.head()

Unnamed: 0,name
0,艾官
1,安国公
2,白老媳妇
3,白老媳妇儿
4,白玉钏


In [48]:
# Count how often each known character name occurs in a single chapter,
# selected by its row label.
index = 1
# Chart title in the form "<chapter>:<left title>,<right title>".
name = (
    RedMansion_df.loc[index, 'Chapter'] + ':'
    + RedMansion_df.loc[index, 'LeftName'] + ','
    + RedMansion_df.loc[index, 'RightName']
)
words = RedMansion_df.loc[index, 'CutWord']

# Count word frequencies, most frequent first.
# size() counts the rows of each group directly, so no aggregation callable
# or follow-up column rename is needed; reset_index(name=...) names the count column.
CountWord_df = pd.DataFrame({'Word': words})
CountWord_stat = (
    CountWord_df.groupby('Word')
    .size()
    .reset_index(name='number')
    .sort_values(by='number', ascending=False)
)

# Keep only the words that appear in the character list, renumbering the rows.
is_character = CountWord_stat['Word'].isin(RedMansion_Character['name'])
Character_name = CountWord_stat.loc[is_character].reset_index(drop=True)
Character_name.head()

Unnamed: 0,Word,number
0,封肃,7
1,娇杏,4
2,甄家娘子,4
3,林如海,3
4,贾赦,2


### 绘制柱状图

In [30]:
from pyecharts.charts import Bar

# Bar chart of character-name frequencies.
# NOTE(review): Character_name and name are not defined in this cell; they are
# expected to come from the character-frequency cell above.
character_labels = Character_name.Word.tolist()
character_counts = list(map(int, Character_name.number))

bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
bar.add_xaxis(character_labels)
bar.add_yaxis("频数", character_counts, category_gap="50%")
# Hide the per-bar value labels.
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title=name, pos_left='center'),
    xaxis_opts=opts.AxisOpts(
        name='人名',
        # Rotate the labels and show every one of them (interval=0).
        axislabel_opts=opts.LabelOpts(rotate=30, interval=0),
    ),
    yaxis_opts=opts.AxisOpts(name='频数', name_location='middle'),
    # Two zoom controls: the default one plus an 'inside' one.
    datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_='inside')],
    legend_opts=opts.LegendOpts(pos_left='right'),
)
bar.render_notebook()

In [33]:
tab_bar = Tab(page_title='红楼梦所有章节柱状图')

# NOTE(review): the page title says "all chapters", but only the first
# n_chapters rows are rendered; raise it (e.g. to len(RedMansion_df)) to cover
# the whole book.
n_chapters = 3
# Slicing the frame's own index avoids a KeyError when it holds fewer rows.
for index in RedMansion_df.index[:n_chapters]:
    # Tab label in the form "<chapter>:<left title>,<right title>".
    name = (
        RedMansion_df.loc[index, 'Chapter'] + ':'
        + RedMansion_df.loc[index, 'LeftName'] + ','
        + RedMansion_df.loc[index, 'RightName']
    )
    words = RedMansion_df.loc[index, 'CutWord']
    # Count word frequencies, most frequent first.
    CountWord_df = pd.DataFrame({'Word': words})
    CountWord_stat = (
        CountWord_df.groupby('Word')
        .size()
        .reset_index(name='number')
        .sort_values(by='number', ascending=False)
    )
    # Keep only the words that appear in the character list.
    is_character = CountWord_stat['Word'].isin(RedMansion_Character['name'])
    Character_name = CountWord_stat.loc[is_character].reset_index(drop=True)
    # Draw this chapter's bar chart of character-name frequencies.
    chapter_bar = (
        Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
        .add_xaxis(Character_name.Word.tolist())
        .add_yaxis("频数", [int(x) for x in Character_name.number], category_gap="50%")
        # Hide the per-bar value labels.
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(title=name, pos_left='center'),
            xaxis_opts=opts.AxisOpts(
                name='人名',
                # Rotate the labels and show every one of them (interval=0).
                axislabel_opts=opts.LabelOpts(rotate=30, interval=0),
            ),
            yaxis_opts=opts.AxisOpts(name='频数', name_location='middle'),
            # Two zoom controls: the default one plus an 'inside' one.
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_='inside')],
            legend_opts=opts.LegendOpts(pos_left='right'),
        )
    )
    # Register the chart as one tab of the combined page.
    tab_bar.add(chapter_bar, name)
# Write the standalone HTML report, then show the tabs inline.
tab_bar.render('RedMansion_Data_Ana_Bar.html')
tab_bar.render_notebook()