In [None]:
import os
from collections import Counter

import pandas as pd
# Switch the working directory so the relative workbook path below
# resolves against /home/miu.
os.chdir(r"/home/miu")

# Read result.xlsx into a DataFrame and echo it as plain text.
df = pd.read_excel(io="result.xlsx")
print(df)

In [None]:
# Load the stop-word list
os.chdir(r"/home/miu/pros/acc/pc/stopwords-master")
with open('cn_stopwords.txt', mode='r', encoding='utf-8') as f:
    # Each line of the file becomes one entry; splitlines() strips the
    # line terminators.
    stopwords = f.read().splitlines()

# Append the custom stop words to the list read from the file
add_stopwords = ["年", "月", "日", "近日", "回应", "称", "表示"]
stopwords += add_stopwords

# Redefine cut
import jieba
def cut(sentence):
    """Segment ``sentence`` with jieba and keep only Chinese, non-stop-word tokens.

    Args:
        sentence: Text to segment; passed unchanged to ``jieba.lcut``.

    Returns:
        list: The tokens, in their original order, that are not contained in
        the module-level ``stopwords`` collection and whose characters all
        lie in the range U+4E00..U+9FA5.
    """
    def is_chinese(token):
        # Check every character. Comparing the whole token against the two
        # bounds is a lexicographic comparison, which in effect only tests
        # the first character (so "中abc" would pass and a token starting
        # with U+9FA5 followed by anything would be rejected).
        return bool(token) and all('\u4e00' <= ch <= '\u9fa5' for ch in token)

    return [
        token
        for token in jieba.lcut(sentence)
        if token not in stopwords and is_chinese(token)
    ]

In [None]:
## When a row has no summary, analyse its title instead
# Vectorised replacement: rows whose "内容简介" equals the placeholder
# "暂无简介" take the value of "标题" from the same row. The boolean mask is
# aligned on the index, so this does not depend on the frame having a
# default 0..n-1 index (the previous loop mixed positional slicing with
# label-based .loc lookups).
no_summary = df["内容简介"] == "暂无简介"
df.loc[no_summary, "内容简介"] = df.loc[no_summary, "标题"]
df

In [None]:
# Run cut() over every summary; the result is a Series holding one token
# list per row.
df_solved = df["内容简介"].apply(
    cut
)
print(df_solved)

In [None]:
# Word-frequency statistics
def count(line):
    """Count how often each word occurs in ``line``.

    Args:
        line: Iterable of hashable items (here: word tokens).

    Returns:
        list: ``(word, occurrences)`` tuples sorted by occurrences in
        descending order; words with equal counts keep the order in which
        they were first seen.
    """
    # Counter.most_common() performs the same stable, descending sort on the
    # counts that the hand-written dict + sorted() version did.
    return Counter(line).most_common()

# Flatten the per-row token lists into one flat list of words.
words = [token for tokens in df_solved for token in tokens]

# Print the ten most frequent words
top_ten = count(words)[:10]
print("词频最高的10个词是: ", top_ten)

In [None]:
## WordCloud
import wordcloud
import matplotlib
import jieba.analyse as analyse

# Join the filtered tokens into one space-separated document.
summary = " ".join(words)
# Map each of the top 50 extracted keywords to the weight returned by
# jieba.analyse.extract_tags.
keywords = dict(analyse.extract_tags(summary, topK = 50, withWeight = True))
print(keywords)

# Move to the parent of the stop-word directory selected earlier. The path is
# absolute so that re-running this cell does not climb one directory higher
# each time (which would change where the images below are written).
os.chdir(r"/home/miu/pros/acc/pc")

In [None]:
## Cloud1: generated directly from the joined text (no keyword weights)
cloud1 = wordcloud.WordCloud(
    font_path='/usr/share/fonts/SimHei/SimHei.ttf',
    background_color="white",
    width=600,
    height=600,
    margin=2,
    max_words=100,
).generate(summary)
# Hide the axes, show the rendered cloud, then save it as a PNG in the
# current working directory.
matplotlib.pyplot.axis("off")
matplotlib.pyplot.imshow(cloud1, interpolation="bilinear")
cloud1.to_file('./内容摘要1.png')

In [None]:
## Cloud2: generated from the keyword -> weight mapping
cloud2 = wordcloud.WordCloud(
    font_path='/usr/share/fonts/SimHei/SimHei.ttf',
    background_color="white",
    width=600,
    height=600,
    margin=2,
    max_words=100,
).generate_from_frequencies(keywords)
# Hide the axes, show the rendered cloud, then save it as a PNG in the
# current working directory.
matplotlib.pyplot.axis("off")
matplotlib.pyplot.imshow(cloud2, interpolation="bilinear")
cloud2.to_file('./内容摘要2.png')

In [None]:
from snownlp import SnowNLP
# Score every summary with SnowNLP's `sentiments` value and store it in a new
# "感情得分" column.
df["感情得分"] = df["内容简介"].apply(lambda x: SnowNLP(x).sentiments)
# A notebook only displays a cell's last expression, so the summary statistics
# are printed explicitly instead of being silently discarded.
print(df["感情得分"].describe())
# First five scores, shown as the cell output.
df["感情得分"].head(5)

In [None]:
import numpy as np
# Order the rows from the highest to the lowest sentiment score.
df_emotion = df.sort_values(by=["感情得分"], ascending=False)

# Bucket the score: above 0.7 -> "正面新闻", below 0.3 -> "负面新闻",
# everything else -> "中性新闻".
df_emotion["正负面"] = np.select(
    [df_emotion["感情得分"] > 0.7, df_emotion["感情得分"] < 0.3],
    ["正面新闻", "负面新闻"],
    default="中性新闻",
)
print(df_emotion)

In [None]:
## Initialize
# Take the first 20 keywords (dict order) and prepare a square matrix whose
# diagonal is NaN; every off-diagonal cell is filled in by the loop below.
langnet = list(keywords)[:20]
size = len(langnet)
matrix = np.empty((size, size))
matrix[np.diag_indices_from(matrix)] = np.nan

## Count
# For every unordered keyword pair, count the summaries containing both
# keywords as substrings, and mirror the count across the diagonal.
for row in range(size):
    for col in range(row + 1, size):
        both = sum(
            1
            for text in df["内容简介"]
            if langnet[row] in text and langnet[col] in text
        )
        matrix[row][col] = both
        matrix[col][row] = both

## Convert to a DataFrame labelled with the keywords on both axes
mat_df = pd.DataFrame(matrix, index=langnet, columns=langnet)
print(mat_df)

In [None]:
## Compress the matrix and sort
# Turn zero counts into NaN so that pairs which never co-occur can be dropped.
mat_df = mat_df.replace(0.0, np.nan)
mat_df_stacked = (
    mat_df.stack()  # wide matrix -> long Series indexed by (row word, column word)
    # Drop NaN explicitly rather than relying on stack()'s implicit NaN
    # handling, which differs between pandas' legacy and future_stack
    # implementations.
    .dropna()
    .reset_index()
    .rename(columns={"level_0": "词语1", "level_1": "词语2", 0: "共同出现次数"})
    .sort_values(by=["共同出现次数"], ascending=False)
)

mat_df_stacked

In [None]:
import matplotlib.pyplot as plt  # bare `import matplotlib` does not load the pyplot submodule
import networkx

## Set matplotlib parameters
plt.rcParams['font.sans-serif'] = ["SimHei"]  # presumably chosen so the Chinese node labels render
plt.rcParams['axes.unicode_minus'] = False
graph = networkx.DiGraph()  # Initialize graph

node_pre = list(mat_df_stacked["词语1"])
node_next = list(mat_df_stacked["词语2"])  # Set vertexes
weight = list(mat_df_stacked["共同出现次数"])  # Set weight

# Add at most the first 20 rows as edges. Slicing copes with tables shorter
# than 20 rows (the previous bound, min(20, 2*len(node_pre)), indexed past
# the end of the lists), and each edge now receives its own count instead of
# the whole weight list.
max_edges = 20
for source, target, edge_weight in zip(
    node_pre[:max_edges], node_next[:max_edges], weight[:max_edges]
):
    graph.add_edge(source, target, weight=edge_weight)  # Set edges

position = networkx.circular_layout(graph)
plt.figure(figsize=(10, 8))  # Adjust the graph size
networkx.draw(
    graph,
    pos=position,
    node_size=2000,
    node_color=range(len(graph)),
    cmap=plt.cm.Pastel1,
    font_size=14,
    edge_color='grey',
    with_labels=True,
)