In [None]:
from pathlib import Path
from typing import Literal

import histcite

#### 输入文件夹路径及数据源类型

In [None]:
# Data source of the input records; the annotation limits it to
# "wos", "cssci" or "scopus".
source: Literal["wos", "cssci", "scopus"] = "wos"
# Folder that holds the input files. NOTE(review): the "..." segment looks
# like a placeholder — point this at your own dataset folder before running.
folder_path = Path("/Users/.../Downloads/dataset")
# All exports produced by this notebook go into a "result" subfolder.
output_path = folder_path.joinpath("result")

#### 读取并处理文件

In [None]:
# Create a reader for the configured folder and data source, read everything
# it provides via read_all(), and display the result.
file_reader = histcite.ReadFile(folder_path, source)
docs_df = file_reader.read_all()
docs_df

In [None]:
# Build the reference table from the document table and display it.
ref_builder = histcite.BuildRef(docs_df, source)
refs_df = ref_builder.build()
refs_df

In [None]:
# Build the citation relationship table from the document and reference
# tables, then display it.
citation_builder = histcite.BuildCitation(docs_df, refs_df, source)
citation_matrix = citation_builder.build()
citation_matrix

#### 导出描述性统计数据

In [None]:
# Write the descriptive statistics workbook into the output folder.
descriptive_xlsx = output_path / "descriptive.xlsx"
cm = histcite.ComputeMetrics(docs_df, citation_matrix, source)
cm.write2excel(descriptive_xlsx)

#### 导出引文网络图文件
使用 [Graphviz 在线编辑器](http://magjac.com/graphviz-visual-editor/) 或下载到本地的 [Graphviz](https://graphviz.org/) 生成引文网络图。 

In [None]:
# Create the GraphViz helper from the document table and the citation table;
# the cells below call graph.generate_dot_file(...) on it to produce DOT text.
graph = histcite.GraphViz(docs_df, citation_matrix, source)

In [None]:
# Select the (at most) 50 documents with the highest LCS, considering only
# documents whose LCS is greater than 0, and print the resulting DOT source.
has_positive_lcs = citation_matrix["LCS"] > 0
ranked_by_lcs = citation_matrix[has_positive_lcs].sort_values("LCS", ascending=False)
node_list = ranked_by_lcs.index[:50].tolist()
graph_dot_file = graph.generate_dot_file(node_list)
print(graph_dot_file)

In [None]:
# Select every document whose LCS is at least 5 and print the DOT source.
node_list = citation_matrix.loc[citation_matrix["LCS"] >= 5].index.tolist()
graph_dot_file = graph.generate_dot_file(node_list)
print(graph_dot_file)

In [None]:
# Reference network of document number 10 (edge_type="cited"), with the
# timeline disabled.
graph_dot_file = graph.generate_dot_file(
    10,
    edge_type="cited",
    show_timeline=False,
)
print(graph_dot_file)

In [None]:
# Network of the documents citing document number 10 (edge_type="citing").
graph_dot_file = graph.generate_dot_file(
    10,
    edge_type="citing",
)
print(graph_dot_file)

In [None]:
# Citation network of document number 10, including both its references and
# the documents citing it (no edge_type is passed, so the default is used).
graph_dot_file = graph.generate_dot_file(10)
print(graph_dot_file)

In [None]:
# Export the DOT source currently held in graph_dot_file (i.e. whichever
# generate_dot_file cell ran last) to <output_path>/graph.dot.
# Create the output folder first so this cell does not depend on an earlier
# export cell having created it.
output_path.mkdir(parents=True, exist_ok=True)
# Write UTF-8 explicitly: without `encoding`, open() uses the platform locale
# encoding, which can fail on or mis-encode non-ASCII text in the DOT source,
# whereas Graphviz reads its input as UTF-8 by default.
with open(output_path / "graph.dot", "w", encoding="utf-8") as f:
    f.write(graph_dot_file)

In [None]:
# Build and display the graph node information.
# NOTE(review): presumably this describes the nodes of the most recently
# generated graph, so a generate_dot_file cell must have run first — confirm.
graph_node_info = graph.generate_graph_node_info()
graph_node_info

In [None]:
# Export the graph node information to an Excel workbook in the output
# folder, passing index=False to to_excel.
node_info_xlsx = output_path / "graph_node_info.xlsx"
graph_node_info.to_excel(node_info_xlsx, index=False)