In [None]:
"""
We have already extracted the keywords; now we try to visualize
the results.

The first step is to integrate temp/keyword_clusters.json and temp/keywords_extracted.jsonl.
"""

'\nwe have already processed the extraction of the keywords, now we try to visualize\nthe results\n\nthe first step is to integrate temp/keyword_clusters.json and temp/keywords_extracted.json\n\nand then we create a graph of paper based on the references relations in keyword.ipynb\n'

In [6]:
# basic imports

import json
import networkx as nx
from ipysigma import Sigma
import networkx as nx
import networkx as nx
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict, deque
import datetime
import networkx as nx


In [7]:
# First we clean temp/keywords_extracted.jsonl using temp/keyword_clusters.json; the two files look like this:
"""
   temp/keyword_clusters.json
    "vortex cells": [
        "vortex cells",
        "vortex filament",
        "vortex filaments",
        "vortex gas"
    ],

    and
    temp/keywords_extracted.jsonl
   {"paper_id": "0302109", "title": "The C-Deformation of Gluino and Non-planar Diagrams", "keywords": ["supersymmetric gauge theories", "genus partition function", "non - planar diagrams"]}
"""
# and we save the result in a new file called temp/keywords_cleaned.json

def clean_keywords(keywords_extracted_path, keyword_clusters_path, output_path='temp/keywords_cleaned.json'):
    """Map every paper's extracted keywords onto canonical cluster names and save them.

    Args:
        keywords_extracted_path: JSON Lines file; each non-blank line is an
            object with "paper_id", "title" and "keywords" (list of strings).
        keyword_clusters_path: JSON file mapping a canonical keyword to the
            list of its synonyms.
        output_path: Destination JSON file. It receives a dict keyed by paper
            id whose values hold the title and the sorted, de-duplicated
            canonical keywords.

    Keywords are matched against the synonyms case-insensitively; a keyword
    that belongs to no cluster is kept unchanged. If a paper id occurs more
    than once, the last occurrence wins.
    """
    import json

    # Load the extracted keywords. Blank lines (e.g. a trailing empty line)
    # are skipped instead of crashing json.loads.
    with open(keywords_extracted_path, 'r', encoding='utf-8') as f:
        extracted_keywords = [json.loads(line) for line in f if line.strip()]

    # Load the keyword cluster mapping
    with open(keyword_clusters_path, 'r', encoding='utf-8') as f:
        keyword_clusters = json.load(f)

    # Build the reverse mapping: lower-cased synonym -> canonical keyword (cluster key)
    synonym_to_cluster = {}
    for cluster_name, synonyms in keyword_clusters.items():
        for synonym in synonyms:
            synonym_to_cluster[synonym.lower()] = cluster_name

    cleaned_keywords = {}

    for paper in extracted_keywords:
        # A set removes duplicates created when several raw keywords
        # collapse onto the same canonical keyword.
        standardized_keywords = {
            synonym_to_cluster.get(kw.lower(), kw) for kw in paper["keywords"]
        }

        cleaned_keywords[paper["paper_id"]] = {
            "title": paper["title"],
            "keywords": sorted(standardized_keywords)
        }

    # Save the result
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_keywords, f, indent=4)


# Run the cleaning step; the result goes to the default output_path, temp/keywords_cleaned.json
clean_keywords('temp/keywords_extracted.jsonl', 'temp/keyword_clusters.json')


In [8]:
# now we use the cleaned keywords to create a graph of paper based on the references relations in Cit-HepTh.txt
"""
# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
#
#the original data omits all the zeros in front of a node, we use the regex to correct it.
#
# Nodes: 27770 Edges: 0352807
# FromNodeId	ToNodeId
0001001	9304045
0001001	9308122
0001001	9309097
0001001	9311042
0001001	9401139
0001001	9404151
"""

def create_graph_from_citation_file(citation_file, keywords_path='temp/keywords_cleaned.json'):
    """Build the directed citation graph and attach paper metadata to its nodes.

    Args:
        citation_file: Edge list with one "FromNodeId<TAB>ToNodeId" pair per
            line. Lines starting with '#' and blank lines are ignored; every
            other line must contain exactly two tab-separated ids.
        keywords_path: JSON file mapping a paper id to
            {"title": ..., "keywords": [...]}, e.g.

                "9211012": {
                    "title": "Some Constant Solutions to Zamolodchikov's Tetrahedron Equations",
                    "keywords": ["baxter equation", "tetrahedron equations", "zamolodchikov"]
                }

    Returns:
        An nx.DiGraph with one edge per listed pair. Every node carries the
        attributes "title" and "keywords"; both are None for papers that do
        not appear in the keywords file.
    """
    G = nx.DiGraph()

    # Stream the edge list. Only comment and blank lines are skipped, so a
    # file whose first line is already data does not lose that edge.
    with open(citation_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            node1, node2 = line.split('\t')
            G.add_edge(node1, node2)

    with open(keywords_path, 'r', encoding='utf-8') as f:
        keywords_cleaned = json.load(f)

    # Attach title and keywords to every node; papers without metadata get None
    for node in G.nodes():
        if node in keywords_cleaned:
            G.nodes[node]['title'] = keywords_cleaned[node]['title']
            G.nodes[node]['keywords'] = keywords_cleaned[node]['keywords']
        else:
            G.nodes[node]['title'] = None
            G.nodes[node]['keywords'] = None

    return G

# Build the citation graph; G is the graph the later cells analyse and draw
citation_file = 'assets/Cit-HepTh.txt'
G = create_graph_from_citation_file(citation_file)

# sigma = Sigma(G)
# sigma


In [9]:

# Approximate the betweenness centrality of the graph (k=100 with a fixed
# seed, so the scores are an estimate rather than the exact values)
betweenness_approx = nx.betweenness_centrality(G, k=100, seed=42, normalized=True)


# Rank the papers by score and report the ten highest
top10 = sorted(betweenness_approx.items(), key=lambda x: -x[1])[:10]
for pid, score in top10:
    # Every pid comes from G itself, so the only "missing" case is a paper
    # without metadata, whose title attribute is None.
    title = G.nodes[pid].get('title') or 'Unknown'

    print(f'Paper ID: {pid}, Title: {title}, Betweenness Centrality: {score:.4f}')


Paper ID: 9905111, Title: Large N Field Theories, String Theory and Gravity, Betweenness Centrality: 0.1033
Paper ID: 9810008, Title: Conformal Anomaly for Dilaton Coupled Theories from AdS/CFT Correspondence, Betweenness Centrality: 0.0800
Paper ID: 0206223, Title: Constant Curvature Black Hole and Dual Field Theory, Betweenness Centrality: 0.0800
Paper ID: 9509140, Title: Non-Perturbative Green's Functions in Theories with Extended Superconformal Symmetry, Betweenness Centrality: 0.0503
Paper ID: 9607239, Title: Superconformal Ward Identities and N=2 Yang-Mills Theory, Betweenness Centrality: 0.0375
Paper ID: 9803001, Title: Macroscopic strings as heavy quarks: Large-N gauge theory and anti-de Sitter supergravity, Betweenness Centrality: 0.0374
Paper ID: 9912210, Title: Scalar Quartic Couplings in Type IIB Supergravity on $AdS_5\times S^5$, Betweenness Centrality: 0.0313
Paper ID: 9902121, Title: A Stress Tensor for Anti-de Sitter Gravity, Betweenness Centrality: 0.0197
Paper ID: 990

In [None]:
# Find the communities of the citation graph with the Louvain method.
# NOTE(review): this import sits in the middle of the notebook; consider moving it to the imports cell.
from networkx.algorithms.community import louvain_communities

# Louvain community detection (the seed is fixed, presumably so the partition is repeatable)
lc_Ghep = louvain_communities(G,resolution=0.1, seed=123)

# print the number of communities found
print(len(lc_Ghep))

158


In [21]:


def assign_domains_by_cluster(G, n_domains=15):
    """Cluster the nodes by their keywords and store a domain id and colour on each.

    Each node's keywords are joined into one text, vectorised with TF-IDF and
    grouped with KMeans into ``n_domains`` clusters. The graph is modified in
    place (attributes "domain" and "color") and also returned.
    """
    # One colour per domain; reused cyclically when there are more domains than colours
    palette = (
        "#FF6F61", "#6B5B95", "#88B04B", "#F7CAC9", "#92A8D1",
        "#955251", "#B565A7", "#009B77", "#DD4124", "#45B8AC",
        "#EFC050", "#5B5EA6", "#9B2335", "#DFCFBE", "#BC243C",
    )

    nodes = list(G.nodes())

    # One document per node; a node without keywords becomes an empty string
    documents = []
    for n in nodes:
        node_keywords = G.nodes[n].get("keywords") or []
        documents.append(" ".join(node_keywords))

    # TF-IDF vectorisation followed by KMeans clustering
    tfidf_matrix = TfidfVectorizer(max_features=5000).fit_transform(documents)
    cluster_ids = KMeans(n_clusters=n_domains, random_state=42).fit_predict(tfidf_matrix)

    # Store the domain number and its colour on every node
    for node, cluster_id in zip(nodes, cluster_ids):
        domain = int(cluster_id)
        attrs = G.nodes[node]
        attrs["domain"] = domain
        attrs["color"] = palette[domain % len(palette)]

    return G

def extract_time_from_id(paper_id):
    """Decode the (year, month) encoded in the leading ``YYMM`` digits of a paper id.

    Two-digit years >= 90 are read as 19YY, all others as 20YY.

    Args:
        paper_id: Paper id string such as "9905111".

    Returns:
        ``(year, month)`` as ints, or ``(None, None)`` when the id is not a
        string, does not start with four digits, or its month is outside 1-12.

    Note: a later cell of this notebook defines another function with the
    same name that returns a ``datetime`` instead of a tuple.
    """
    if not isinstance(paper_id, str):
        return None, None

    prefix = paper_id[:4]
    # isdecimal() accepts exactly the characters int() can parse, and rejects
    # signs or spaces that int() alone would let through (e.g. "-1").
    if len(prefix) != 4 or not prefix.isdecimal():
        return None, None

    year = int(prefix[:2])
    month = int(prefix[2:])
    if not 1 <= month <= 12:
        return None, None

    year += 1900 if year >= 90 else 2000
    return year, month

def assign_positions_by_domain_time(G):
    """Lay the nodes out in place: one column per domain, y growing with the paper's date.

    Sets the node attributes "x", "y" and "size" (size is the node degree times 10).
    """
    # Horizontal layout: one column per domain, 10 units apart
    known_domains = sorted(set(nx.get_node_attributes(G, "domain").values()))
    column_x = {}
    for column, dom in enumerate(known_domains):
        column_x[dom] = column * 10.0

    # Group the nodes by (domain, year, month); undated nodes share the (domain, 0, 0) group
    groups = defaultdict(list)
    for node in G.nodes():
        dom = G.nodes[node].get("domain", 0)
        year, month = extract_time_from_id(node)
        key = (dom, year, month) if (year and month) else (dom, 0, 0)
        groups[key].append(node)

    # Place each group on its column at the height given by its date
    for (dom, year, month), members in groups.items():
        base_x = column_x[dom]
        scaled_y = (year + (month - 1) / 12.0) * 20
        for idx, node in enumerate(members):
            attrs = G.nodes[node]
            # spread the members sideways a little so they do not overlap
            attrs["x"] = base_x + (idx - len(members) / 2) * 0.3
            attrs["y"] = scaled_y
            # use the degree of the node to set the size
            attrs["size"] = G.degree(node) * 10

# Cluster the papers into 15 keyword domains; this sets "domain" and "color" on every node of G
G = assign_domains_by_cluster(G, n_domains=15)
# add a "time" attribute to the nodes
def assign_time_to_nodes(G):
    """Store a "YYYY-MM" string (or "Unknown" for undated ids) as the "time" attribute of every node."""
    for node in G.nodes():
        year, month = extract_time_from_id(node)
        label = f"{year:04d}-{month:02d}" if (year and month) else "Unknown"
        G.nodes[node]["time"] = label
# Add the "time" label, then the x / y / size layout attributes, to every node of G
assign_time_to_nodes(G)
assign_positions_by_domain_time(G)

# Render the whole graph; colours, labels and sizes are read from the node attributes.
# NOTE(review): edge_color="color" is requested, but no edge of G is given a "color"
# attribute anywhere in this notebook — confirm the intended edge colouring.
sigma = Sigma(
    G,
    node_color="color",
    edge_color="color",
    node_label="title",
    node_size="size",
    default_edge_type="curve"
)
sigma





Sigma(nx.DiGraph with 27,770 nodes and 352,807 edges)

In [20]:


def get_core_id(pid):
    """Return the part of ``pid`` before its first underscore (all of it when there is none)."""
    core, _sep, _rest = pid.partition("_")
    return core

def extract_time_from_id(paper_id):
    """Return the first day of the month encoded in a paper id, as a ``datetime``.

    The core id (the text before the first underscore) must be 7 characters
    long and start with four digits ``YYMM``; years >= 90 are read as 19YY,
    all others as 20YY. Returns None for any id that does not fit, including
    an impossible month.
    """
    try:
        base = get_core_id(paper_id)
        yymm = base[:4]
        if len(base) == 7 and yymm.isdigit():
            yy, mm = int(yymm[:2]), int(yymm[2:])
            century = 1900 if yy >= 90 else 2000
            return datetime.datetime(century + yy, mm, 1)
        return None
    except Exception:
        return None

def extract_time(node):
    """Return whatever ``extract_time_from_id`` returns for this node id."""
    return extract_time_from_id(node)

def build_subtree(G, top10_ids, direction="forward", x_spacing=6000, d=250, delta=8, max_month_diff=120):
    """Build a time-layered tree of cloned neighbours around each root paper.

    For every root, a breadth-first walk follows the edges of ``G`` and copies
    each reached paper into the result as a node named "<paper>_from_<root>".
    A paper is copied at most once per root, and papers that are themselves
    roots are never copied.

    Args:
        G: Citation graph (nx.DiGraph). Nodes may carry "title" and "color".
        top10_ids: Root paper ids; root number ``i`` is drawn at x = i * x_spacing.
        direction: "forward" walks ``G.successors`` and keeps only papers dated
            strictly before the root; "backward" walks ``G.predecessors`` and
            keeps only papers dated strictly after the root.
        x_spacing: Horizontal distance between two neighbouring trees.
        d: Vertical distance that corresponds to one month.
        delta: Horizontal step between nodes that share the same month layer.
        max_month_diff: Papers further than this many months from the root are dropped.

    Returns:
        An nx.DiGraph whose nodes carry "title", "x", "y", "color" and "size"
        (cloned neighbours also carry "time" as "YYYY-MM"), and whose edges
        carry color="gray". Roots or neighbours without a parsable date, and
        neighbours without a non-blank title, are left out.
    """
    G_sub = nx.DiGraph()
    core_top10_ids = {get_core_id(tid) for tid in top10_ids}

    # Reference date: a paper dated 2003-04 ends up at y = 0 in every tree
    align_year = 2003
    align_month = 4

    for i, root_id in enumerate(top10_ids):
        root_time = extract_time(root_id)
        if root_time is None:
            continue

        # Offset that shifts the whole tree so the reference date sits at y = 0
        tree_offset_y = d * ((root_time.year - align_year) * 12 + (root_time.month - align_month))

        x_center = i * x_spacing

        root_clone = root_id
        G_sub.add_node(
            root_clone,
            # fall back to the id when the root has no title (missing or None)
            title=G.nodes[root_id].get("title") or root_id,
            x=x_center,
            y=-tree_offset_y,
            color=G.nodes[root_id].get("color", "gray"),
            size=10
        )

        layer_counts = defaultdict(int)  # nodes already placed per month layer
        visited = set()                  # papers already examined for this root
        queue = deque([(root_id, root_clone)])

        while queue:
            orig_current, current_id = queue.popleft()
            current_time = extract_time(orig_current)
            if current_time is None:
                continue

            neighbors = (
                G.successors(orig_current) if direction == "forward"
                else G.predecessors(orig_current)
            )

            for neighbor in neighbors:
                core_neighbor = get_core_id(neighbor)
                if neighbor in visited or core_neighbor in core_top10_ids:
                    continue
                visited.add(neighbor)

                # Papers without metadata have title None (or no title at all);
                # skip them instead of calling .strip() on None.
                title = G.nodes[neighbor].get('title')
                if not title or not title.strip():
                    continue
                neighbor_time = extract_time(neighbor)
                if neighbor_time is None:
                    continue

                delta_months = (neighbor_time.year - root_time.year) * 12 + (neighbor_time.month - root_time.month)

                if direction == "forward" and delta_months >= 0:
                    continue
                if direction == "backward" and delta_months <= 0:
                    continue
                if abs(delta_months) > max_month_diff:
                    continue

                y_base = -d * delta_months
                y_offset = 0

                # Same month as the parent: nudge the node so it does not sit on the parent's row
                if neighbor_time.year == current_time.year and neighbor_time.month == current_time.month:
                    y_offset = +d * 0.2 if direction == "forward" else -d * 0.2

                y = y_base + y_offset - tree_offset_y

                # Alternate left / right of the tree axis, moving outwards within the layer
                n_in_layer = layer_counts[delta_months]
                x = x_center + delta * n_in_layer * ((-1) ** n_in_layer)
                layer_counts[delta_months] += 1

                node_id = f"{neighbor}_from_{root_id}"

                if node_id not in G_sub:
                    G_sub.add_node(
                        node_id,
                        title=title,
                        x=x,
                        y=y,
                        color=G.nodes[neighbor].get("color", "gray"),
                        size=5,
                        time=neighbor_time.strftime("%Y-%m")
                    )

                if direction == "forward":
                    G_sub.add_edge(current_id, node_id, color="gray")
                else:
                    G_sub.add_edge(node_id, current_id, color="gray")

                queue.append((neighbor, node_id))

    return G_sub

def build_recursive_temporal_tree(G, top10_ids, **kwargs):
    """Build the forward and the backward subtree of every root and merge them into one graph.

    Extra keyword arguments are passed on to ``build_subtree`` unchanged.
    """
    halves = [
        build_subtree(G, top10_ids, direction=side, **kwargs)
        for side in ("forward", "backward")
    ]
    return nx.compose(*halves)



# Use the ten papers with the highest betweenness score as tree roots
top10_ids = [pid for pid, _ in top10]

G_structured = build_recursive_temporal_tree(G, top10_ids)

# Draw the merged trees; here the edges do carry the "color" attribute set by build_subtree
sigma = Sigma(
    G_structured,
    node_color="color",
    edge_color="color",
    node_label="title",
    node_size="size",
    default_edge_type="curve"
)
sigma








Sigma(nx.DiGraph with 146,919 nodes and 146,909 edges)