In [4]:
import networkx as nx
from ipysigma import Sigma
import numpy as np

import os
import json
import re
import csv


In [88]:
# Solution for exercise 11

"""
explanation of meta data:
------------------------------------------------------------------------------
\\
Paper: hep-th/9201018
From: OGURAWA@VTCC1.CC.VT.EDU
Date: Thu, 9 Jan 1992 18:18:54 -0500 (EST)   (20kb)
Date (revised): Mon, 13 Jan 1992 15:35:36 -0500 (EST)

Title: Discrete and Continuum Virasoro Constraints in Two-Cut Hermitian Matrix
  Models
Authors: Waichi Ogura
Comments: 25 pages
Journal-ref: Prog.Theor.Phys. 89 (1993) 1311-1330
\\
  Continuum Virasoro constraints in the two-cut hermitian matrix models are
derived from the discrete Ward identities by means of the mapping from the
$GL(\infty )$ Toda hierarchy to the nonlinear Schr\"odinger (NLS) hierarchy.
The invariance of the string equation under the NLS flows is worked out. Also
the quantization of the integration constant $\alpha$ reported by Hollowood et
al. is explained by the analyticity of the continuum limit.
\\
"""

"""
We will do the following steps:
    1. extract all the authors as nodes, and the collaboration as edges with the weight of the number of collaborations.
    2. extract the title as the name of the paper, the number of arxiv and attribute them to the nodes.
    3. find communities in the graph.
    5. use citation data to find the most important communities.
"""



  """


'\nWe will do the following steps:\n    1. extract all the authors as nodes, and the collaboration as edges with the weight of the number of collaborations.\n    2. extract the title as the name of the paper, the number of arxiv and attribute them to the nodes.\n    3. find communities in the graph.\n    5. use citation data to find the most important communities.\n'

In [None]:

def extract_author(text):
    match = re.search(r'Author:(.*?)(?:\\n\\n|Report-no:|Comments:|Title:)', text, re.DOTALL)
    if match:
        authors = match.group(1).replace("\n", " ")  # 处理换行符
        authors = re.sub(r'\(.*?\)', '', authors)  # 去除括号及其内容
        authors = [a.strip() for a in re.split(r',| and ', authors)]  # 拆分多个作者
        authors = [re.sub(r'[^a-zA-Z .]', '', a) for a in authors]  # 只保留拉丁字母、空格和点
        authors = [re.sub(r'\s+', ' ', a) for a in authors]  # 合并多个空格
        #将所有词的首字母大写, 其他字母小写
        authors = [a.title() for a in authors]
        authors = [a.replace("Nieegawa", "Niegawa") for a in authors]  # 统一名字格式
        authors = merge_name_variants(authors)  # 统一名字变体
        authors = resolve_abbreviations(authors)  # 处理缩写和全称重复
        authors = list(filter(None, authors))  # 去除空项
        return sorted(authors)
    return []

def merge_name_variants(authors):
    """合并同一作者的不同写法，如 Juan Maldacena 和 Juan M. Maldacena"""
    name_variants = {}
    
    for author in authors:
        base_name = re.sub(r'\b[A-Z]\.', '', author).strip()  # 移除首字母缩写
        if base_name in name_variants:
            name_variants[base_name].add(author)
        else:
            name_variants[base_name] = {author}
    
    return [max(variants, key=len) for variants in name_variants.values()]

def resolve_abbreviations(authors):
    """检查是否有缩写和全称重复的情况，若有，则保留全称"""
    full_names = set()
    abbreviations = set()
    
    for author in authors:
        if '.' in author:
            abbreviations.add(author)
        else:
            full_names.add(author)
    
    # 只保留非缩写形式的作者名
    final_authors = full_names.union(a for a in abbreviations if a not in full_names)
    return list(final_authors)

def extract_arxiv_info(abs_file):
    with open(abs_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    paper_id_match = re.search(r'Paper:\s*hep-th/(\d+)', content)
    title_match = re.search(r'Title:\s*(.+)', content)
    authors = extract_author(content)
    
    if paper_id_match and title_match and authors:
        return paper_id_match.group(1), {
            "title": title_match.group(1),
            "authors": authors
        }
    return None

def process_abstracts(directory):
    arxiv_data = {}
    authors_set = set()
    
    for year in os.listdir(directory):
        year_path = os.path.join(directory, year)
        if not os.path.isdir(year_path):
            continue
        
        for file in os.listdir(year_path):
            if file.endswith(".abs"):
                abs_path = os.path.join(year_path, file)
                result = extract_arxiv_info(abs_path)
                if result:
                    arxiv_id, data = result
                    arxiv_data[arxiv_id] = data
                    authors_set.update(data["authors"])
    
    with open("arxiv_data.json", "w", encoding="utf-8") as json_file:
        json.dump(arxiv_data, json_file, indent=4, ensure_ascii=False)
    
    sorted_authors = sorted(authors_set)
    with open("authors_list.csv", "w", encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Author"])
        for author in sorted_authors:
            writer.writerow([author])
    
    print("Extraction completed. Data saved to arxiv_data.json and authors_list.csv")

# 示例调用
process_abstracts("./cit-HepTh-abstracts")


Extraction completed. Data saved to arxiv_data.json and authors_list.csv


In [90]:
# to clean the authors list 

import pandas as pd
from collections import defaultdict
from difflib import SequenceMatcher
import itertools

# ========== 读取数据 ==========
file_path = "/Users/yuguang/Projet/modal/Modal_INF473G/authors_list.csv"
similarity_threshold = 0.85

# ========== 读取数据 ==========
df = pd.read_csv(file_path)
authors = df['Author'].dropna().unique()
authors_clean = [a.strip() for a in authors]

# ========== 相似度函数 ==========
def similarity_ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()

# ========== 找出相似名字并保留最长 ==========
# 先做完全相同/非常相似的名字分组
groups_similar = []

used = set()
for i, name1 in enumerate(authors_clean):
    if name1 in used:
        continue
    group = [name1]
    used.add(name1)
    for name2 in authors_clean[i+1:]:
        if name2 not in used and similarity_ratio(name1, name2) >= 0.90:
            group.append(name2)
            used.add(name2)
    groups_similar.append(group)

# 每组保留最长的名字
cleaned_names = []
for group in groups_similar:
    longest = max(group, key=len)
    cleaned_names.append(longest)

# ========== 按姓氏首字母分组 ==========
groups = defaultdict(list)
for name in cleaned_names:
    last_name = name.split()[-1] if name.split() else ''
    key = last_name[0].lower() if last_name else 'other'
    groups[key].append(name)

# ========== 分组内模糊匹配 ==========
grouped_possible_duplicates = []
for key, names in groups.items():
    for a1, a2 in itertools.combinations(names, 2):
        similarity = similarity_ratio(a1, a2)
        if similarity >= similarity_threshold and a1 != a2:
            grouped_possible_duplicates.append((a1, a2, round(similarity * 100, 1)))

# ========== 结果整理 ==========
grouped_fuzzy_matches = pd.DataFrame(
    grouped_possible_duplicates,
    columns=['Name 1', 'Name 2', 'Similarity']
)

# ========== 输出 ==========
print(grouped_fuzzy_matches)

pd.DataFrame({'Cleaned Names': cleaned_names}).to_csv(file_path, index=False)


                       Name 1                    Name 2  Similarity
0               A. Strominger           Andy Strominger        85.7
1           Andrew Strominger           Andy Strominger        87.5
2                  B. Schroer             B.J. Schroers        87.0
3                H. Samtleben          J.A.H. Samtleben        85.7
4                   H. Suzuki                 S. Suzuki        88.9
5             John H. Schwarz              John Schwarz        88.9
6                Myck Schwetz           Myckola Schwetz        88.9
7              N. I. Stoilova               N. Stoilova        88.0
8               R. Schimmrigk           Rolf Schimmrigk        85.7
9              Igor A. Bandos               Igor Bandos        88.0
10             J. Bockenhauer          Jens Bockenhauer        86.7
11               Thomas Banks                 Tom Banks        85.7
12               Andrey Dubin          Andrey Yu. Dubin        85.7
13                Brian Dolan            Brian P

In [5]:

""" 
example of Cit-HepTh.txt:
# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
0001001	9304045
0001001	9308122
0001001	9309097
0001001	9311042
0001001	9401139

for each paper, we will add a new attribute of list that shows all the papers that this paper cite, with its arxiv number.
We will add it iff both papers of a citation are in the paper. And save it to arxiv_papers.json.

example of papers_standardized.json

  {
    "paper_id": "hep-th/9211063",
    "from": "Malcolm Perry <M.J.Perry@damtp.cambridge.ac.uk>",
    "submitted": "Sat, 14 Nov 92 17:58:35 GMT (27kb)",
    "title": "Topological Conformal Gravity in Four Dimensions",
    "authors": [
      "Malcolm J. Perry",
      "Edward Teo"
    ],
    "comments": "35 pages, harvmac, DAMTP R92/42",
    "report_no": null,
    "journal_ref": "Nucl.Phys. B401 (1993) 206-238",
    "subject_class": null,
    "proxy": null,
    "abstract": "In this paper, we present a new formulation of topological conformal gravity in four dimensions. Such a theory was first considered by Witten as a possible gravitational counterpart of topological Yang-Mills theory, but several problems left it incomplete. The key in our approach is to realise a theory which describes deformations of conformally self-dual gravitational instantons. We first identify the appropriate elliptic complex which does precisely this. By applying the Atiyah-Singer index theorem, we calculate the number of independent deformations of a given gravitational instanton which preserve its self-duality. We then quantise topological conformal gravity by BRST gauge-fixing, and discover how the quantum theory is naturally described by the above complex. Indeed, it is a process which closely parallels that of the Yang-Mills theory, and we show how the partition function generates an uncanny gravitational analogue of the first Donaldson invariant.",
    "year": 1992
  },


"""
#load data from papers_standardized.json
with open("papers_standardized.json", "r", encoding="utf-8") as f:
    papers = {paper["paper_id"]: paper for paper in json.load(f)}


# Load the citation data
citations = {}
with open("Cit-HepTh.txt", "r") as f:
    # Skip the lines with comments
    while True:
        line = f.readline()
        if not line.startswith("#"):
            break
    for line in f:
        source, target = map(int, line.strip().split())
        source = str(source)
        target = str(target)
        if source in papers and target in papers:
            if source not in citations:
                citations[source] = []
            citations[source].append(target)

# Add the citation of the papers and the paper cited by the papers
for paper_id, data in papers.items():
    if paper_id in citations:
        data["citations"] = citations[paper_id]
    else:
        data["citations"] = []


# Save the updated data
with open("arxiv_data.json", "w", encoding="utf-8") as f:
    json.dump(papers, f, ensure_ascii=False, indent=4)



In [6]:
# create a graph with authors as nodes and collaborations as edges
import networkx as nx
Ghep = nx.Graph()
for paper in papers.values():
    for author in paper["authors"]:
        if author not in Ghep:
            Ghep.add_node(author, papers=[])
            # Add the paper with its arxiv numbres to the author's list 
        Ghep.nodes[author]["papers"].append(paper["title"])

    for author1 in paper["authors"]:
        for author2 in paper["authors"]:
            if author1 != author2:
                if not Ghep.has_edge(author1, author2):
                    Ghep.add_edge(author1, author2, weight=0)
                Ghep[author1][author2]["weight"] += 1

print(f"Number of nodes: {Ghep.number_of_nodes()}")
print(f"Number of edges: {Ghep.number_of_edges()}")

#print all the authors with the number of collaborations in authors.csv in alphabetical order
with open("authors_list.csv", "w", encoding="utf-8") as f:
    f.write("Author,Number of collaborations\n")
    for author in sorted(Ghep.nodes):
        f.write(f"{author},{Ghep.degree(author)}\n")

Number of nodes: 8187
Number of edges: 19205


In [24]:
# find communities in the graph
from networkx.algorithms.community import louvain_communities

# Louvain community detection
lc_Ghep = louvain_communities(Ghep,resolution=40.0, seed=123)

#print the community number
print(len(lc_Ghep))
# Clique percolation method
from networkx.algorithms.community import k_clique_communities
cp_Ghep= k_clique_communities(Ghep,3)
#print the community number
print(len(list(cp_Ghep)))



# label propagation algorithm
from networkx.algorithms.community.label_propagation import label_propagation_communities
lp_Ghep = list(label_propagation_communities(Ghep))
#print the community number
print(len(lp_Ghep))


1542
1161
1994


In [25]:
#calculate the net citations numbers between communities directly from the arxiv_papers.json

"""
First few lines of the arxiv_papers.json:
{
    "9203077": {
        "title": "Finite W-algebras",
        "authors": [
            "T.Tjin"
        ],
        "citations": 13,
        "cited_by": []
    },
    "9203063": {
        "title": "The Spectrum of Sl(2, R)/U(1) Black Hole Conformal Field Theory",
        "authors": [
            "Dileep P. Jatkar"
        ],
        "citations": 0,
        "cited_by": []
    },
    "9212146": {
"""
communities = list(lc_Ghep)  # Use the Louvain communities
# Load the papers data
with open("arxiv_data.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

# Create a dictionary of authors to communities
author_community = {}
for i, community in enumerate(communities):
    for author in community:
        author_community[author] = i

# Calculate the net citations between communities
net_citations = np.zeros((len(communities), len(communities)))
for paper in papers.values():
    if "citations" not in paper:
        continue
    # Ensure the source author exists in the author_community dictionary
    if paper["authors"]:
        source_community = author_community.get(paper["authors"][0], -1)
        if source_community != -1:
            for target in paper["citations"]:
                # Ensure the target paper exists and has authors
                if target in papers and papers[target]["authors"]:
                    target_community = author_community.get(papers[target]["authors"][0], -1)
                    if target_community != -1:
                        net_citations[source_community, target_community] += 1


# Find the most important communities by net citations/
community_citations = net_citations.sum(axis=1)
most_important_communities = np.argsort(community_citations)[::-1]  

print("Most important communities:")
for i in range(10):
    community = communities[most_important_communities[i]]
    print(f"Community {i + 1}: {len(community)} authors, {community_citations[most_important_communities[i]]} citations")
    for author in community:
        print(f"  {author}")




Most important communities:
Community 1: 23 authors, 3146.0 citations
  Ofer Aharony
  Y. Kinar
  Morten Krogh
  S. Yankielowizc
  A. Loewy
  Ori J. Ganor
  Y. Artstein
  Mordechai Spiegelglas
  Joanna L. Karczmarek
  Aaron Bergman
  Andreas Brandhuber
  Adi Armoni
  C. Sonnenschein
  Yitzhak Frishman
  N. Sochen J. Sonnenschein
  Yoav Lavi
  Michael E. Peskin
  Eugene A. Mirabelli
  Shimon Yankielowicz
  Vadim S. Kaplunovsky
  Nissan Itzhaki
  Shlomo S. Razamat
  Ehud Schreiber
Community 2: 14 authors, 2179.0 citations
  Mirjam Cvetiv C
  Jose R. Espinosa
  R. L. Davis
  Mirjam Cvetic
  Harald H. Soleng
  Philip J. Rosenthal
  Stephen Griffies
  Paul Langacker
  Mirjam Cvetivc
  Gerald B. Cleaver
  David C. Lewellen
  Lisa Everett
  Donam Youm
  Kwanleung Chan
Community 3: 15 authors, 2129.0 citations
  Shivaji L. Sondhi
  Arkadas Ozakin
  Akikazu Hashimoto
  Curtis G. Callan
  Peter Ouyang
  Steven S. Gubser
  Vyacheslav S. Rychkov
  Christof Schmidhuber
  A. M. Polyakov
  Ali Yegula

In [26]:
#create a graph of communities as summation of the information of the authors named after the most important author in the community.
# we will create a graph of communities as summation of the information of the authors named after the most important author in the community. 
# The number of citations as the weight of the edges.
G_communities = nx.Graph()
for i, community in enumerate(communities):
    most_important_author = max(community, key=lambda x: len(papers[x]["papers"]) if x in papers else 0)
    G_communities.add_node(i, name=most_important_author, size=len(community))
    #add the number of members in the community as the size of the node
    for author in community:
        if author in papers:
            G_communities.nodes[i][author] = papers[author]["papers"]
    #add the total citations in the community
    G_communities.nodes[i]["total_citations"] = community_citations[i]

# Create the edges between communities based on the net citations
for i in range(len(communities)):
    for j in range(i + 1, len(communities)):
        # Add the number of citations as the weight of the edge with weight of the number of citations
        if net_citations[i][j] > 0:
            G_communities.add_edge(i, j, weight=net_citations[i][j])

print(f"Number of nodes: {G_communities.number_of_nodes()}")
print(f"Number of edges: {G_communities.number_of_edges()}")


Number of nodes: 1542
Number of edges: 13053


In [38]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import networkx as nx
import numpy as np
from ipysigma import Sigma
from collections import defaultdict

# -------- ① 社区编号：把列表 communities 转换为 {node: community_id} 字典 --------

community_dict = {}
for comm_id, comm_nodes in enumerate(communities):
    for node in comm_nodes:
        community_dict[node] = comm_id

num_comms = len(set(community_dict.values()))
colormap = cm.get_cmap('tab10', num_comms)  # 可换 'tab20'、'plasma' 等

# -------- ② 统计每个社区总引用 --------

community_citations = defaultdict(int)
for node in G_communities.nodes:
    comm_id = community_dict.get(node, -1)
    community_citations[comm_id] += G_communities.nodes[node]["total_citations"]

max_cit = max(community_citations.values())

# -------- ③ 给节点分配颜色（颜色亮度 = 社区总引用） --------

for node in G_communities.nodes:
    comm_id = community_dict.get(node, -1)

    if comm_id == -1:
        G_communities.nodes[node]["colors"] = "#cccccc"  # 未分组灰色
    else:
        base_color = colormap(comm_id % num_comms)
        brightness = community_citations[comm_id] / max_cit
        r, g, b, _ = base_color
        # 增加最低亮度，避免太暗
        r = min(1.0, r * brightness + 0.3)
        g = min(1.0, g * brightness + 0.3)
        b = min(1.0, b * brightness + 0.3)
        G_communities.nodes[node]["colors"] = mcolors.to_hex((r, g, b))

# -------- ④ 边颜色：跨社区边浅灰，同社区边深色 --------

for i, j in G_communities.edges:
    comm_i = community_dict.get(i, -1)
    comm_j = community_dict.get(j, -1)

    if comm_i != comm_j:
        G_communities.edges[i, j]["color"] = "rgba(200,200,200,0.1)"  # 淡灰
    else:
        G_communities.edges[i, j]["color"] = "rgba(50,50,50,0.6)"      # 深色

# -------- ⑤ 社区分开布局：圆周排列 --------

# 圆周上为每个社区分配中心位置
angle = np.linspace(0, 2 * np.pi, num_comms + 1)
community_positions = {}
for comm_id in range(num_comms):
    x = 10 * np.cos(angle[comm_id])
    y = 10 * np.sin(angle[comm_id])
    community_positions[comm_id] = (x, y)

# 对每个社区分别布局并移动
for comm_id in range(num_comms):
    nodes_in_comm = [n for n in G_communities.nodes if community_dict.get(n, -1) == comm_id]
    subgraph = G_communities.subgraph(nodes_in_comm)

    # 社区内部布局
    pos_sub = nx.spring_layout(subgraph, k=0.5, seed=comm_id)

    # 圆周位置
    cx, cy = community_positions[comm_id]

    for n in subgraph.nodes:
        G_communities.nodes[n]["x"] = pos_sub[n][0] + cx
        G_communities.nodes[n]["y"] = pos_sub[n][1] + cy

# -------- ⑥ 节点大小：按 total_citations 控制 --------

for node in G_communities.nodes:
    G_communities.nodes[node]["size"] = G_communities.nodes[node]["total_citations"] / 100

# -------- ⑦ 标签：只显示引用高的节点 --------

for node in G_communities.nodes:
    if G_communities.nodes[node]["total_citations"] > 500:
        G_communities.nodes[node]["label"] = G_communities.nodes[node]["name"]
    else:
        G_communities.nodes[node]["label"] = ""

# -------- ⑧ 可视化：Sigma --------

sigma = Sigma(
    G_communities,
    node_color="colors",
    edge_color="color",
    node_label="label",
    node_size="size",
    label_font="cursive",
    default_edge_type="curve"
)

sigma


  colormap = cm.get_cmap('tab10', num_comms)  # 可换 'tab20'、'plasma' 等


Sigma(nx.Graph with 1,542 nodes and 13,053 edges)