
## Read a research article that investigates the potential of using modularity to detect polarization

You will find on Moodle a research article published at the conference ICWSM https://www.icwsm.org/2025/index.html
Take the time to read it and see how researchers test ideas on real applications and propose new metrics when they consider existing tools are not sufficient. In your project you will also have to investigate your hypothesis on the data of your choice! 

11. After reading the article, implement the notion of boundary described. Apply it on one of the graphs you have seen so far in this class, or on one graph from here (choose smaller graphs for efficiency): http://snap.stanford.edu/data/index.html


In [2]:
import networkx as nx
from ipysigma import Sigma
import numpy as np

In [3]:
# Solution for exercise 11

"""
explanation of meta data:
------------------------------------------------------------------------------
\\
Paper: hep-th/9201018
From: OGURAWA@VTCC1.CC.VT.EDU
Date: Thu, 9 Jan 1992 18:18:54 -0500 (EST)   (20kb)
Date (revised): Mon, 13 Jan 1992 15:35:36 -0500 (EST)

Title: Discrete and Continuum Virasoro Constraints in Two-Cut Hermitian Matrix
  Models
Authors: Waichi Ogura
Comments: 25 pages
Journal-ref: Prog.Theor.Phys. 89 (1993) 1311-1330
\\
  Continuum Virasoro constraints in the two-cut hermitian matrix models are
derived from the discrete Ward identities by means of the mapping from the
$GL(\infty )$ Toda hierarchy to the nonlinear Schr\"odinger (NLS) hierarchy.
The invariance of the string equation under the NLS flows is worked out. Also
the quantization of the integration constant $\alpha$ reported by Hollowood et
al. is explained by the analyticity of the continuum limit.
\\
"""

"""
We will do the following steps:
    1. extract all the authors as nodes, and the collaboration as edges with the weight of the number of collaborations.
    2. extract the title as the name of the paper, the number of arxiv and attribute them to the nodes.
    3. find communities in the graph.
    5. use citation data to find the most important communities.
"""



  """


'\nWe will do the following steps:\n    1. extract all the authors as nodes, and the collaboration as edges with the weight of the number of collaborations.\n    2. extract the title as the name of the paper, the number of arxiv and attribute them to the nodes.\n    3. find communities in the graph.\n    5. use citation data to find the most important communities.\n'

In [4]:
import os
import json
import re
import csv

def extract_author(text):
    match = re.search(r'Author:(.*?)(?:\\n\\n|Report-no:|Comments:|Title:)', text, re.DOTALL)
    if match:
        authors = match.group(1).replace("\n", " ")  # 处理换行符
        authors = re.sub(r'\(.*?\)', '', authors)  # 去除括号及其内容
        authors = [a.strip() for a in re.split(r',| and ', authors)]  # 拆分多个作者
        authors = [re.sub(r'[^a-zA-Z .]', '', a) for a in authors]  # 只保留拉丁字母、空格和点
        authors = [re.sub(r'\s+', ' ', a) for a in authors]  # 合并多个空格
        #将所有词的首字母大写, 其他字母小写
        authors = [a.title() for a in authors]
        authors = [a.replace("Nieegawa", "Niegawa") for a in authors]  # 统一名字格式
        authors = merge_name_variants(authors)  # 统一名字变体
        authors = resolve_abbreviations(authors)  # 处理缩写和全称重复
        authors = list(filter(None, authors))  # 去除空项
        return sorted(authors)
    return []

def merge_name_variants(authors):
    """合并同一作者的不同写法，如 Juan Maldacena 和 Juan M. Maldacena"""
    name_variants = {}
    
    for author in authors:
        base_name = re.sub(r'\b[A-Z]\.', '', author).strip()  # 移除首字母缩写
        if base_name in name_variants:
            name_variants[base_name].add(author)
        else:
            name_variants[base_name] = {author}
    
    return [max(variants, key=len) for variants in name_variants.values()]

def resolve_abbreviations(authors):
    """检查是否有缩写和全称重复的情况，若有，则保留全称"""
    full_names = set()
    abbreviations = set()
    
    for author in authors:
        if '.' in author:
            abbreviations.add(author)
        else:
            full_names.add(author)
    
    # 只保留非缩写形式的作者名
    final_authors = full_names.union(a for a in abbreviations if a not in full_names)
    return list(final_authors)

def extract_arxiv_info(abs_file):
    with open(abs_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    paper_id_match = re.search(r'Paper:\s*hep-th/(\d+)', content)
    title_match = re.search(r'Title:\s*(.+)', content)
    authors = extract_author(content)
    
    if paper_id_match and title_match and authors:
        return paper_id_match.group(1), {
            "title": title_match.group(1),
            "authors": authors
        }
    return None

def process_abstracts(directory):
    arxiv_data = {}
    authors_set = set()
    
    for year in os.listdir(directory):
        year_path = os.path.join(directory, year)
        if not os.path.isdir(year_path):
            continue
        
        for file in os.listdir(year_path):
            if file.endswith(".abs"):
                abs_path = os.path.join(year_path, file)
                result = extract_arxiv_info(abs_path)
                if result:
                    arxiv_id, data = result
                    arxiv_data[arxiv_id] = data
                    authors_set.update(data["authors"])
    
    with open("arxiv_data.json", "w", encoding="utf-8") as json_file:
        json.dump(arxiv_data, json_file, indent=4, ensure_ascii=False)
    
    sorted_authors = sorted(authors_set)
    with open("authors_list.csv", "w", encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Author"])
        for author in sorted_authors:
            writer.writerow([author])
    
    print("Extraction completed. Data saved to arxiv_data.json and authors_list.csv")

# 示例调用
process_abstracts("./cit-HepTh-abstracts")


Extraction completed. Data saved to arxiv_data.json and authors_list.csv


In [5]:

""" 
example of Cit-HepTh.txt:
# Directed graph (each unordered pair of nodes is saved once): Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
1001	9304045
1001	9308122
1001	9309097
1001	9311042
1001	9401139

for each paper, we will add a new attribute of list that shows all the papers that this paper cite, with its arxiv number.
We will add it iff both papers of a citation are in the paper. And save it to arxiv_papers.json.


"""
#load data from arxiv_data.json
with open("arxiv_data.json", "r", encoding="utf-8") as f:
    papers = json.load(f)


# Load the citation data
citations = {}
with open("Cit-HepTh.txt", "r") as f:
    # Skip the lines with comments
    while True:
        line = f.readline()
        if not line.startswith("#"):
            break
    for line in f:
        source, target = map(int, line.strip().split())
        source = str(source)
        target = str(target)
        if source in papers and target in papers:
            if source not in citations:
                citations[source] = []
            citations[source].append(target)

# Add the citation of the papers and the paper cited by the papers
for paper_id, data in papers.items():
    if paper_id in citations:
        data["citations"] = citations[paper_id]
    else:
        data["citations"] = []


# Save the updated data
with open("arxiv_data.json", "w", encoding="utf-8") as f:
    json.dump(papers, f, ensure_ascii=False, indent=4)



In [6]:
# create a graph with authors as nodes and collaborations as edges
import networkx as nx
Ghep = nx.Graph()
for paper in papers.values():
    for author in paper["authors"]:
        if author not in Ghep:
            Ghep.add_node(author, papers=[])
            # Add the paper with its arxiv numbres to the author's list 
        Ghep.nodes[author]["papers"].append(paper["title"])

    for author1 in paper["authors"]:
        for author2 in paper["authors"]:
            if author1 != author2:
                if not Ghep.has_edge(author1, author2):
                    Ghep.add_edge(author1, author2, weight=0)
                Ghep[author1][author2]["weight"] += 1

print(f"Number of nodes: {Ghep.number_of_nodes()}")
print(f"Number of edges: {Ghep.number_of_edges()}")

#print all the authors with the number of collaborations in authors.csv in alphabetical order
with open("authors.csv", "w", encoding="utf-8") as f:
    f.write("Author,Number of collaborations\n")
    for author in sorted(Ghep.nodes):
        f.write(f"{author}\n")

Number of nodes: 2620
Number of edges: 1331


In [7]:
# find communities in the graph
from networkx.algorithms.community import louvain_communities

# Louvain community detection
lc_Ghep = louvain_communities(Ghep, seed=123)

#print the community number
print(len(lc_Ghep))
# Clique percolation method
from networkx.algorithms.community import k_clique_communities
cp_Ghep= k_clique_communities(Ghep, 3)
#print the community number
print(len(list(cp_Ghep)))



# label propagation algorithm
from networkx.algorithms.community.label_propagation import label_propagation_communities
lp_Ghep = list(label_propagation_communities(Ghep))
#print the community number
print(len(lp_Ghep))


1706
201
1740


In [8]:
#calculate the net citations numbers between communities directly from the arxiv_papers.json

"""
First few lines of the arxiv_papers.json:
{
    "9203077": {
        "title": "Finite W-algebras",
        "authors": [
            "T.Tjin"
        ],
        "citations": 13,
        "cited_by": []
    },
    "9203063": {
        "title": "The Spectrum of Sl(2, R)/U(1) Black Hole Conformal Field Theory",
        "authors": [
            "Dileep P. Jatkar"
        ],
        "citations": 0,
        "cited_by": []
    },
    "9212146": {
"""
communities = list(lc_Ghep)  # Use the Louvain communities
# Load the papers data
with open("arxiv_data.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

# Create a dictionary of authors to communities
author_community = {}
for i, community in enumerate(communities):
    for author in community:
        author_community[author] = i

# Calculate the net citations between communities
net_citations = np.zeros((len(communities), len(communities)))
for paper in papers.values():
    if "citations" not in paper:
        continue
    source_community = author_community.get(paper["authors"][0], -1)
    for target in paper["citations"]:
        target_community = author_community.get(papers[target]["authors"][0], -1)
        if source_community != -1 and target_community != -1:
            net_citations[source_community, target_community] += 1


# Find the most important communities by net citations/
community_citations = net_citations.sum(axis=1)
most_important_communities = np.argsort(community_citations)[::-1]  

print("Most important communities:")
for i in range(10):
    community = communities[most_important_communities[i]]
    print(f"Community {i + 1}: {len(community)} authors, {community_citations[most_important_communities[i]]} citations")
    for author in community:
        print(f"  {author}")




Most important communities:
Community 1: 2 authors, 367.0 citations
  Jaydeep Majumder
  Ashoke Sen
Community 2: 9 authors, 256.0 citations
  Donam Youm
  Ingo Gaida
  Mirjam Cvetic
  Kwanleung Chan
  Vijay Balasubramanian
  Lorenzo Cornalba
  Dieter Luest
  Frank Wilczek
  Finn Larsen
Community 3: 11 authors, 251.0 citations
  Camillo Imbimbo
  Steven S. Gubser
  N. Itzhaki
  Sunil Mukhi
  Carlo M. Becchi
  Edward Witten
  Debashis Ghoshal
  Igor R. Klebanov
  Keshav Dasgupta
  Akikazu Hashimoto
  L. Susskind
Community 4: 3 authors, 208.0 citations
  Peter Langfelder
  Zurab Kakushadze
  Alberto Iglesias
Community 5: 1 authors, 182.0 citations
  John H. Schwarz
Community 6: 6 authors, 182.0 citations
  A.M. Polyakov
  M. Cvetic
  C.G. Callan
  S.S. Gubser
  A.A. Tseytlin
  I.R. Klebanov
Community 7: 22 authors, 149.0 citations
  C. Sonnenschein
  D. Freed
  O. Aharony
  E. Verlinde
  I.Antoniadis
  S. Theisen
  G. Moore
  S. Ferrara
  A. Strominger
  K.S. Narain
  A. Klemm
  S. Yankie

In [9]:
#create a graph of communities as summation of the information of the authors named after the most important author in the community.
# we will create a graph of communities as summation of the information of the authors named after the most important author in the community. 
# The number of citations as the weight of the edges.
G_communities = nx.Graph()
for i, community in enumerate(communities):
    most_important_author = max(community, key=lambda x: len(papers[x]["papers"]) if x in papers else 0)
    G_communities.add_node(i, name=most_important_author, size=len(community))
    #add the number of members in the community as the size of the node
    for author in community:
        if author in papers:
            G_communities.nodes[i][author] = papers[author]["papers"]
    #add the total citations in the community
    G_communities.nodes[i]["total_citations"] = community_citations[i]

# Create the edges between communities based on the net citations
for i in range(len(communities)):
    for j in range(i + 1, len(communities)):
        # Add the number of citations as the weight of the edge with weight of the number of citations
        if net_citations[i][j] > 0:
            G_communities.add_edge(i, j, weight=net_citations[i][j])

print(f"Number of nodes: {G_communities.number_of_nodes()}")
print(f"Number of edges: {G_communities.number_of_edges()}")


Number of nodes: 1706
Number of edges: 1246


In [10]:
# visualize the graph of G_communities with ipysigma

# Assign colors to each communities based on the numbers of members of communities from red to blue

for node in G_communities.nodes:
    r = max(0, min(255, 255 - 20 * len(communities[node])))
    g = max(0, min(255, 20 * len(communities[node])))
    G_communities.nodes[node]["colors"] = f"rgb({r}, O,{g})"

# assgin the color of the edges based on the citations of the community
for i, j in G_communities.edges:
    G_communities.edges[i, j]["color"] = f"rgb({255 - 20 * net_citations[i, j]}, {20 * net_citations[i, j]}, 0)"
    
# Assign the size of the nodes based on citations
for node in G_communities.nodes:
    G_communities.nodes[node]["size"] = G_communities.nodes[node]["total_citations"] / 100

# Visualize with ipysigma
sigma = Sigma(
    G_communities,
    node_color="colors",
    edge_color="color",
    node_label="name",
    node_size="size",
    label_font="cursive",
    default_edge_type="curve"
)

sigma

Sigma(nx.Graph with 1,706 nodes and 1,246 edges)