In [11]:
import csv
import json
from itertools import combinations

import networkx as nx
import numpy as np
from ipysigma import Sigma


In [12]:


"""
explanation of meta data:
------------------------------------------------------------------------------
\\
Paper: hep-th/9201018
From: OGURAWA@VTCC1.CC.VT.EDU
Date: Thu, 9 Jan 1992 18:18:54 -0500 (EST)   (20kb)
Date (revised): Mon, 13 Jan 1992 15:35:36 -0500 (EST)

Title: Discrete and Continuum Virasoro Constraints in Two-Cut Hermitian Matrix
  Models
Authors: Waichi Ogura
Comments: 25 pages
Journal-ref: Prog.Theor.Phys. 89 (1993) 1311-1330
\\
  Continuum Virasoro constraints in the two-cut hermitian matrix models are
derived from the discrete Ward identities by means of the mapping from the
$GL(\infty )$ Toda hierarchy to the nonlinear Schr\"odinger (NLS) hierarchy.
The invariance of the string equation under the NLS flows is worked out. Also
the quantization of the integration constant $\alpha$ reported by Hollowood et
al. is explained by the analyticity of the continuum limit.
\\
"""

"""
We will do the following steps:
    1. extract all the authors as nodes, and the collaborations as edges weighted by the number of collaborations.
    2. extract the title of each paper and its arXiv number, and attach them to the author nodes.
    3. find communities in the graph.
    4. use citation data to find the most important communities.
"""



  """


'\nWe will do the following steps:\n    1. extract all the authors as nodes, and the collaboration as edges with the weight of the number of collaborations.\n    2. extract the title as the name of the paper, the number of arxiv and attribute them to the nodes.\n    3. find communities in the graph.\n    5. use citation data to find the most important communities.\n'

In [13]:

""" 
example of assets/Cit-HepTh.txt:
# Directed graph (each unordered pair of nodes is saved once): assets/Cit-HepTh.txt 
# Paper citation network of Arxiv High Energy Physics Theory category
# Nodes: 27770 Edges: 352807
# FromNodeId	ToNodeId
0001001	9304045
0001001	9308122
0001001	9309097
0001001	9311042
0001001	9401139

For each paper, we add a new list attribute, "citations", holding the identifiers of all the papers that it cites.
A citation is recorded only if both the citing and the cited paper are present in the paper data. The result is saved to temp/arxiv_data.json.

example of temp/papers_standardized.json

  {
    "paper_id": "hep-th/9211063",
    "from": "Malcolm Perry <M.J.Perry@damtp.cambridge.ac.uk>",
    "submitted": "Sat, 14 Nov 92 17:58:35 GMT (27kb)",
    "title": "Topological Conformal Gravity in Four Dimensions",
    "authors": [
      "Malcolm J. Perry",
      "Edward Teo"
    ],
    "comments": "35 pages, harvmac, DAMTP R92/42",
    "report_no": null,
    "journal_ref": "Nucl.Phys. B401 (1993) 206-238",
    "subject_class": null,
    "proxy": null,
    "abstract": "In this paper, we present a new formulation of topological conformal gravity in four dimensions. Such a theory was first considered by Witten as a possible gravitational counterpart of topological Yang-Mills theory, but several problems left it incomplete. The key in our approach is to realise a theory which describes deformations of conformally self-dual gravitational instantons. We first identify the appropriate elliptic complex which does precisely this. By applying the Atiyah-Singer index theorem, we calculate the number of independent deformations of a given gravitational instanton which preserve its self-duality. We then quantise topological conformal gravity by BRST gauge-fixing, and discover how the quantum theory is naturally described by the above complex. Indeed, it is a process which closely parallels that of the Yang-Mills theory, and we show how the partition function generates an uncanny gravitational analogue of the first Donaldson invariant.",
    "year": 1992
  },


"""
#load data from temp/papers_standardized.json
with open("temp/papers_standardized.json", "r", encoding="utf-8") as f:
    papers = {paper["paper_id"]: paper for paper in json.load(f)}


# Load the citation data
citations = {}
with open("assets/Cit-HepTh.txt", "r") as f:
    # Skip the lines with comments
    while True:
        line = f.readline()
        if not line.startswith("#"):
            break
    for line in f:
        source, target = map(int, line.strip().split())
        source = str(source)
        target = str(target)
        if source in papers and target in papers:
            if source not in citations:
                citations[source] = []
            citations[source].append(target)

# Add the citation of the papers and the paper cited by the papers
for paper_id, data in papers.items():
    if paper_id in citations:
        data["citations"] = citations[paper_id]
    else:
        data["citations"] = []


# Save the updated data
with open("temp/arxiv_data.json", "w", encoding="utf-8") as f:
    json.dump(papers, f, ensure_ascii=False, indent=4)



In [14]:
# create a graph with authors as nodes and collaborations as edges

Ghep = nx.Graph()
for paper in papers.values():
    for author in paper["authors"]:
        if author not in Ghep:
            Ghep.add_node(author, papers=[])
            # Add the paper with its arxiv numbres to the author's list 
        Ghep.nodes[author]["papers"].append(paper["title"])

    for author1 in paper["authors"]:
        for author2 in paper["authors"]:
            if author1 != author2:
                if not Ghep.has_edge(author1, author2):
                    Ghep.add_edge(author1, author2, weight=0)
                Ghep[author1][author2]["weight"] += 1

print(f"Number of nodes: {Ghep.number_of_nodes()}")
print(f"Number of edges: {Ghep.number_of_edges()}")

#print all the authors with the number of collaborations in temp/authors.csv in alphabetical order
with open("temp/authors_list.csv", "w", encoding="utf-8") as f:
    f.write("Author,Number of collaborations\n")
    for author in sorted(Ghep.nodes):
        f.write(f"{author},{Ghep.degree(author)}\n")

Number of nodes: 8187
Number of edges: 19205


In [15]:
# find communities in the graph
from networkx.algorithms.community import louvain_communities

# Louvain community detection
lc_Ghep = louvain_communities(Ghep,resolution=40.0, seed=123)

#print the community number
print(len(lc_Ghep))
# Clique percolation method
from networkx.algorithms.community import k_clique_communities
cp_Ghep= k_clique_communities(Ghep,3)
#print the community number
print(len(list(cp_Ghep)))



# label propagation algorithm
from networkx.algorithms.community.label_propagation import label_propagation_communities
lp_Ghep = list(label_propagation_communities(Ghep))
#print the community number
print(len(lp_Ghep))


1542
1161
1994


In [16]:
#calculate the net citations numbers between communities directly from the temp/arxiv_papers.json

"""
First few lines of the temp/arxiv_papers.json:
{
    "9203077": {
        "title": "Finite W-algebras",
        "authors": [
            "T.Tjin"
        ],
        "citations": 13,
        "cited_by": []
    },
    "9203063": {
        "title": "The Spectrum of Sl(2, R)/U(1) Black Hole Conformal Field Theory",
        "authors": [
            "Dileep P. Jatkar"
        ],
        "citations": 0,
        "cited_by": []
    },
    "9212146": {
"""
communities = list(lc_Ghep)  # Use the Louvain communities
# Load the papers data
with open("temp/arxiv_data.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

# Create a dictionary of authors to communities
author_community = {}
for i, community in enumerate(communities):
    for author in community:
        author_community[author] = i

# Calculate the net citations between communities
net_citations = np.zeros((len(communities), len(communities)))
for paper in papers.values():
    if "citations" not in paper:
        continue
    # Ensure the source author exists in the author_community dictionary
    if paper["authors"]:
        source_community = author_community.get(paper["authors"][0], -1)
        if source_community != -1:
            for target in paper["citations"]:
                # Ensure the target paper exists and has authors
                if target in papers and papers[target]["authors"]:
                    target_community = author_community.get(papers[target]["authors"][0], -1)
                    if target_community != -1:
                        net_citations[source_community, target_community] += 1


# Find the most important communities by net citations/
community_citations = net_citations.sum(axis=1)
most_important_communities = np.argsort(community_citations)[::-1]  

print("Most important communities:")
for i in range(10):
    community = communities[most_important_communities[i]]
    print(f"Community {i + 1}: {len(community)} authors, {community_citations[most_important_communities[i]]} citations")
    for author in community:
        print(f"  {author}")




Most important communities:
Community 1: 23 authors, 3146.0 citations
  Joanna L. Karczmarek
  N. Sochen J. Sonnenschein
  Yoav Lavi
  Shlomo S. Razamat
  A. Loewy
  Eugene A. Mirabelli
  Morten Krogh
  Nissan Itzhaki
  Ori J. Ganor
  Adi Armoni
  Aaron Bergman
  Y. Kinar
  Andreas Brandhuber
  Shimon Yankielowicz
  S. Yankielowizc
  Yitzhak Frishman
  Mordechai Spiegelglas
  Ehud Schreiber
  C. Sonnenschein
  Ofer Aharony
  Vadim S. Kaplunovsky
  Y. Artstein
  Michael E. Peskin
Community 2: 14 authors, 2179.0 citations
  David C. Lewellen
  Gerald B. Cleaver
  R. L. Davis
  Mirjam Cvetic
  Mirjam Cvetiv C
  Philip J. Rosenthal
  Harald H. Soleng
  Paul Langacker
  Jose R. Espinosa
  Donam Youm
  Kwanleung Chan
  Stephen Griffies
  Mirjam Cvetivc
  Lisa Everett
Community 3: 15 authors, 2129.0 citations
  Curtis G. Callan
  A. M. Polyakov
  Ali Yegulalp
  Arkadas Ozakin
  Akikazu Hashimoto
  Krev Simir Demeterfi
  Michael Krasnitz
  Christof Schmidhuber
  Peter Ouyang
  Shivaji L. Sondh

In [17]:
#create a graph of communities as summation of the information of the authors named after the most important author in the community.
# we will create a graph of communities as summation of the information of the authors named after the most important author in the community. 
# The number of citations as the weight of the edges.
G_communities = nx.Graph()
for i, community in enumerate(communities):
    most_important_author = max(community, key=lambda x: len(papers[x]["papers"]) if x in papers else 0)
    G_communities.add_node(i, name=most_important_author, size=len(community))
    #add the number of members in the community as the size of the node
    #add the authors as attributes to the node
    G_communities.nodes[i]["authors"] = community

    #add the total citations in the community
    G_communities.nodes[i]["total_citations"] = community_citations[i]

# Create the edges between communities based on the net citations
for i in range(len(communities)):
    for j in range(i + 1, len(communities)):
        # Add the number of citations as the weight of the edge with weight of the number of citations
        if net_citations[i][j] > 0:
            G_communities.add_edge(i, j, weight=net_citations[i][j])

print(f"Number of nodes: {G_communities.number_of_nodes()}")
print(f"Number of edges: {G_communities.number_of_edges()}")


Number of nodes: 1542
Number of edges: 13053


In [18]:
# visualize the graph of G_communities with ipysigma

# Assign colors to each communities based on the numbers of members of communities from red to blue

for node in G_communities.nodes:
    r = max(0, min(255, 255 - 20 * len(communities[node])))
    g = max(0, min(255, 20 * len(communities[node])))
    G_communities.nodes[node]["colors"] = f"rgb({r}, O,{g})"

# assgin the color of the edges based on the citations of the community
for i, j in G_communities.edges:
    G_communities.edges[i, j]["color"] = f"rgb({255 - 20 * net_citations[i, j]}, {20 * net_citations[i, j]}, 0)"
    
# Assign the size of the nodes based on citations
for node in G_communities.nodes:
    G_communities.nodes[node]["size"] = G_communities.nodes[node]["total_citations"] / 100

# Visualize with ipysigma
sigma = Sigma(
    G_communities,
    node_color="colors",
    edge_color="color",
    node_label="name",
    node_size="size",
    label_font="cursive",
    default_edge_type="curve"
)

sigma

Sigma(nx.Graph with 1,542 nodes and 13,053 edges)