In [1]:
import json
import networkx as nx
import matplotlib.pyplot as plt

### Graph of communications modelling

In [16]:
# Read the citation_relations.json file and generate a graph
with open('./data/citation_relations.json', 'r') as f:
    cite_data = json.load(f)

In [17]:
# Check if there are duplicate entries in the graph
node_set = set()
for paper_id in cite_data.keys():
    node_set.add(paper_id)
print(f"Duplicates count = {len(cite_data.keys()) - len(node_set)}")

Duplicates count = 0


In [18]:
# Create a list of nodes in the format expected by networkX
node_list = []
index_dict = {} # Keep a dictionary to map paper_code to index
for index, paper_code in enumerate(cite_data.keys()):
    node_list.append({ 'id': index })
    index_dict[paper_code] = index

In [19]:
# Create a list of links in the format expected by networkX
link_list = []
for paper_id, paper_data in cite_data.items():
    for ref_paper_id in paper_data['references']:
        link_list.append({ 'source': index_dict[paper_id], 'target': index_dict[ref_paper_id] })

In [20]:
# Create an undirected graph from the node and link lists
graph_data = {
    'directed': False,
    'multigraph': False,
    'graph': {},
    'nodes': node_list,
    'links': link_list
}
G_comm = nx.node_link_graph(graph_data)

In [21]:
print(f"Number of nodes = {G_comm.number_of_nodes()}, number of edges = {G_comm.number_of_edges()}")

Number of nodes = 140799, number of edges = 658226


In [22]:
# Sample the graph to reduce its size 
from littleballoffur import RandomNodeSampler

total_nodes = G_comm.number_of_nodes()
sample_size = total_nodes // 10
sampler = RandomNodeSampler(number_of_nodes=sample_size, seed=42)
sampled_graph = sampler.sample(G_comm)

print(f"Sampled graph is connected = {nx.is_connected(sampled_graph)}")
print(f"Sampled graph is directed = {nx.is_directed(sampled_graph)}")
print(f"Number of nodes = {sampled_graph.number_of_nodes()}, number of edges = {sampled_graph.number_of_edges()}")

Sampled graph is connected = False
Sampled graph is directed = False
Number of nodes = 14079, number of edges = 6267


In [23]:
# Create a list of sampled nodes in the format expected by networkX
sampled_nodes = list(sampled_graph.nodes)
sampled_node_list = []
for node in sampled_nodes:
    sampled_node_list.append({ 'id': node, 'name': str(node) })

In [24]:
# Create a list of sampled links in the format expected by networkX
sampled_link_list = []
for paper_id, paper_data in cite_data.items():
    if index_dict[paper_id] not in sampled_graph.nodes:
        continue
    for ref_paper_id in paper_data['references']:
        if index_dict[ref_paper_id] in sampled_graph.nodes:
            sampled_link_list.append({ 'source': index_dict[paper_id], 'target': index_dict[ref_paper_id] })

In [25]:
# Create a graph from the sampled node and link lists
sampled_graph_data = {
    'directed': True,
    'multigraph': False,
    'graph': {},
    'nodes': sampled_node_list,
    'links': sampled_link_list
}
G_comm = nx.node_link_graph(sampled_graph_data)


### Create the network of communications

In [13]:
# Load the papers data
with open('./data/papers.SSN.jsonl', 'r') as f:
    papers = f.readlines()

In [27]:
papers[0]

'{"paper_id": "4650265", "title": "XGBoost: A Scalable Tree Boosting System", "abstract": ["tree boosting is a highly effective and widely used machine learning method .", "in this paper , we describe a scalable end - to - end tree boosting system called xgboost , which is used widely by data scientists to achieve state - of - the - art results on many machine learning challenges .", "we propose a novel sparsity - aware algorithm for sparse data and weighted quantile sketch for approximate tree learning .", "more importantly , we provide insights on cache access patterns , data compression and sharding to build a scalable tree boosting system .", "by combining these insights , xgboost scales beyond billions of examples using far fewer resources than existing systems ."], "section_names": ["introduction", "tree boosting in a nutshell", "regularized learning objective", "gradient tree boosting", "shrinkage and column subsampling", "basic exact greedy algorithm", "approximate algorithm", 

In [28]:
# Keep the abstracts of the papers that are in the sampled graph
abstracts = []
for paper in papers:
    paper_data = json.loads(paper)
    if index_dict[paper_data['paper_id']] not in sampled_nodes:
        continue
    abstracts.append(" ".join(paper_data['abstract']))

In [29]:
print(f"Number of abstracts = {len(abstracts)}")

Number of abstracts = 14079


In [30]:
# Calculate the similarity between the abstracts using n-grams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform(abstracts)
similarity_matrix = cosine_similarity(X, X)

In [31]:
# Create a graph from the similarity matrix
# Draw an edge between papers with similarity in the fourth quartile

# Calculate the 75th percentile of the similarity matrix
import numpy as np
percentile_75 = np.percentile(similarity_matrix, 75)
print(f"75th percentile = {percentile_75}")

G_sim = nx.Graph()
for i in range(similarity_matrix.shape[0]):
    for j in range(i+1, similarity_matrix.shape[1]):
        if similarity_matrix[i, j] > percentile_75:
            G_sim.add_edge(i, j)

75th percentile = 0.020171856661191995


In [32]:
# Save the graph to a CSV file
nx.write_edgelist(G_sim, './data/similarity_graph.csv', delimiter=',')

### Calculate the intersection between netowrks of commucication and networks of simialrity of communication

In [51]:
# Iterate over all nodes and find the neighbors of each node in G_comm and G_sim
intersection_dict = {}
for node in node_list:
    print(f"Processing node {node}")
    # Check if node is in both graphs
    # TODO: Figure out why some nodes are not in both graphs
    if node not in G_comm.nodes or node not in G_sim.nodes:
        print(f"Node {node} is not in both graphs")
        continue
    else:
        print(f"Node {node} is in both graphs")
    network_comm_neighbors = set(G_comm.neighbors(node))
    network_sim_neighbors = set(G_sim.neighbors(node))
    intersection = network_comm_neighbors.intersection(network_sim_neighbors)
    union = network_comm_neighbors.union(network_sim_neighbors)
    jaccard_similarity = len(intersection) / len(union)
    intersection_dict[node] = {
        'intersection': intersection,
        'union': union,
        'jaccard_similarity': jaccard_similarity
    }

# # Save the intersection dictionary to a JSON file
# with open('./data/intersection_dict.json', 'w') as f:
#     json.dump(intersection_dict, f, indent=4)

Processing node 65536
Node 65536 is not in both graphs
Processing node 131072
Node 131072 is not in both graphs
Processing node 2
Node 2 is in both graphs
Processing node 98308
Node 98308 is not in both graphs
Processing node 65541
Node 65541 is not in both graphs
Processing node 65542
Node 65542 is not in both graphs
Processing node 98311
Node 98311 is not in both graphs
Processing node 32774
Node 32774 is not in both graphs
Processing node 9
Node 9 is in both graphs
Processing node 32778
Node 32778 is not in both graphs
Processing node 65547
Node 65547 is not in both graphs
Processing node 131084
Node 131084 is not in both graphs
Processing node 131086
Node 131086 is not in both graphs
Processing node 98320
Node 98320 is not in both graphs
Processing node 131092
Node 131092 is not in both graphs
Processing node 65556
Node 65556 is not in both graphs
Processing node 65557
Node 65557 is not in both graphs
Processing node 32791
Node 32791 is not in both graphs
Processing node 24
Node 24

In [53]:
print(len(intersection_dict))

1343
