In [1]:
import json
import networkx as nx

### Modelling the graph of communications

In [2]:
# Load the citation relations that the citation graph is built from.
# The top-level JSON object is keyed by paper id; later cells iterate
# `cite_data.items()` and read each entry's 'references' list.
# The encoding is pinned to UTF-8 because `open` otherwise falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open('./data/citation_relations.json', 'r', encoding='utf-8') as f:
    cite_data = json.load(f)

In [3]:
# Check if there are duplicate paper ids in the citation file.
#
# Why the file is parsed again: `cite_data` is a dict, so its keys are unique
# by construction and json.load silently keeps only the last value of a
# repeated key. Comparing len(cite_data) with a set of its keys can therefore
# never report anything but 0. With `object_pairs_hook=list` every JSON object
# comes back as its raw list of (key, value) pairs, repeated keys included,
# so the comparison below is meaningful.
with open('./data/citation_relations.json', 'r', encoding='utf-8') as f:
    raw_pairs = json.load(f, object_pairs_hook=list)

# Top-level keys exactly as written in the file (duplicates preserved).
paper_ids = [paper_id for paper_id, _ in raw_pairs]
node_set = set(paper_ids)
print(f"Duplicates count = {len(paper_ids) - len(node_set)}")

Duplicates count = 0


In [4]:
# networkx's node-link format describes each node as a dict carrying an 'id';
# emit one such record per paper id present in the citation data.
node_list = [{'id': node_id} for node_id in cite_data]

In [5]:
# One directed link per citation: the citing paper is the 'source' and every
# id in its 'references' list becomes a 'target'.
link_list = [
    {'source': citing_id, 'target': cited_id}
    for citing_id, entry in cite_data.items()
    for cited_id in entry['references']
]

In [6]:
# Node-link description of the citation graph (directed, no parallel edges).
# Kept as a plain dict so it can still be inspected or serialised.
graph_data = {
    'directed': True,
    'multigraph': False,
    'graph': {},
    'nodes': node_list,
    'links': link_list
}

# Build the directed graph explicitly instead of calling
# nx.node_link_graph(graph_data): that helper's default edge key is being
# switched from 'links' to 'edges' (NetworkX 3.4 emits a FutureWarning about
# it), and the keyword needed to pin it differs between releases. Adding the
# nodes and edges directly gives the same DiGraph on any NetworkX version.
cite_graph = nx.DiGraph()
cite_graph.add_nodes_from(node['id'] for node in node_list)
cite_graph.add_edges_from(
    (link['source'], link['target']) for link in link_list
)

In [7]:
# Spot-check a single paper to see what one citation record looks like.
sample_paper_id = "102498304"
cite_data[sample_paper_id]

{'references': ['2362538', '119576823', '119330938', '29009489'],
 'cited_by': []}

### Modelling the similarity graph of communications

In [3]:
# Load the papers file: JSON Lines, one JSON document per line (each line is
# decoded individually with json.loads in the cells below).
# - UTF-8 is requested explicitly; `open` otherwise uses the platform's
#   locale encoding, which is not UTF-8 on every system.
# - Whitespace-only lines are skipped so that a stray blank line (e.g. at the
#   end of the file) cannot make json.loads fail later on.
with open('./data/papers.SSN.jsonl', 'r', encoding='utf-8') as f:
    papers = [line for line in f if line.strip()]

In [9]:
# Preview the first paper: decode its JSON line and print the abstract.
# The 'abstract' field is joined with spaces, i.e. it holds a sequence of
# strings rather than one block of text.
first_record = papers[0]
paper_data = json.loads(first_record)
abstract_tokens = paper_data['abstract']
abstract_str = " ".join(abstract_tokens)
print(abstract_str)

tree boosting is a highly effective and widely used machine learning method . in this paper , we describe a scalable end - to - end tree boosting system called xgboost , which is used widely by data scientists to achieve state - of - the - art results on many machine learning challenges . we propose a novel sparsity - aware algorithm for sparse data and weighted quantile sketch for approximate tree learning . more importantly , we provide insights on cache access patterns , data compression and sharding to build a scalable tree boosting system . by combining these insights , xgboost scales beyond billions of examples using far fewer resources than existing systems .


In [10]:
# Turn every paper abstract into a TF-IDF vector.
# NOTE: only the 'abstract' field is vectorised in this cell; titles are not
# part of the documents.
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per paper: decode the JSON line and glue the pieces of its
# abstract back together with single spaces.
abstracts = [
    " ".join(json.loads(raw_line)['abstract'])
    for raw_line in papers
]

# Learn the vocabulary and IDF weights from all abstracts (English stop words
# are dropped) and produce the document-term matrix in one pass.
vectorizer = TfidfVectorizer(stop_words='english')
abstract_matrix = vectorizer.fit_transform(abstracts)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every abstract's TF-IDF vector against every other.
# Only X is supplied, so the matrix is compared with itself.
# NOTE(review): the result is presumably a dense papers-by-papers array, so
# memory grows quadratically with the corpus size - confirm it fits.
similarities = cosine_similarity(X=abstract_matrix)

In [None]:
# Sanity check on the dimensions of the similarity matrix computed above
# (expected to be square, one row and one column per abstract - TODO confirm).
similarities.shape