In [26]:
import os, sys
# Put the parent of the current working directory on the import path
# (presumably so project-local modules next to the notebook folder can be imported).
# Guarded so that re-running this cell does not append the same entry again.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [27]:
import json
import time
from itertools import count

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
from tqdm import tqdm

---

In [28]:
# NOTE(security): an API key should not be committed inside a notebook. The
# SEMANTIC_SCHOLAR_API_KEY environment variable takes precedence; the literal is
# kept only as a fallback so existing runs keep working and should be rotated
# and removed.
API_KEY = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "09ybTvNroM4eXDl0QwNy71O6ncKaNHSc4HIijpfa")


def get_all_citations(
    paper: dict,
    max_retries: int = 10,
    retry_delay: float = 1.0,
    timeout: float = 60.0,
) -> list:
    """Collect all citation records of one paper from the Semantic Scholar Graph API.

    The ``/citations`` endpoint is queried 1000 records at a time; as long as the
    response contains a ``"next"`` offset, another page is requested.

    Args:
        paper: Mapping describing the cited paper. Its ``"paperId"`` entry is used
            as identifier; ``"doi"`` is used only when no ``"paperId"`` key exists.
        max_retries: Number of additional attempts per page after the first
            attempt failed.
        retry_delay: Seconds to wait before the first retry. The wait doubles with
            every further retry and is capped at 30 seconds.
        timeout: Timeout in seconds handed to ``requests.get`` for each request.

    Returns:
        A list with one dict per citation record that carries an ``"intents"``
        entry. In each dict ``"intents"`` is replaced by the three boolean keys
        ``"background"``, ``"method"`` and ``"results"``; all other keys of the
        record are kept. Records without ``"intents"`` are skipped.

    Raises:
        RuntimeError: If a page could not be fetched with status 200 within the
            allowed number of attempts.
    """
    # NOTE(review): a DOI is interpolated as-is; confirm the API accepts it
    # without a "DOI:" prefix.
    paper_id = paper["paperId"] if "paperId" in paper else paper["doi"]
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations"
    fields = "title,isInfluential,intents,year,venue,citationCount,influentialCitationCount,url"
    # TODO(mm): adding authors will make issues when saving to .gexf as it doesn't like lists as attributes
    # fields = "title,isInfluential,intents,year,venue,citationCount,influentialCitationCount,url,authors"
    headers = {"x-api-key": API_KEY}
    intent_names = ("background", "method", "results")

    all_citing_papers = []
    offset = 0
    while True:
        params = {"fields": fields, "offset": offset, "limit": 1000}  # 1000 is the limit of the API

        # one initial attempt plus up to max_retries retries
        response = None
        last_problem = "no request was made"
        for attempt in range(max(max_retries, 0) + 1):
            if attempt:
                print(f"{last_problem}. Trying again ...")
                # back off so a rate-limited or struggling API is not hammered
                time.sleep(min(retry_delay * 2 ** (attempt - 1), 30.0))
            try:
                response = requests.get(url, headers=headers, params=params, timeout=timeout)
            except requests.RequestException as error:
                response = None
                last_problem = f"Request failed ({error})"
                continue
            if response.status_code == 200:  # 200 means success
                break
            last_problem = f"Status code={response.status_code}"
        else:
            # never parse a failed response: fail with a message that names the cause
            raise RuntimeError(
                f"Could not retrieve citations of {paper_id} at offset={offset} "
                f"after {max_retries} retries: {last_problem}"
            )

        payload = response.json()
        for entry in payload.get("data") or []:
            if "intents" not in entry:
                continue
            cited_for = entry["intents"] or []  # tolerate an explicit null
            flags = {name: name in cited_for for name in intent_names}
            flattened = {**flags, **entry}  # keys of the record win on a name clash
            del flattened["intents"]
            all_citing_papers.append(flattened)

        # decide whether to continue
        if "next" not in payload:
            return all_citing_papers
        offset = payload["next"]
        print(f"Continuing with offset={offset}")

---

In [29]:
# read the clean data frame
df = pd.read_csv("../data/clean_data_semantic_scholar.csv", sep="\t")
df = df.drop('Unnamed: 0', axis=1)
print(len(df))

9021


In [30]:
df.head(n=5)  # preview the first five papers

Unnamed: 0,id,title,area,interpretability,doi,source,working_doi,classifier_interpretability_prediction,paperId,url,venue,year,citationCount,influentialCitationCount
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2018,True,False,812580f6800bf0db4df0f48af6893b5ff1970fb3,https://www.semanticscholar.org/paper/812580f6...,Annual Meeting of the Association for Computat...,2018.0,112.0,13.0
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2018,True,False,542b9b17bb6273ee68281b7f29994b2b29d038b8,https://www.semanticscholar.org/paper/542b9b17...,Annual Meeting of the Association for Computat...,2018.0,79.0,17.0
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2018,True,False,5fc80b69bfd3fe63ae2c55f4b7d8f961ceac48e1,https://www.semanticscholar.org/paper/5fc80b69...,Annual Meeting of the Association for Computat...,2018.0,25.0,3.0
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2018,True,False,165a74c246d7942efd1829b4316aa5120d1ebf12,https://www.semanticscholar.org/paper/165a74c2...,Annual Meeting of the Association for Computat...,2018.0,72.0,11.0
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2018,True,False,5206960404cbe05162e9f95067c56c3d0bd6f81b,https://www.semanticscholar.org/paper/52069604...,Annual Meeting of the Association for Computat...,2018.0,118.0,13.0


In [31]:
# Duplicate check: number of distinct identifiers per column (a missing value
# counts as one distinct value); compare with the row count printed earlier
for id_column in ("paperId", "doi"):
    print(df[id_column].nunique(dropna=False))

8599
8599


In [32]:
# interpretability_papers = df[df["interpretability"] == True]

In [33]:
# n = 100
# head = interpretability_papers.head(n).drop("Unnamed: 0", axis=1)
# # head

In [34]:
# one dict per data-frame row, mapping column name -> value
papers = df.to_dict("records")
len(papers)

9021

In [35]:
papers[0:2]  # show the first two records

[{'id': '187',
  'title': 'Probabilistic FastText for Multi-Sense Word Embeddings',
  'area': 'Word Semantics',
  'interpretability': nan,
  'doi': '10.18653/v1/P18-1001',
  'source': 'ACL2018',
  'working_doi': True,
  'classifier_interpretability_prediction': False,
  'paperId': '812580f6800bf0db4df0f48af6893b5ff1970fb3',
  'url': 'https://www.semanticscholar.org/paper/812580f6800bf0db4df0f48af6893b5ff1970fb3',
  'venue': 'Annual Meeting of the Association for Computational Linguistics',
  'year': 2018.0,
  'citationCount': 112.0,
  'influentialCitationCount': 13.0},
 {'id': '1520',
  'title': 'A La Carte Embedding: Cheap but Effective Induction of Semantic Feature Vectors',
  'area': 'Word Semantics',
  'interpretability': nan,
  'doi': '10.18653/v1/P18-1002',
  'source': 'ACL2018',
  'working_doi': True,
  'classifier_interpretability_prediction': False,
  'paperId': '542b9b17bb6273ee68281b7f29994b2b29d038b8',
  'url': 'https://www.semanticscholar.org/paper/542b9b17bb6273ee68281b7f

---

## Build citation graph

In [36]:
# Directed graph that will hold one node per paper; the edges added further
# down run from a cited paper to the paper citing it.
G = nx.DiGraph() # create graph object

In [37]:
# register every paper of the clean data frame as a node keyed by its paperId,
# with all columns as node attributes; rows sharing a paperId end up in one node
# whose attributes are updated by the later row
G.add_nodes_from(
    (record["paperId"], record)
    for record in tqdm(papers, desc="Adding nodes to graph")
)

Adding nodes to graph: 100%|██████████| 9021/9021 [00:00<00:00, 897883.64it/s]


In [38]:
len(G)  # number of nodes currently in the graph

8599

In [39]:
# inspect the first three nodes together with their attribute dicts
for nid, attributes in list(G.nodes.data())[:3]:
    print(nid)
    print(attributes)

812580f6800bf0db4df0f48af6893b5ff1970fb3
{'id': '187', 'title': 'Probabilistic FastText for Multi-Sense Word Embeddings', 'area': 'Word Semantics', 'interpretability': nan, 'doi': '10.18653/v1/P18-1001', 'source': 'ACL2019', 'working_doi': True, 'classifier_interpretability_prediction': False, 'paperId': '812580f6800bf0db4df0f48af6893b5ff1970fb3', 'url': 'https://www.semanticscholar.org/paper/812580f6800bf0db4df0f48af6893b5ff1970fb3', 'venue': 'Annual Meeting of the Association for Computational Linguistics', 'year': 2018.0, 'citationCount': 112.0, 'influentialCitationCount': 13.0}
542b9b17bb6273ee68281b7f29994b2b29d038b8
{'id': '1520', 'title': 'A La Carte Embedding: Cheap but Effective Induction of Semantic Feature Vectors', 'area': 'Word Semantics', 'interpretability': nan, 'doi': '10.18653/v1/P18-1002', 'source': 'ACL2019', 'working_doi': True, 'classifier_interpretability_prediction': False, 'paperId': '542b9b17bb6273ee68281b7f29994b2b29d038b8', 'url': 'https://www.semanticscholar

In [40]:
# query the API once per node (i.e., paper) and remember which papers cite it
# NOTE: running this will take some time (~1:30h on my Macbook Pro M1)
# A plain loop (not a comprehension) so that results gathered so far stay
# available in `citing_papers` if the run is interrupted.
citing_papers = {}
node_progress = tqdm(G.nodes.data(), desc="retrieving citations")
for node_id, node_attributes in node_progress:
    citing_papers[node_id] = get_all_citations(node_attributes)
    

retrieving citations:   0%|          | 31/8599 [00:18<1:13:41,  1.94it/s]

Continuing with offset=1000
Continuing with offset=2000
Continuing with offset=3000


retrieving citations:   1%|▏         | 116/8599 [01:06<1:34:48,  1.49it/s]

Continuing with offset=1000


retrieving citations:   3%|▎         | 299/8599 [02:29<1:02:11,  2.22it/s]

Continuing with offset=1000


retrieving citations:   4%|▍         | 351/8599 [02:53<1:15:38,  1.82it/s]

Continuing with offset=1000


retrieving citations:   5%|▍         | 396/8599 [03:13<47:14,  2.89it/s]  

Continuing with offset=1000
Continuing with offset=2000


retrieving citations:   7%|▋         | 620/8599 [04:50<41:47,  3.18it/s]  

Continuing with offset=1000
Continuing with offset=2000
Continuing with offset=3000
Continuing with offset=4000
Continuing with offset=5000
Continuing with offset=6000
Continuing with offset=7000


retrieving citations:  11%|█▏        | 980/8599 [07:58<51:01,  2.49it/s]   

Continuing with offset=1000


retrieving citations:  13%|█▎        | 1104/8599 [09:05<39:12,  3.19it/s]  

Continuing with offset=1000
Continuing with offset=2000
Continuing with offset=3000
Continuing with offset=4000


retrieving citations:  14%|█▍        | 1210/8599 [10:14<1:12:27,  1.70it/s]

Continuing with offset=1000


retrieving citations:  23%|██▎       | 1963/8599 [15:23<51:49,  2.13it/s]  

Continuing with offset=1000


retrieving citations:  26%|██▌       | 2225/8599 [17:37<37:26,  2.84it/s]  

Continuing with offset=1000


retrieving citations:  26%|██▋       | 2262/8599 [17:54<39:06,  2.70it/s]  

Continuing with offset=1000


retrieving citations:  28%|██▊       | 2444/8599 [19:24<38:38,  2.65it/s]  

Continuing with offset=1000


retrieving citations:  33%|███▎      | 2868/8599 [22:25<40:23,  2.36it/s]  

Continuing with offset=1000
Continuing with offset=2000


retrieving citations:  35%|███▍      | 2997/8599 [23:18<28:25,  3.29it/s]  

Continuing with offset=1000
Continuing with offset=2000


retrieving citations:  42%|████▏     | 3575/8599 [27:07<27:24,  3.06it/s]  

Continuing with offset=1000
Continuing with offset=2000


retrieving citations:  45%|████▍     | 3867/8599 [28:54<24:33,  3.21it/s]  

Continuing with offset=1000


retrieving citations:  86%|████████▌ | 7398/8599 [46:15<05:49,  3.44it/s]  

Continuing with offset=1000


retrieving citations:  87%|████████▋ | 7442/8599 [46:35<06:42,  2.87it/s]

Continuing with offset=1000


retrieving citations:  88%|████████▊ | 7545/8599 [47:12<06:38,  2.65it/s]

Continuing with offset=1000


retrieving citations:  88%|████████▊ | 7569/8599 [47:24<05:37,  3.05it/s]

Continuing with offset=1000
Continuing with offset=2000
Continuing with offset=3000
Continuing with offset=4000
Continuing with offset=5000
Continuing with offset=6000


retrieving citations:  89%|████████▉ | 7683/8599 [48:27<05:02,  3.03it/s]  

Continuing with offset=1000


retrieving citations:  91%|█████████ | 7843/8599 [49:27<03:11,  3.95it/s]

Continuing with offset=1000


retrieving citations:  91%|█████████▏| 7856/8599 [49:37<06:01,  2.06it/s]

Continuing with offset=1000
Continuing with offset=2000


retrieving citations:  96%|█████████▌| 8235/8599 [51:48<02:01,  3.01it/s]

Continuing with offset=1000


retrieving citations:  97%|█████████▋| 8313/8599 [52:18<01:09,  4.10it/s]

Continuing with offset=1000
Continuing with offset=2000


retrieving citations: 100%|██████████| 8599/8599 [54:10<00:00,  2.65it/s]


In [41]:
# number of papers for which citations were retrieved
print(f"{len(citing_papers)}")

8599


In [60]:
# {'id': '187',
#  'title': 'Probabilistic FastText for Multi-Sense Word Embeddings',
#  'area': 'Word Semantics',
#  'interpretability': nan,
#  'doi': '10.18653/v1/P18-1001',
#  'source': 'ACL2019',
#  'working_doi': True,
#  'classifier_interpretability_prediction': False,
#  'paperId': '812580f6800bf0db4df0f48af6893b5ff1970fb3',
#  'url': 'https://www.semanticscholar.org/paper/812580f6800bf0db4df0f48af6893b5ff1970fb3',
#  'venue': 'Annual Meeting of the Association for Computational Linguistics',
#  'year': 2018,
#  'citationCount': 112,
#  'influentialCitationCount': 13,
#  'value': '812580f6800bf0db4df0f48af6893b5ff1970fb3',
#  'name': '812580f6800bf0db4df0f48af6893b5ff1970fb3'}

In [None]:
# TODO(mm): handle missing values

In [42]:
# add citing papers to the graph
edge_flags = ("isInfluential", "background", "results", "method")
for cited_id, citations in tqdm(citing_papers.items(), desc="Adding citations to the graph"):
    for citation in citations:
        citing = citation["citingPaper"]
        if citing["paperId"] is None:
            continue  # no identifier to key the node on
        # TODO(mm): figure out how to best handle None attributes
        node_attrs = {name: ("None" if value is None else value) for name, value in citing.items()}
        G.add_node(node_attrs["paperId"], **node_attrs)
        # the edge runs from the cited paper to the paper citing it
        G.add_edge(cited_id, citing["paperId"], **{flag: citation[flag] for flag in edge_flags})

Adding citations to the graph: 100%|██████████| 8599/8599 [00:02<00:00, 3819.12it/s]


In [43]:
len(G)  # node count after adding the citing papers

110934

In [53]:
# for edge in G.edges.data():
#     print(edge)
#     break

In [54]:
# # update node attributes
# for nid, attributes in G.nodes.data():
#     # get edges for a specific node
#     edges = G.edges([nid])
#     # update attributes 
#     attributes["citationCount"] = len(edges)
#     nx.set_node_attributes(G, {nid:attributes})

In [55]:
# # plot graph

# # get unique groups
# groups = set(nx.get_node_attributes(G,'citationCount').values())
# mapping = dict(zip(sorted(groups), count()))
# nodes = G.nodes()
# # colors = [mapping[G.nodes[n]['citationCount'] if "citationCount" in G.nodes[n] else 0] for n in nodes]
# colors = [mapping[G.nodes[n]['citationCount']] for n in nodes]

# fig, axes = plt.subplots(nrows=1, ncols=1, dpi=120)

# # nx.draw(G, with_labels=False, font_weight='bold', node_color="red", node_size=5, width=0.5)

# # draw nodes and edges separately
# pos = nx.spring_layout(G)
# ec = nx.draw_networkx_edges(G, pos, alpha=0.2, width=1)
# nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_color=colors, node_size=5, cmap=plt.cm.jet)
# # nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_color="red", node_size=5)

# plt.colorbar(nc, label="number of citations")
# plt.axis('off')
# plt.show()

In [44]:
# convert the graph to networkx's Cytoscape JSON structure and store it on disk
# NOTE(review): nodes that already carry an 'id' attribute may be exported with
# that value as their element id - confirm against the nx.cytoscape_data docs
G_json = nx.cytoscape_data(G)

with open('../citationgraph/graph.json', 'w') as f:
    f.write(json.dumps(G_json))

In [45]:
# verify: rebuild a graph from the saved file and compare its node count with
# the in-memory graph
with open('../citationgraph/graph.json') as f:
    GG = nx.cytoscape_graph(json.load(f))

assert GG.number_of_nodes() == G.number_of_nodes()

In [46]:
# save graph in GEXF format (for Gephi); node and edge attributes are written
# along with the structure, which is why list-valued attributes are avoided
# when querying the API
nx.write_gexf(G, '../citationgraph/graph.gexf', encoding='utf-8')

In [None]:
# # plot graph

# fig, axes = plt.subplots(nrows=1, ncols=1, dpi=120)

# # nx.draw(G, with_labels=False, font_weight='bold', node_color="red", node_size=5, width=0.5)

# # draw nodes and edges separately
# pos = nx.spring_layout(G)
# ec = nx.draw_networkx_edges(G, pos, alpha=0.2, width=1)
# nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_color=colors, node_size=5, cmap=plt.cm.jet)
# # nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_color="red", node_size=5)

# # plt.colorbar(nc, label="number of citations")
# plt.axis('off')
# plt.show()

### Graph algorithms

In [42]:
# size of a dominating set of the citation graph
# NOTE(review): presumably this is *a* dominating set found by networkx, not a
# minimum one - confirm before interpreting the number
len(nx.dominating_set(G))

4682