In [174]:
import os, sys
sys.path.append(os.path.dirname(os.getcwd()))
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker
import os
import json
import zipfile
import networkx as nx
import statsmodels.api as sm

from tqdm import tqdm
from matplotlib import rc
from matplotlib import colormaps
from collections import defaultdict

In [175]:
# Load the paper metadata table; the first CSV column is used as the index.
papers_csv = "../data/cl_papers.csv"
df = pd.read_csv(papers_csv, index_col=0)
df.head()

Unnamed: 0,id,title,area,source,year,doi,abstract,semantic_scholar_id
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.188,Product-related question answering platforms n...,4c61df1b4b9a164fec1a34587b4fffae029cd18c
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.272,We study knowledge-grounded dialogue generatio...,3447a432f724aa36595643446acda5b78943db19
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.145,Video-grounded dialogues are very challenging ...,f4a2acfeb1705df3f430cc53ace26e1dbbbcbd16
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.579,With the advancements in natural language proc...,24ed85ad966823868c1694a19385d01c6ad71008
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.460,Capturing associations for knowledge graphs (K...,3d61a28b9429fc8f7047fc379a0134a3765edbcb


In [176]:
from classifier import is_interpretability_title_and_abstract, is_mt_title_and_abstract

In [177]:
# Locations of the serialized citation graph and of its zipped copy.
zip_path = '../citationgraph/graph.zip'
json_path = '../citationgraph/graph.json'

# Unpack the archive next to itself only when the JSON is not already on disk.
json_already_present = os.path.exists(json_path)
if not json_already_present:
    extract_dir = os.path.dirname(zip_path)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)
    print("ZIP file extracted.")

# Parse the JSON first, then build the networkx graph from the parsed data.
with open(json_path) as graph_file:
    graph_json = json.load(graph_file)
G = nx.cytoscape_graph(graph_json)

G.number_of_nodes()

185384

In [189]:
# Collect every node without a usable 'mt_prediction' attribute, i.e. the key
# is absent or it is present but set to None.
missing_nodes = [
    node_id
    for node_id in G.nodes()
    if G.nodes[node_id].get('mt_prediction') is None
]
len(missing_nodes)

26111

In [222]:
def get_dois(paper_ids: list[str]):
    """Look up the DOI of each paper through the Semantic Scholar batch endpoint.

    The ids are sent in chunks of 500 and only the ``externalIds`` field is
    requested. One entry is appended to the result for every element of every
    response, in response order.

    NOTE(review): this function relies on the module-level names ``requests``,
    ``API_KEY`` and ``chunk_list``, none of which is defined in the visible
    cells of this notebook -- make sure they exist before calling.

    Args:
        paper_ids: Paper identifiers accepted by the batch endpoint.

    Returns:
        A list holding, per returned paper, its DOI string or ``None`` when
        the paper entry is null or carries no DOI.

    Raises:
        requests.HTTPError: If any chunk request returns an HTTP error status.
    """
    url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
    fields = "externalIds"
    chunk_size = 500
    all_papers = []

    for chunk in tqdm(chunk_list(paper_ids, chunk_size), desc="Fetching papers"):
        response = requests.post(url,
                                 headers={"x-api-key": API_KEY},
                                 params={"fields": fields},
                                 json={"ids": chunk})
        # Fail loudly on HTTP errors (rate limit, bad key, ...). Otherwise the
        # error payload, a dict, would be iterated as if it were the paper list
        # and the results would no longer line up with `paper_ids`.
        response.raise_for_status()

        for paper_dict in response.json():
            # Both the paper entry and its `externalIds` may be null; treat
            # either case as "no DOI" instead of raising a TypeError.
            external_ids = (paper_dict or {}).get('externalIds') or {}
            all_papers.append(external_ids.get('DOI'))

    return all_papers

# Resolve DOIs for the nodes that lack an MT prediction. A later cell zips
# `papers` with `missing_nodes`, so the two lists are expected to line up
# element by element.
papers = get_dois(missing_nodes)

Fetching papers: 53it [01:01,  1.17s/it]


In [233]:
# Build both lookup directions between graph node ids and DOIs, skipping the
# papers for which no DOI was found.
resolved_pairs = [(ssid, doi) for doi, ssid in zip(papers, missing_nodes) if doi]
id_to_doi = {ssid: doi for ssid, doi in resolved_pairs}
doi_to_id = {doi: ssid for ssid, doi in resolved_pairs}

In [212]:
import pyalex

# Contact address handed to pyalex's configuration -- presumably forwarded to
# OpenAlex with each request; confirm against the pyalex documentation.
pyalex.config.email = "tomvergara@uc.cl"
# Allow at most one retry per request with a small backoff factor (the exact
# timing semantics are defined by pyalex -- TODO confirm).
pyalex.config.max_retries = 1
pyalex.config.retry_backoff_factor = 0.1

In [240]:
def load_abstract_cache(path):
    """Return the DOI -> {'title', 'abstract'} mapping stored as JSON at `path`.

    Args:
        path: Location of a JSON file previously written from
            `doi_to_title_and_abstract`.

    Returns:
        The parsed mapping, or an empty dict when `path` does not exist.
    """
    if not os.path.exists(path):
        return {}
    with open(path) as cache_file:
        return json.load(cache_file)


# Seed the mapping from the file that the last cell of this notebook writes,
# so the slow per-DOI fetch only has to cover DOIs that are not cached yet.
# On a first run (no file) this is the same empty dict as before.
doi_to_title_and_abstract = load_abstract_cache('../citationgraph/abstracts.json')

In [243]:
# Fetch title and abstract from OpenAlex (via pyalex) for every DOI that is
# not in `doi_to_title_and_abstract` yet. The lookup stays best-effort, but
# failures are recorded in `failed_dois` (DOI -> error repr) instead of being
# dropped silently.
failed_dois = {}

for doi in tqdm(doi_to_id.keys()):
    if doi in doi_to_title_and_abstract:
        continue
    try:
        paper = pyalex.Works()['https://doi.org/' + doi]
        title = paper['title']
        abstract = paper['abstract']
    except Exception as err:
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop.
        failed_dois[doi] = repr(err)
        continue

    # Keep only papers for which both pieces of text are available.
    if title and abstract:
        doi_to_title_and_abstract[doi] = { 'title': title, 'abstract': abstract }

len(failed_dois)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24016/24016 [14:10:25<00:00,  2.12s/it]


In [244]:
# Number of DOIs for which both a title and an abstract were stored.
len(doi_to_title_and_abstract)

16954

In [246]:
# Classify the nodes that still lack an MT prediction, using the title and
# abstract stored for their DOI. Nodes with no DOI, or with no stored text for
# that DOI, are left untouched.
for node in tqdm(G.nodes()):
    attrs = G.nodes[node]
    if attrs.get('mt_prediction') is not None:
        continue
    if node not in id_to_doi:
        continue
    doi = id_to_doi[node]
    if doi not in doi_to_title_and_abstract:
        continue

    text = doi_to_title_and_abstract[doi]
    abstract = text['abstract']
    title = text['title']
    # Same order as before: MT prediction first, then interpretability.
    attrs['mt_prediction'] = is_mt_title_and_abstract(title, abstract)
    attrs['interpretability_prediction'] = is_interpretability_title_and_abstract(title, abstract)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185384/185384 [1:46:01<00:00, 29.14it/s]


In [247]:
# Persist the enriched graph in Cytoscape JSON format. The data is written to
# a sibling temporary file and then swapped into place, so a failed or
# interrupted dump cannot leave a truncated graph.json behind.
graph_out_path = '../citationgraph/graph.json'
graph_tmp_path = graph_out_path + '.tmp'

G_json = nx.cytoscape_data(G)
with open(graph_tmp_path, 'w') as f:
    json.dump(G_json, f)
# Replace the previous file only after the dump has completed.
os.replace(graph_tmp_path, graph_out_path)

In [250]:
# Store the collected titles and abstracts, keyed by DOI, as JSON.
abstracts_path = '../citationgraph/abstracts.json'
serialized_abstracts = json.dumps(doi_to_title_and_abstract)
with open(abstracts_path, 'w') as out_file:
    out_file.write(serialized_abstracts)