In [1]:
import os, sys
sys.path.append(os.path.dirname(os.getcwd()))
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker
import os
import json
import zipfile
import networkx as nx
import statsmodels.api as sm

from tqdm import tqdm
from matplotlib import rc
from matplotlib import colormaps
from collections import defaultdict

In [2]:
df = pd.read_csv("../data/cl_papers.csv", sep=",", index_col=0)
df.head()

Unnamed: 0,id,title,area,source,year,doi,abstract,semantic_scholar_id
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.188,Product-related question answering platforms n...,4c61df1b4b9a164fec1a34587b4fffae029cd18c
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.272,We study knowledge-grounded dialogue generatio...,3447a432f724aa36595643446acda5b78943db19
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.145,Video-grounded dialogues are very challenging ...,f4a2acfeb1705df3f430cc53ace26e1dbbbcbd16
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.579,With the advancements in natural language proc...,24ed85ad966823868c1694a19385d01c6ad71008
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.460,Capturing associations for knowledge graphs (K...,3d61a28b9429fc8f7047fc379a0134a3765edbcb


In [3]:
from classifier import is_interpretability_title_and_abstract, is_mt_title_and_abstract, is_dialogue_title_and_abstract, is_ie_title_and_abstract

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
json_path = '../citationgraph/graph.json'
zip_path = '../citationgraph/graph.zip'

if not os.path.exists(json_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(zip_path))
    print("ZIP file extracted.")
 
with open(json_path) as f:
    graph_json = json.load(f)
    G = nx.cytoscape_graph(graph_json)

G.number_of_nodes()

185384

In [5]:
missing_nodes = []

for node in G.nodes():
  if 'ie_prediction' not in G.nodes[node] or G.nodes[node]['ie_prediction'] is None:
      missing_nodes.append(node)
len(missing_nodes)

185384

In [6]:
from utils import chunk_list, API_KEY
import requests

def get_dois(paper_ids: list[str]):
    chunk_size = 500
    all_papers = []

    for chunk in tqdm(chunk_list(paper_ids, chunk_size), desc="Fetching papers", total=(1 + len(paper_ids) // chunk_size)):
        url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
        fields = "externalIds,abstract,title"

        response = requests.post(url,
                                 headers={"x-api-key": API_KEY},
                                 params={"fields": fields},
                                 json={"ids": chunk})

        for paper_dict in response.json():
            if paper_dict is None:
                all_papers.append(None)
                continue

            if 'DOI' in paper_dict['externalIds']:
                doi = paper_dict['externalIds']['DOI']
            else:
                doi = None
            abstract = paper_dict['abstract']
            title = paper_dict['title']
            all_papers.append({ 'doi': doi, 'abstract': abstract, 'title': title})

    return all_papers

papers = get_dois(missing_nodes)

Output()

Fetching papers: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [09:31<00:00,  1.54s/it]


In [7]:
doi_to_title_and_abstract = {}

In [8]:
for ssid, paper in zip(missing_nodes, papers):
    if paper is not None:
        abstract = paper['abstract']
        title = paper['title']
        if title is not None and abstract is not None:
            doi_to_title_and_abstract[ssid] = { 'title': title, 'abstract': abstract }

In [20]:
len(id_to_doi)

142470

In [12]:
id_to_doi = {}
doi_to_id = {}
for paper, ssid in zip(papers, missing_nodes):
    if paper is not None:
        doi = paper['doi']
        if doi:
            id_to_doi[ssid] = doi
            doi_to_id[doi] = ssid

In [26]:
import pyalex

pyalex.config.email = "tomvergara@uc.cl"
pyalex.config.max_retries = 1
pyalex.config.retry_backoff_factor = 0.1

In [13]:
for doi in tqdm(doi_to_id.keys()):
    if doi not in doi_to_title_and_abstract and doi in prev:
        doi_to_title_and_abstract[doi] = prev[doi]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142414/142414 [00:00<00:00, 1883381.45it/s]


In [None]:
for doi in tqdm(doi_to_id.keys()):
    if doi not in doi_to_title_and_abstract:
        try:
            paper = pyalex.Works()['https://doi.org/' + doi]
            title = paper['title']
            abstract = paper['abstract']
            if title and abstract:
                doi_to_title_and_abstract[doi] = { 'title': title, 'abstract': abstract }
        except:
            pass

In [None]:
len(doi_to_title_and_abstract)

In [15]:
for doi, paper in doi_to_title_and_abstract.items():
    title = paper['title']
    abstract = paper['abstract']
    if abstract is None:
        print('aaa')

In [16]:
for node in tqdm(G.nodes()):
    if 'ie_prediction' not in G.nodes[node] or G.nodes[node]['ie_prediction'] is None:
        if node in id_to_doi:
            doi = id_to_doi[node]
            if doi in doi_to_title_and_abstract:
                
                abstract = doi_to_title_and_abstract[doi]['abstract']
                title = doi_to_title_and_abstract[doi]['title']
                ie = is_ie_title_and_abstract(title, abstract)
                G.nodes[node]['ie_prediction'] = ie


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185384/185384 [51:42<00:00, 59.75it/s]


In [17]:
n = 0
for node in tqdm(G.nodes()):
    if 'ie_prediction' in G.nodes[node] and G.nodes[node]['ie_prediction'] is not None:
        n += 1
n

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185384/185384 [00:00<00:00, 275082.25it/s]


16949

In [23]:
n = 0
for node in tqdm(G.nodes()):
    if 'ie_prediction' not in G.nodes[node] or G.nodes[node]['ie_prediction'] is None:
        if node in id_to_doi: 
            n += 1
n

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185384/185384 [00:00<00:00, 1186388.81it/s]


125521

In [18]:
len(id_to_doi)

142470

In [247]:
G_json = nx.cytoscape_data(G)
with open('../citationgraph/graph.json', 'w') as f:
    json.dump(G_json, f)

In [250]:
with open('../citationgraph/abstracts.json', 'w') as f:
    json.dump(doi_to_title_and_abstract, f)

In [10]:
with open('../citationgraph/abstracts.json', 'r') as f:
    prev = json.load(f)

In [11]:
prev

{'10.1016/j.cogsys.2021.09.001': {'title': 'Vector Semiotic Model for Visual Question Answering',
  'abstract': 'In this paper, we propose a Vector Semiotic Model as a possible solution to the symbol grounding problem in the context of Visual Question Answering. The Vector Semiotic Model combines the advantages of a Semiotic Approach implemented in the Sign-Based World Model and Vector Symbolic Architectures. The Sign-Based World Model represents information about a scene depicted on an input image in a structured way and grounds abstract objects in an agent’s sensory input. We use the Vector Symbolic Architecture to represent the elements of the Sign-Based World Model on a computational level. Properties of a high-dimensional space and operations defined for high-dimensional vectors allow encoding the whole scene into a high-dimensional vector with the preservation of the structure. That leads to the ability to apply explainable reasoning to answer an input question. We conducted expe