In [19]:
# !pip install unidecode --quiet
# !pip install networkx
import networkx as nx
import os
import re
import json
from unidecode import unidecode
from tqdm import tqdm

def clean_title(title):
    title = unidecode(title.lower())
    title = re.sub(r'\W', '', title)
    return title.strip()

def match_references(ref, find_paper_db):
    clean_ref = clean_title(ref)
    for t, _ in find_paper_db.items():
        title = t.split()
        index = 0
        for w in title:
            h = clean_ref.find(w, index)
            if -1 == h:
                break
            index = h + len(w)
        else:
            return t
    return None

In [20]:
find_paper_db = {}
LIMITATIONS_FOLDER = "../Generate_limitations"
for year in range(2013,2025):
    path = os.path.join(LIMITATIONS_FOLDER, f"test_outputs/output_{year}.jsonl")

    with open(path, "r") as f:
        for line in f:
            e = json.loads(line)
            title = e.get("paper")
            cleaned = clean_title(title)
            find_paper_db[cleaned] = {"title": title, "year":year, "limitations": e.get("generated")}


In [22]:
graph = nx.DiGraph()

for year in range(2013,2025):
    print(f"Processing {year}")
    path = f"references_by_year/references_{year}.json"
    with open(path, "r", encoding="utf-8") as f:
        references_by_year = json.load(f)
    for e in tqdm(references_by_year):
        pdf = e["paper"]
        title = pdf.replace(".pdf", "")
        clean = clean_title(title)
        if clean not in find_paper_db:
            continue

        graph.add_node(clean, **find_paper_db[clean])

        for r in e["references"]:
            target = match_references(r, find_paper_db)
            if target and target != clean:
                graph.add_node(target, **find_paper_db[target])
                graph.add_edge(clean, target)
print(f"Graph nodes: {graph.number_of_nodes()}, Graph edges: {graph.number_of_edges()}")

Processing 2013


100%|██████████| 360/360 [03:54<00:00,  1.53it/s]


Processing 2014


100%|██████████| 411/411 [05:01<00:00,  1.36it/s]


Processing 2015


100%|██████████| 403/403 [05:14<00:00,  1.28it/s]


Processing 2016


100%|██████████| 569/569 [06:31<00:00,  1.45it/s]


Processing 2017


100%|██████████| 679/679 [09:22<00:00,  1.21it/s]


Processing 2018


100%|██████████| 1008/1008 [21:53<00:00,  1.30s/it]


Processing 2019


100%|██████████| 1417/1417 [36:45<00:00,  1.56s/it] 


Processing 2020


100%|██████████| 1898/1898 [1:17:30<00:00,  2.45s/it]


Processing 2021


100%|██████████| 2329/2329 [5:18:02<00:00,  8.19s/it]  


Processing 2022


100%|██████████| 2831/2831 [5:34:58<00:00,  7.10s/it]  


Processing 2023


100%|██████████| 3532/3532 [4:54:25<00:00,  5.00s/it]  


Processing 2024


100%|██████████| 4488/4488 [2:33:54<00:00,  2.06s/it]  

Graph nodes: 19356, Graph edges: 91585





In [None]:
import pickle

with open("graph.gpickle", 'wb') as f:
    pickle.dump(graph, f, pickle.HIGHEST_PROTOCOL)

In [36]:
with open('graph.gpickle', 'rb') as f:
    graph = pickle.load(f)
top_papers = sorted(graph.in_degree(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 most cited papers:")
for n, c in top_papers:
    title = graph.nodes[n].get("title", node)
    print(f"{title} ({n}) {c} citations")

Top 10 most cited papers:
Attention is All you Need (attentionisallyouneed) 2200 citations
Generative Adversarial Nets (generativeadversarialnets) 1193 citations
PyTorch  An Imperative Style  High Performance Deep Learning Library (pytorchanimperativestylehighperformancedeeplearninglibrary) 1151 citations
Language Models are Few Shot Learners (languagemodelsarefewshotlearners) 1141 citations
Denoising Diffusion Probabilistic Models (denoisingdiffusionprobabilisticmodels) 849 citations
GANs Trained by a Two Time Scale Update Rule Converge to a Local Nash Equilibrium (ganstrainedbyatwotimescaleupdateruleconvergetoalocalnashequilibrium) 530 citations
A  Sampling (asampling) 458 citations
Inductive Representation Learning on Large Graphs (inductiverepresentationlearningonlargegraphs) 401 citations
Faster R CNN  Towards Real Time Object Detection with Region Proposal Networks (fasterrcnntowardsrealtimeobjectdetectionwithregionproposalnetworks) 400 citations
Diffusion Models Beat GANs on Ima