In [2]:
# Rank the papers by citation count and list the ten most-cited ones.

import pickle

# NOTE: unpickling can execute arbitrary code embedded in the file; only load
# graph.gpickle when it comes from a trusted source.
with open('graph.gpickle', 'rb') as f:
    graph = pickle.load(f)

# Each in_degree() entry is a (node, in-degree) pair; the in-degree is what the
# report below prints as the number of citations.
by_citations = list(graph.in_degree())
by_citations.sort(key=lambda pair: pair[1], reverse=True)
papers_top_10 = by_citations[:10]

for i, (n, citation) in enumerate(papers_top_10, start=1):
    attrs = graph.nodes[n]
    # Fall back to "unknown" / the node id when metadata is missing.
    paper_year = attrs.get("year", "unknown")
    paper_name = attrs.get("title", n)
    print(f"{i}. {paper_name} from year {paper_year}: {citation} citations")
    

1. Attention is All you Need from year 2017: 2200 citations
2. Generative Adversarial Nets from year 2014: 1193 citations
3. PyTorch  An Imperative Style  High Performance Deep Learning Library from year 2019: 1151 citations
4. Language Models are Few Shot Learners from year 2020: 1141 citations
5. Denoising Diffusion Probabilistic Models from year 2020: 849 citations
6. GANs Trained by a Two Time Scale Update Rule Converge to a Local Nash Equilibrium from year 2017: 530 citations
7. A  Sampling from year 2014: 458 citations
8. Inductive Representation Learning on Large Graphs from year 2017: 401 citations
9. Faster R CNN  Towards Real Time Object Detection with Region Proposal Networks from year 2015: 400 citations
10. Diffusion Models Beat GANs on Image Synthesis from year 2021: 387 citations


In [8]:
import random
import pickle
import networkx as nx

# Find a random citation chain to show example

# Only nodes with at least one outgoing edge can begin a chain. (A test of the
# form `0 <= out_degree` is always true and lets dead-end nodes through.)
begin = [i for i in graph.nodes if graph.out_degree(i) > 0]
if not begin:
    raise ValueError("graph has no node with outgoing edges; cannot build a citation chain")
first_node = random.choice(begin)

# `y` holds the example chain as a list of nodes; None means nothing was found.
y = None
for i in graph.nodes:
    if first_node == i:
        continue
    try:
        # all_simple_paths is a generator, so take only its first path instead
        # of materialising every path. cutoff=4 allows at most 4 edges (5 nodes).
        y = next(nx.all_simple_paths(graph, source=first_node, target=i, cutoff=4), None)
    except nx.NetworkXNoPath:
        continue
    if y is not None:
        break

if y is None:
    # Keep the cell runnable: report the miss and show an empty chain.
    print(f"No citation chain found starting from {first_node}")
    y = []
else:
    print(f"The length of the citation chain is {len(y)}")


for k, n in enumerate(y):
    paper_year = graph.nodes[n].get("year", "unknown")
    gaps = graph.nodes[n].get("limitations", [])
    paper_name = graph.nodes[n].get("title", n)

    print(f"Paper {1+k} {paper_name} ({paper_year})")
    # "limitations" may be a list of items or a single string; anything empty
    # or of another type is reported as missing.
    if isinstance(gaps, list) and gaps:
        for b in gaps:
            print(f"  - {b}")
    elif isinstance(gaps, str) and gaps:
        print(f"  {gaps}")
    else:
        print("No limitations found")
    print("\n")

The length of the citation chain is 5
Paper 1 How to Characterize The Landscape of Overparameterized Convolutional Neural Networks (2020)
  - The loss landscape of the overparameterized convolutional neural network (CNN) is not fully understood, especially when the network is trained on a single layer.
- The model is not suitable for training with multiple layers, which may not be suitable for deep neural networks (DNNs).
- Although the model can be trained on multiple layers (e.g., ReLU), it may not fully capture the feature distributions of multiple layers.
– The model's performance is not optimized for deep learning, which is a limitation of the current work.


Paper 2 On the Global Convergence of Gradient Descent for Over parameterized Models using Optimal Transport (2018)
  - The study focuses on the Wasserstein gradient flow, which is a by-product of optimal transport theory.  
- It does not address the generalizability of the gradient flow in the model, which may not be fully un

In [None]:
# Find 1000 citation chains for evaluation

from tqdm import tqdm
import json
from sklearn.metrics.pairwise import cosine_similarity

chain_number = 1000  # number of random start nodes to try (at most one chain each)
chains = []
min_len = 3  # minimum number of papers (nodes) in a chain
max_len = 5  # maximum number of papers (nodes) in a chain

# Only nodes with at least one outgoing edge can begin a chain. (A test of the
# form `0 <= out_degree` is always true and lets dead-end nodes through.)
begin = [i for i in graph.nodes if graph.out_degree(i) > 0]
if not begin:
    raise ValueError("graph has no node with outgoing edges; cannot build citation chains")

# Extract up to `chain_number` chains from graph: one chain per random start node.
for _ in tqdm(range(chain_number)):
    first_node = random.choice(begin)
    found = None
    for y in graph.nodes:
        if y == first_node:
            continue
        try:
            # `cutoff` counts edges while min_len/max_len count nodes, hence the -1.
            path = nx.all_simple_paths(graph, source=first_node, target=y, cutoff=max_len - 1)
            # Consume the generator lazily and stop at the first long-enough path.
            found = next((j for j in path if min_len <= len(j)), None)
        except nx.NetworkXNoPath:
            continue
        if found is not None:
            # One chain per start node: stop scanning targets once we have it.
            break
    if found is not None:
        chains.append(found)

print(f"{len(chains)} chains were found")

In [None]:
# Evaluate the chains
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared TF-IDF vectorizer; the evaluation loop refits it on every chain.
vectorizer = TfidfVectorizer()
# One result record per evaluated chain; dumped to JSON at the end of the cell.
res = []

def find_lims(child):
    """Return the "limitations" attribute of node `child` as a single string.

    A string value is returned unchanged, a list is joined with single spaces,
    and a missing attribute or any other type yields an empty string.
    """
    raw = graph.nodes[child].get("limitations", [])
    if isinstance(raw, str):
        return raw
    if isinstance(raw, list):
        return " ".join(raw)
    return ""

# Score every chain: collect each paper's metadata, then compute the TF-IDF
# cosine similarity between the limitation texts of consecutive papers.
for c in tqdm(chains):
    res_chain = {"chain": [], "similarities": []}
    txt = []
    for n in c:
        paper_year = graph.nodes[n].get("year", "unknown")
        gaps = find_lims(n)
        paper_name = graph.nodes[n].get("title", n)
        res_chain["chain"].append({"node": n, "title": paper_name, "year": paper_year, "limitations":gaps})
        txt.append(gaps)
    if len(txt) > 1:
        try:
            tfidf = vectorizer.fit_transform(txt)
        except ValueError:
            # scikit-learn raises ValueError for an empty vocabulary, e.g. when
            # every limitation text in the chain is empty. Leave the chain
            # unscored instead of aborting the whole run.
            pw_sims = []
        else:
            similarities = cosine_similarity(tfidf)
            # Entry [x, x+1] is the similarity between paper x and its successor;
            # cast to a plain float so the value is JSON-serialisable.
            pw_sims = [float(similarities[x, x + 1]) for x in range(len(txt) - 1)]
        res_chain["similarities"] = pw_sims
    res.append(res_chain)

with open("citation_chains_eval.json", "w", encoding="utf-8") as f:
    json.dump(res, f, indent=2)

In [None]:
# Make a histogram figure out of the results from the cosine similarity

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

with open("citation_chains_eval.json", "r", encoding="utf-8") as f:
    res = json.load(f)

# Chains without any similarity score are excluded from every statistic below.
scored = [r for r in res if r["similarities"]]

similarities = [s for r in scored for s in r["similarities"] if s is not None]
avg_per_chain = [np.mean(r["similarities"]) for r in scored]

# Guard the empty case so np.mean does not warn on an empty list.
avg_sim = np.mean(similarities) if similarities else float("nan")
# A chain "increases" when its last pairwise similarity exceeds its first.
inc_trend_count = sum(1 for r in scored if r["similarities"][-1] > r["similarities"][0])
low_s_score = sum(1 for m in avg_per_chain if m < 0.3)
all_chains = len(scored)


def _pct(count):
    """Return `count` as a percentage of the scored chains (NaN if there are none)."""
    return count / all_chains * 100 if all_chains else float("nan")


sns.set(style="whitegrid")
plt.hist(similarities, bins=30, edgecolor='black', color=sns.color_palette("Blues")[2])
plt.title("Distribution of Cosine Similarity Across Citation Chains", fontsize=12, fontweight="bold")
plt.xlabel("Cosine Similarity", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()
# The summary dict is the cell's last expression, so Jupyter displays it.
{
    "Mean Cosine Similarity": avg_sim,
    "% of Chains with Increasing Similarity": _pct(inc_trend_count),
    "% of Chains with Low Similarity (< 0.3)": _pct(low_s_score),
    "Total Chains Evaluated":  all_chains
}