In [None]:
from config import Config
import networkx as nx
import pickle
import json
import pandas as pd

# Citation graph, unpickled from the path configured in Config.GRAPH_BIN_PATH.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so the
# file must come from a trusted source (e.g. an earlier step of this project).
g: nx.DiGraph = None
with open(Config.GRAPH_BIN_PATH, 'rb') as infile:
    g = pickle.load(infile)

# Mapping whose keys are paper ids; later cells intersect these keys with the
# graph's nodes. The encoding is given explicitly so that decoding does not
# depend on the platform's locale default.
papers: dict = None
with open(Config.REDUCED_JSON_PATH, 'r', encoding='utf-8') as infile:
    papers = json.load(infile)

# Mapping of paper id -> record; later cells read the 'discipline', 'authors'
# and 'title' entries of these records.
unarxiv: dict = None
with open(Config.UNARXIV_REDUCED_JSON_PATH, 'r', encoding='utf-8') as infile:
    unarxiv = json.load(infile)

In [None]:
def pairwise_impact(id_a: str, id_b:str) -> float:
    """Return the weighting coefficient for a pair of papers.

    Both ids are looked up in the module-level ``unarxiv`` mapping. The
    coefficient starts at 1.0 and is adjusted by two rules:

    * records with different ``'discipline'`` values multiply it by 1.5;
    * records that share authors multiply it by ``1 - shared / longest``,
      where ``shared`` is the number of distinct authors present in both
      records and ``longest`` is the length of the longer ``'authors'`` entry.

    Args:
        id_a: Key of the first paper in ``unarxiv``.
        id_b: Key of the second paper in ``unarxiv``.

    Returns:
        The resulting coefficient; 0.0 when every author is shared.

    Raises:
        KeyError: If either id, or a required field of its record, is missing.
    """
    record_a, record_b = unarxiv[id_a], unarxiv[id_b]

    # Different field -> more impact
    same_discipline = record_a['discipline'] == record_b['discipline']
    coefficient = 1.0 if same_discipline else 1.5

    # Shared authors -> less impact
    authors_a, authors_b = record_a['authors'], record_b['authors']
    shared_count = len(set(authors_a) & set(authors_b))
    if shared_count == 0:
        return coefficient

    longest = max(len(authors_a), len(authors_b))
    return coefficient * (1 - (shared_count / longest))

In [None]:
def get_node_score(id: str) -> float:
    """Return the impact score of node ``id``, computing and caching it if needed.

    A node's score is ``1.0`` plus, for every out-edge ``(node, citing)`` of the
    module-level graph ``g``, ``score(citing) * pairwise_impact(node, citing)``.
    Scores are memoised in the node attribute ``'impact_score'``, so each node
    is computed at most once.

    The traversal is an explicit-stack post-order walk rather than recursion,
    so long chains of edges cannot exhaust Python's recursion limit. Edges that
    point back to a node still being computed (cycles, self-loops) are skipped
    instead of recursing forever; for such graphs the scores of the nodes on
    the cycle depend on which node the traversal entered first. For acyclic
    graphs the result, including the floating-point summation order, matches
    the plain recursive definition.

    Args:
        id: Identifier of a node in ``g``.

    Returns:
        The node's impact score.

    Raises:
        KeyError: If ``id`` is not a node of ``g``.
    """
    if 'impact_score' in g.nodes[id]:
        return g.nodes[id]['impact_score']

    def _citing_of(node):
        # Materialise the targets so the edge view is not held during the walk.
        return [citing for _, citing in g.out_edges(node)]

    # Nodes whose score is currently being computed (the active DFS path).
    on_path = {id}
    # Each frame: (node, its out-edge targets, iterator over not-yet-visited targets).
    first_targets = _citing_of(id)
    stack = [(id, first_targets, iter(first_targets))]

    while stack:
        node, targets, pending = stack[-1]
        for citing in pending:
            # Descend only into targets that are neither cached nor in progress.
            if citing not in on_path and 'impact_score' not in g.nodes[citing]:
                on_path.add(citing)
                citing_targets = _citing_of(citing)
                stack.append((citing, citing_targets, iter(citing_targets)))
                break
        else:
            # Every target is now either cached or a back edge: finalise `node`.
            score = 1.0
            for citing in targets:
                if citing in on_path:
                    # Back edge into the active path; it has no score yet.
                    continue
                score += g.nodes[citing]['impact_score'] * pairwise_impact(node, citing)
            g.nodes[node]['impact_score'] = score
            on_path.remove(node)
            stack.pop()

    return g.nodes[id]['impact_score']

In [None]:
# Only ids that appear both as keys of `papers` and as nodes of `g` are scored.
eligible_papers = set(papers).intersection(g.nodes)

# Called for its side effect: get_node_score stores 'impact_score' on each node.
# The loop variable is deliberately not called `id`, so the builtin `id` is not
# rebound at notebook-global scope for later cells.
for paper_id in eligible_papers:
    get_node_score(paper_id)

In [None]:
# Read the stored 'impact_score' of every eligible paper from the graph.
scores_dict = {paper_id: g.nodes[paper_id]['impact_score'] for paper_id in eligible_papers}

# One row per paper, indexed by id, highest impact first. The frame is built in
# a single chain rather than mutated in place.
scores = (
    pd.DataFrame(list(scores_dict.items()), columns=['id', 'impact'])
    .set_index('id')
    .sort_values('impact', ascending=False)
)

# Impact as a percentage of the top score. Series.max() is vectorised, whereas
# the builtin max() iterates the Series element by element in Python.
scores['impact_pct'] = scores['impact'] / scores['impact'].max() * 100

In [None]:
# Top rows of the ranking (DataFrame.head() default count), with each paper's
# title looked up in `unarxiv` by its index id. assign() returns a new frame,
# so `scores` itself is left unchanged.
most_impactful = scores.head().assign(
    title=lambda top: [unarxiv[paper_id]['title'] for paper_id in top.index]
)
most_impactful