In [5]:
import pandas as pd
import os
import json
import networkx as nx
import os

# Directory that holds the MAG shard opened by the loading cell. The original
# machine-specific path is kept as the default so existing runs are unchanged;
# set the MAG_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "MAG_DATA_DIR",
    '/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0',
)
os.chdir(DATA_DIR)
os.getcwd()

'/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0'

In [6]:
# Empty directed graph that the rest of the notebook fills in and queries.
G = nx.DiGraph()

In [8]:
def _rebuild_abstract(indexed_abstract):
    """Reconstruct plain abstract text from an inverted-index record.

    Parameters
    ----------
    indexed_abstract : dict or None
        Record with an "InvertedIndex" mapping of word -> list of positions
        and, optionally, an "IndexLength" giving the number of positions.

    Returns
    -------
    str
        The words joined by single spaces in position order, or "" when the
        record is missing/None or has no "InvertedIndex" key.
    """
    if not indexed_abstract or "InvertedIndex" not in indexed_abstract:
        return ""

    positions_by_word = indexed_abstract["InvertedIndex"] or {}
    declared_length = indexed_abstract.get("IndexLength") or 0
    # Size the buffer from the data as well as the declared length, so a
    # missing or too-small "IndexLength" no longer raises and drops the paper.
    highest_pos = max(
        (pos for positions in positions_by_word.values() for pos in positions),
        default=-1,
    )
    slots = [None] * max(declared_length, highest_pos + 1)
    for word, positions in positions_by_word.items():
        for pos in positions:
            slots[pos] = word
    # Positions that no word claimed stay None and are skipped.
    return " ".join(word for word in slots if word)


paper_count = 0
edge_count = 0

# One JSON object per line. JSON text is UTF-8, so do not rely on the
# platform's default encoding.
with open('mag_papers_0.txt', 'r', encoding='utf-8') as papers0:
    for line in papers0:
        try:
            paper = json.loads(line)

            paper_id = paper.get("id")
            title = paper.get("title", "")
            year = paper.get("year", None)
            # `or []` also covers a key that is present with an explicit null,
            # which `.get(key, [])` would pass through as None.
            authors = [a.get("name", "") for a in paper.get("authors") or []]
            fos = [f.get("name", "") for f in paper.get("fos") or []]
            references = paper.get("references") or []
            abstract = _rebuild_abstract(paper.get("indexed_abstract"))

            G.add_node(
                paper_id,
                title=title,
                year=year,
                authors=authors,
                fos=fos,
                abstract=abstract,
            )

            # Edge direction is citing paper -> cited paper. edge_count tallies
            # reference entries seen, one per add_edge call.
            for ref_id in references:
                G.add_edge(paper_id, ref_id, type="cites")
                edge_count += 1

            paper_count += 1
            if paper_count % 500000 == 0:
                print(f"Processed {paper_count} papers so far")

        except Exception as e:
            # Best-effort load: report the bad record and keep going.
            print(f"Error parsing line: {e}")
            continue

Processed 500000 papers so far
Processed 1000000 papers so far
Processed 1500000 papers so far
Processed 2000000 papers so far
Processed 2500000 papers so far
Processed 3000000 papers so far
Processed 3500000 papers so far
Processed 4000000 papers so far
Processed 4500000 papers so far
Processed 5000000 papers so far


In [9]:
# Report both figures from the graph itself so they describe the same object.
# The edge_count loop counter tallies add_edge calls, which can exceed the
# number of distinct edges a DiGraph stores and goes stale if G is modified.
print(f"Number of papers (nodes): {G.number_of_nodes()}")
print(f"Number of citations (edges): {G.number_of_edges()}")

Number of papers (nodes): 16624247
Number of citations (edges):  21102653


In [10]:
import heapq

# In-degree is the number of incoming "cites" edges, i.e. how often a node is
# cited within this graph. nlargest avoids sorting every node to get ten.
top_cited = heapq.nlargest(10, G.in_degree(), key=lambda x: x[1])
for paper_id, citations in top_cited:
    # Use `or` rather than a .get default: the loading cell stores "" for
    # papers without a title, and a .get default only applies when the key is
    # absent altogether, so those papers would otherwise print a blank title.
    display_title = G.nodes[paper_id].get('title') or 'Unknown Title'
    print(f"{citations} citations — {display_title}")

1789 citations — Unknown Title
1648 citations — Unknown Title
1402 citations — Unknown Title
1272 citations — Unknown Title
1187 citations — Unknown Title
1164 citations — Unknown Title
1161 citations — Unknown Title
1144 citations — Unknown Title
1091 citations — Unknown Title
1065 citations — Unknown Title


In [12]:
from tqdm import tqdm

# A node is kept only if it has both a non-empty title and a non-empty
# abstract; every other node is collected here and then dropped from G.
invalid_nodes = [
    node_id
    for node_id, attrs in tqdm(
        G.nodes(data=True),
        total=G.number_of_nodes(),
        desc="Scanning nodes",
    )
    if not (attrs.get("title") and attrs.get("abstract"))
]

print(f"Found {len(invalid_nodes):,} invalid nodes")

# Removing a node also removes every edge attached to it.
G.remove_nodes_from(invalid_nodes)

Scanning nodes: 100%|██████████| 15687111/15687111 [17:11<00:00, 15206.42it/s]


Found 13,293,426 invalid nodes


In [13]:
import pickle

# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump with the highest protocol, so calling pickle directly writes the
# same kind of file and works on both old and new NetworkX versions.
# Load it back with pickle.load, and only from files you trust.
with open("mag_graph_cleaned.gpickle", "wb") as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)