In [1]:
import os

# Directory that holds the graph pickle and receives every artefact this
# notebook writes (all later file names are relative to it).
# Set the MAG_DATA_DIR environment variable to run on another machine; the
# fallback is the original hard-coded location, so existing behaviour is kept.
DATA_DIR = os.environ.get(
    "MAG_DATA_DIR",
    '/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0',
)
os.chdir(DATA_DIR)
os.getcwd()

'/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0'

In [2]:
import pickle

import networkx as nx

# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0. A
# .gpickle file is an ordinary pickle of the graph object, so loading it with
# the pickle module gives the same result on old and new NetworkX versions.
# NOTE: unpickling can execute arbitrary code -- only load graph files that
# you produced yourself or otherwise trust.
with open("mag_graph_cleaned.gpickle", "rb") as graph_file:
    G = pickle.load(graph_file)

In [3]:
# Report the size of the loaded graph: nodes are labelled as papers and
# edges as citations in the messages below.
n_papers = G.number_of_nodes()
n_citations = G.number_of_edges()
print(f"Number of papers (nodes): {n_papers}")
print(f"Number of citations (edges): {n_citations}")

Number of papers (nodes): 2393685
Number of citations (edges): 225071


In [4]:
# PageRank over the whole graph. The damping factor is named so the value
# passed as `alpha` is easy to spot and tune.
damping = 0.85
page_ranks = nx.pagerank(G, alpha=damping)

# Number of entries in the PageRank result.
print(f"Number of pages: {len(page_ranks)}")

Number of pages: 2393685


In [8]:
import pickle

# Persist the PageRank result to the working directory so the computation
# does not have to be repeated in a later session.
with open("pageranks.pkl", mode="wb") as pagerank_file:
    pickle.dump(page_ranks, pagerank_file)

In [5]:
# Environment check: print the installed torch version and the path it was
# imported from, so it is clear which environment the kernel is using.
import torch
print(torch.__version__)
print(torch.__file__)

# Same check for the two Hugging Face-side packages. Importing them may emit
# warnings of their own before the version numbers are printed.
import transformers
import sentence_transformers

print(transformers.__version__)
print(sentence_transformers.__version__)

2.2.2
/Users/nikhilisukapalli/opt/anaconda3/envs/graphqa2/lib/python3.9/site-packages/torch/__init__.py


  from .autonotebook import tqdm as notebook_tqdm


4.51.3
4.1.0


In [6]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder that the rest of the notebook uses as `model`.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Smoke test: embed one throwaway sentence and show the shape of the result.
probe_sentence = "This is a test sentence"
embedding = model.encode(probe_sentence)
print(embedding.shape)

(384,)


In [11]:
# Show the attribute dict of the first node only, to inspect which fields a
# paper carries. Nothing is printed when the graph has no nodes.
first_entry = next(iter(G.nodes(data=True)), None)
if first_entry is not None:
    node_id, data = first_entry
    print(data)

{'title': 'Electron Spin Resonance Investigations of Oxygen-Centered Free Radicals in Biological Systems', 'year': 1988, 'authors': ['Ronald P. Mason', 'Kim M. Morehouse'], 'fos': ['Hydroxyl radical', 'Autoxidation', 'Oxygen', 'Hyperfine structure', 'Photochemistry', 'Nuclear magnetic resonance', 'Radical', 'Electron paramagnetic resonance', 'Superoxide', 'Xanthine oxidase', 'Chemistry'], 'abstract': 'Oxygen-centered free radicals have been detected directly with ESR in a variety of biological processes such as lipid autoxidation,1 the enzymatic formation of superoxide by xanthine oxidase,2 and hydroxyl radical formation by the Y-irradiation of ice.3 Since the common isotope of oxygen is spin-less, no nuclear hyperfine interaction is possible, and the g-value and any hydrogen hyperfine coupling provide the only criteria for distinguishing an oxygen-centered free radical from other free radicals. The greatest limitation on the direct detection of oxygen-centered free radicals, aside fro

In [None]:
# Parallel lists: paper_texts[i] is the text to embed for node paper_ids[i].
paper_ids = []
paper_texts = []

for node_id, data in G.nodes(data=True):
    title = data.get('title')
    abstract = data.get('abstract')

    # Only papers with both a non-empty title and a non-empty abstract are kept.
    if not (title and abstract):
        continue

    # "<title> <abstract>" with surrounding whitespace removed; drop the paper
    # if nothing is left after stripping.
    text = (title + ' ' + abstract).strip()
    if not text:
        continue

    paper_ids.append(node_id)
    paper_texts.append(text)

print(f"Parsed {len(paper_texts)} papers")

Parsed 2393685 papers


In [8]:
import psutil

def print_memory_usage():
    """Print the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem_mb = rss_bytes / (1024 * 1024)  # bytes -> mebibytes
    print(f"Current RAM usage: {mem_mb:.2f} MB")

In [9]:
from tqdm import tqdm
import time
import numpy as np

In [None]:
# Time model.encode for several batch sizes. Each batch size is run on its
# own consecutive slice of `chunk_size` texts (so the timings are measured on
# different papers), and every vector produced is kept in `embeddings`.
embeddings = []
batch_sizes = [64, 128, 256]
chunk_size = 10000
timings = {}

for i, size in enumerate(batch_sizes):
    start, end = i * chunk_size, (i + 1) * chunk_size
    print(f"Processing batch size {size} on samples {start}–{end}...")

    start_time = time.time()
    batch_embeddings = model.encode(
        paper_texts[start:end],
        batch_size=size,
        show_progress_bar=True,
    )
    elapsed = time.time() - start_time

    # Record the wall-clock time per batch size and keep the vectors.
    timings[size] = elapsed
    embeddings.extend(batch_embeddings)
    print(f"Batch size {size} took {timings[size]:.2f} seconds")
    print()

Processing batch size 64 on samples 0–10000...


Batches: 100%|██████████| 157/157 [09:33<00:00,  3.65s/it]


Batch size 64 took 574.10 seconds

Processing batch size 128 on samples 10000–20000...


Batches: 100%|██████████| 79/79 [11:05<00:00,  8.42s/it]


Batch size 128 took 666.29 seconds

Processing batch size 256 on samples 20000–30000...


Batches: 100%|██████████| 40/40 [09:48<00:00, 14.72s/it]

Batch size 256 took 589.64 seconds






In [26]:
# Embed paper_texts[30000:500000]; results accumulate in `embeddings2` as one
# vector per paper, in the same order as the texts.
embeddings2 = []
start = 30000
end = 500000
batch_size = 256  # single source of truth for slicing, encoding and log text
samples = paper_texts[start:end]
print(f"Processing batch size {batch_size} on samples {start}–{end}...")

start_time = time.time()

for i in tqdm(range(0, len(samples), batch_size)):
    # batch_size must be passed explicitly: SentenceTransformer.encode
    # defaults to 32, so without it each 256-text slice was run in batches of
    # 32 even though the messages report a batch size of 256.
    batch_embeddings2 = model.encode(samples[i:i + batch_size], batch_size=batch_size)
    embeddings2.extend(batch_embeddings2)

    # Every 100 slices, log progress and the process memory footprint.
    batch_idx = i // batch_size
    if batch_idx % 100 == 0:
        print(f"Completed {batch_idx} batches...")
        print_memory_usage()
        print()

duration2 = time.time() - start_time
print(f"Batch size {batch_size} took {duration2:.2f} seconds")

Processing batch size 256 on samples 30000–500000...


  0%|          | 1/1836 [00:08<4:35:08,  9.00s/it]

Completed 0 batches...
Current RAM usage: 1896.48 MB



  6%|▌         | 101/1836 [18:54<5:23:30, 11.19s/it]

Completed 100 batches...
Current RAM usage: 1943.68 MB



 11%|█         | 201/1836 [37:36<4:53:04, 10.75s/it]

Completed 200 batches...
Current RAM usage: 1987.87 MB



 16%|█▋        | 301/1836 [55:55<4:45:19, 11.15s/it]

Completed 300 batches...
Current RAM usage: 1284.52 MB



 22%|██▏       | 401/1836 [1:14:30<4:02:22, 10.13s/it] 

Completed 400 batches...
Current RAM usage: 1234.12 MB



 27%|██▋       | 501/1836 [1:31:25<3:47:48, 10.24s/it]

Completed 500 batches...
Current RAM usage: 1276.98 MB



 33%|███▎      | 601/1836 [1:48:43<3:46:16, 10.99s/it]

Completed 600 batches...
Current RAM usage: 1320.38 MB



 38%|███▊      | 701/1836 [2:07:24<3:18:22, 10.49s/it] 

Completed 700 batches...
Current RAM usage: 1364.24 MB



 44%|████▎     | 801/1836 [2:27:56<3:44:39, 13.02s/it]

Completed 800 batches...
Current RAM usage: 1037.50 MB



 49%|████▉     | 901/1836 [2:49:42<3:28:33, 13.38s/it]

Completed 900 batches...
Current RAM usage: 342.42 MB



 55%|█████▍    | 1001/1836 [3:10:41<2:53:59, 12.50s/it]

Completed 1000 batches...
Current RAM usage: 382.48 MB



 60%|█████▉    | 1101/1836 [3:31:42<2:15:07, 11.03s/it]

Completed 1100 batches...
Current RAM usage: 449.08 MB



 65%|██████▌   | 1201/1836 [3:49:57<1:58:47, 11.22s/it]

Completed 1200 batches...
Current RAM usage: 516.24 MB



 71%|███████   | 1301/1836 [4:08:54<1:28:12,  9.89s/it]

Completed 1300 batches...
Current RAM usage: 582.62 MB



 76%|███████▋  | 1401/1836 [4:31:00<1:51:31, 15.38s/it]

Completed 1400 batches...
Current RAM usage: 495.63 MB



 82%|████████▏ | 1501/1836 [4:52:05<1:13:12, 13.11s/it]

Completed 1500 batches...
Current RAM usage: 503.15 MB



 87%|████████▋ | 1601/1836 [5:13:07<1:23:38, 21.36s/it]

Completed 1600 batches...
Current RAM usage: 564.59 MB



 93%|█████████▎| 1701/1836 [5:35:51<21:40,  9.63s/it]  

Completed 1700 batches...
Current RAM usage: 349.45 MB



 98%|█████████▊| 1801/1836 [5:52:30<05:52, 10.08s/it]

Completed 1800 batches...
Current RAM usage: 422.50 MB



100%|██████████| 1836/1836 [5:58:20<00:00, 11.71s/it]

Batch size 256 took 21500.97 seconds





In [29]:
# Embed paper_texts[500000:1000000]; results accumulate in `embeddings3` as
# one vector per paper, in the same order as the texts.
embeddings3 = []
start = 500000
end = 1000000
batch_size = 256  # single source of truth for slicing, encoding and log text
samples = paper_texts[start:end]
print(f"Processing batch size {batch_size} on samples {start}–{end}...")

start_time = time.time()

for i in tqdm(range(0, len(samples), batch_size)):
    # batch_size must be passed explicitly: SentenceTransformer.encode
    # defaults to 32, so without it each 256-text slice was run in batches of
    # 32 even though the messages report a batch size of 256.
    batch_embeddings3 = model.encode(samples[i:i + batch_size], batch_size=batch_size)
    embeddings3.extend(batch_embeddings3)

    # Every 100 slices, log progress and the process memory footprint.
    batch_idx = i // batch_size
    if batch_idx % 100 == 0:
        print(f"Completed {batch_idx} batches...")
        print_memory_usage()
        print()

duration3 = time.time() - start_time
print(f"Batch size {batch_size} took {duration3:.2f} seconds")

Processing batch size 256 on samples 500000–1000000...


  0%|          | 1/1954 [00:09<5:07:17,  9.44s/it]

Completed 0 batches...
Current RAM usage: 925.82 MB



  5%|▌         | 101/1954 [22:54<6:42:35, 13.04s/it]

Completed 100 batches...
Current RAM usage: 421.03 MB



 10%|█         | 201/1954 [43:05<5:22:00, 11.02s/it]

Completed 200 batches...
Current RAM usage: 497.32 MB



 15%|█▌        | 301/1954 [1:10:34<7:50:46, 17.09s/it]

Completed 300 batches...
Current RAM usage: 384.47 MB



 21%|██        | 401/1954 [1:35:25<6:28:02, 14.99s/it]

Completed 400 batches...
Current RAM usage: 438.60 MB



 26%|██▌       | 501/1954 [1:58:07<6:07:18, 15.17s/it]

Completed 500 batches...
Current RAM usage: 506.52 MB



 31%|███       | 601/1954 [2:21:44<4:54:44, 13.07s/it]

Completed 600 batches...
Current RAM usage: 515.70 MB



 36%|███▌      | 701/1954 [2:42:30<4:17:41, 12.34s/it]

Completed 700 batches...
Current RAM usage: 581.36 MB



 41%|████      | 801/1954 [3:02:47<3:55:18, 12.24s/it]

Completed 800 batches...
Current RAM usage: 647.79 MB



 46%|████▌     | 901/1954 [5:01:24<4:46:56, 16.35s/it]    

Completed 900 batches...
Current RAM usage: 489.66 MB



 51%|█████     | 1001/1954 [5:26:38<3:34:53, 13.53s/it]

Completed 1000 batches...
Current RAM usage: 340.71 MB



 56%|█████▋    | 1101/1954 [5:47:49<3:10:15, 13.38s/it]

Completed 1100 batches...
Current RAM usage: 413.37 MB



 61%|██████▏   | 1201/1954 [6:13:28<3:28:09, 16.59s/it]

Completed 1200 batches...
Current RAM usage: 405.63 MB



 67%|██████▋   | 1301/1954 [6:34:06<2:08:09, 11.78s/it]

Completed 1300 batches...
Current RAM usage: 474.25 MB



 72%|███████▏  | 1401/1954 [6:53:27<1:44:47, 11.37s/it]

Completed 1400 batches...
Current RAM usage: 540.25 MB



 77%|███████▋  | 1501/1954 [7:14:25<1:27:07, 11.54s/it]

Completed 1500 batches...
Current RAM usage: 574.64 MB



 82%|████████▏ | 1601/1954 [7:34:45<1:10:24, 11.97s/it]

Completed 1600 batches...
Current RAM usage: 523.18 MB



 87%|████████▋ | 1701/1954 [7:55:38<55:36, 13.19s/it]  

Completed 1700 batches...
Current RAM usage: 608.16 MB



 92%|█████████▏| 1801/1954 [8:16:19<31:30, 12.36s/it]

Completed 1800 batches...
Current RAM usage: 687.35 MB



 97%|█████████▋| 1901/1954 [8:37:19<12:31, 14.17s/it]

Completed 1900 batches...
Current RAM usage: 728.07 MB



100%|██████████| 1954/1954 [8:48:52<00:00, 16.24s/it]

Batch size 256 took 31732.62 seconds





In [None]:
# Number of vectors held in each of the three embedding lists, printed on one
# line in the order they were computed.
part_sizes = [len(part) for part in (embeddings, embeddings2, embeddings3)]
print(*part_sizes)

30000 470000 500000


In [None]:
# Save the three embedding lists as dense float32 matrices (one row per paper)
# under the same keys as before.
# dtype=object is deliberately not used: it boxes every value as a Python
# object, makes NumPy pickle the arrays (so np.load needs allow_pickle=True),
# and costs far more memory and time than a numeric array.
# Assumes all vectors in a list have the same length (they come from the same
# model); NumPy raises here if that is ever not the case.
np.savez_compressed(
    'embeddings_1M.npz',
    embeddings=np.asarray(embeddings, dtype=np.float32),
    embeddings2=np.asarray(embeddings2, dtype=np.float32),
    embeddings3=np.asarray(embeddings3, dtype=np.float32),
)

In [17]:
# Embed paper_texts[1000000:1500000]; results accumulate in `embeddings4` as
# one vector per paper, in the same order as the texts.
embeddings4 = []
start = 1000000
end = 1500000
batch_size = 256  # single source of truth for slicing, encoding and log text
samples = paper_texts[start:end]
print(f"Processing batch size {batch_size} on samples {start}–{end}...")

start_time = time.time()

for i in tqdm(range(0, len(samples), batch_size)):
    # batch_size must be passed explicitly: SentenceTransformer.encode
    # defaults to 32, so without it each 256-text slice was run in batches of
    # 32 even though the messages report a batch size of 256.
    batch_embeddings4 = model.encode(samples[i:i + batch_size], batch_size=batch_size)
    embeddings4.extend(batch_embeddings4)

    # Every 100 slices, log progress and the process memory footprint.
    batch_idx = i // batch_size
    if batch_idx % 100 == 0:
        print(f"Completed {batch_idx} batches...")
        print_memory_usage()
        print()

duration4 = time.time() - start_time
print(f"Batch size {batch_size} took {duration4:.2f} seconds")

Processing batch size 256 on samples 1000000–1500000...


  0%|          | 1/1954 [00:13<7:31:58, 13.89s/it]

Completed 0 batches...
Current RAM usage: 1189.71 MB



  5%|▌         | 101/1954 [20:59<6:25:08, 12.47s/it]

Completed 100 batches...
Current RAM usage: 2392.49 MB



 10%|█         | 201/1954 [41:54<5:58:15, 12.26s/it]

Completed 200 batches...
Current RAM usage: 431.52 MB



 15%|█▌        | 301/1954 [1:02:54<5:39:50, 12.34s/it]

Completed 300 batches...
Current RAM usage: 576.51 MB



 21%|██        | 401/1954 [11:31:11<6:10:14, 14.30s/it]      

Completed 400 batches...
Current RAM usage: 317.47 MB



 26%|██▌       | 501/1954 [11:52:41<4:43:36, 11.71s/it]

Completed 500 batches...
Current RAM usage: 363.61 MB



 31%|███       | 601/1954 [12:12:53<4:33:56, 12.15s/it]

Completed 600 batches...
Current RAM usage: 426.09 MB



 36%|███▌      | 701/1954 [12:35:19<4:17:36, 12.34s/it]

Completed 700 batches...
Current RAM usage: 402.97 MB



 41%|████      | 801/1954 [12:55:52<3:36:28, 11.26s/it]

Completed 800 batches...
Current RAM usage: 393.62 MB



 46%|████▌     | 901/1954 [13:15:57<2:54:36,  9.95s/it]

Completed 900 batches...
Current RAM usage: 446.82 MB



 51%|█████     | 1001/1954 [13:31:35<2:28:34,  9.35s/it]

Completed 1000 batches...
Current RAM usage: 518.43 MB



 56%|█████▋    | 1101/1954 [13:46:50<2:11:25,  9.24s/it]

Completed 1100 batches...
Current RAM usage: 591.14 MB



 61%|██████▏   | 1201/1954 [14:02:21<1:57:56,  9.40s/it]

Completed 1200 batches...
Current RAM usage: 663.43 MB



 67%|██████▋   | 1301/1954 [18:57:02<3:45:50, 20.75s/it]    

Completed 1300 batches...
Current RAM usage: 522.20 MB



 72%|███████▏  | 1401/1954 [19:12:37<1:21:59,  8.90s/it]

Completed 1400 batches...
Current RAM usage: 346.21 MB



 77%|███████▋  | 1501/1954 [19:27:49<1:09:04,  9.15s/it]

Completed 1500 batches...
Current RAM usage: 424.40 MB



 82%|████████▏ | 1601/1954 [19:43:08<55:01,  9.35s/it]  

Completed 1600 batches...
Current RAM usage: 503.02 MB



 87%|████████▋ | 1701/1954 [20:34:53<46:50, 11.11s/it]    

Completed 1700 batches...
Current RAM usage: 430.87 MB



 92%|█████████▏| 1801/1954 [20:51:10<26:48, 10.51s/it]

Completed 1800 batches...
Current RAM usage: 498.56 MB



 97%|█████████▋| 1901/1954 [21:08:31<09:29, 10.75s/it]

Completed 1900 batches...
Current RAM usage: 514.81 MB



100%|██████████| 1954/1954 [21:17:29<00:00, 39.23s/it]

Batch size 256 took 76649.67 seconds





In [18]:
print(len(embeddings4))

# Save as a dense float32 matrix (one row per paper) under the same key.
# dtype=object is deliberately not used: it boxes every value as a Python
# object, makes NumPy pickle the array (so np.load needs allow_pickle=True),
# and costs far more memory and time than a numeric array.
# Assumes all vectors have the same length; NumPy raises here otherwise.
np.savez_compressed(
    'embeddings_1.5M.npz',
    embeddings4=np.asarray(embeddings4, dtype=np.float32),
)

500000


In [10]:
# Embed paper_texts[1500000:2000000]; results accumulate in `embeddings5` as
# one vector per paper, in the same order as the texts.
embeddings5 = []
start = 1500000
end = 2000000
batch_size = 256  # single source of truth for slicing, encoding and log text
samples = paper_texts[start:end]
print(f"Processing batch size {batch_size} on samples {start}–{end}...")

start_time = time.time()

for i in tqdm(range(0, len(samples), batch_size)):
    # batch_size must be passed explicitly: SentenceTransformer.encode
    # defaults to 32, so without it each 256-text slice was run in batches of
    # 32 even though the messages report a batch size of 256.
    batch_embeddings5 = model.encode(samples[i:i + batch_size], batch_size=batch_size)
    embeddings5.extend(batch_embeddings5)

    # Every 100 slices, log progress and the process memory footprint.
    batch_idx = i // batch_size
    if batch_idx % 100 == 0:
        print(f"Completed {batch_idx} batches...")
        print_memory_usage()
        print()

duration5 = time.time() - start_time
print(f"Batch size {batch_size} took {duration5:.2f} seconds")

Processing batch size 256 on samples 1500000–2000000...


  0%|          | 1/1954 [00:23<12:59:29, 23.95s/it]

Completed 0 batches...
Current RAM usage: 4416.01 MB



  5%|▌         | 101/1954 [23:24<8:57:03, 17.39s/it]

Completed 100 batches...
Current RAM usage: 264.92 MB



 10%|█         | 201/1954 [42:36<5:57:41, 12.24s/it]

Completed 200 batches...
Current RAM usage: 371.10 MB



 15%|█▌        | 301/1954 [1:04:14<5:02:57, 11.00s/it]

Completed 300 batches...
Current RAM usage: 435.88 MB



 21%|██        | 401/1954 [1:27:16<7:14:51, 16.80s/it]

Completed 400 batches...
Current RAM usage: 365.91 MB



 26%|██▌       | 501/1954 [1:48:38<5:07:17, 12.69s/it]

Completed 500 batches...
Current RAM usage: 441.34 MB



 31%|███       | 601/1954 [2:09:23<4:47:28, 12.75s/it]

Completed 600 batches...
Current RAM usage: 517.61 MB



 36%|███▌      | 701/1954 [2:30:01<7:33:48, 21.73s/it]

Completed 700 batches...
Current RAM usage: 239.73 MB



 41%|████      | 801/1954 [2:48:11<3:35:56, 11.24s/it]

Completed 800 batches...
Current RAM usage: 361.80 MB



 46%|████▌     | 901/1954 [3:12:34<4:41:47, 16.06s/it]

Completed 900 batches...
Current RAM usage: 442.86 MB



 51%|█████     | 1001/1954 [3:37:24<3:11:23, 12.05s/it]

Completed 1000 batches...
Current RAM usage: 521.48 MB



 56%|█████▋    | 1101/1954 [3:58:31<3:04:58, 13.01s/it]

Completed 1100 batches...
Current RAM usage: 599.62 MB



 61%|██████▏   | 1201/1954 [4:23:31<3:25:34, 16.38s/it]

Completed 1200 batches...
Current RAM usage: 676.71 MB



 67%|██████▋   | 1301/1954 [4:45:05<2:12:23, 12.16s/it]

Completed 1300 batches...
Current RAM usage: 754.95 MB



 72%|███████▏  | 1401/1954 [5:05:27<1:54:41, 12.44s/it]

Completed 1400 batches...
Current RAM usage: 831.46 MB



 77%|███████▋  | 1501/1954 [5:27:57<1:43:03, 13.65s/it]

Completed 1500 batches...
Current RAM usage: 861.86 MB



 82%|████████▏ | 1601/1954 [5:47:51<1:06:02, 11.23s/it]

Completed 1600 batches...
Current RAM usage: 940.15 MB



 87%|████████▋ | 1701/1954 [6:06:28<46:56, 11.13s/it]  

Completed 1700 batches...
Current RAM usage: 1011.88 MB



 92%|█████████▏| 1801/1954 [6:24:41<27:51, 10.92s/it]

Completed 1800 batches...
Current RAM usage: 1083.92 MB



 97%|█████████▋| 1901/1954 [7:15:52<11:35, 13.12s/it]    

Completed 1900 batches...
Current RAM usage: 704.33 MB



100%|██████████| 1954/1954 [7:26:33<00:00, 13.71s/it]

Batch size 256 took 26793.09 seconds





In [11]:
print(len(embeddings5))

# Save as a dense float32 matrix (one row per paper) under the same key.
# dtype=object is deliberately not used: it boxes every value as a Python
# object, makes NumPy pickle the array (so np.load needs allow_pickle=True),
# and costs far more memory and time than a numeric array.
# Assumes all vectors have the same length; NumPy raises here otherwise.
np.savez_compressed(
    'embeddings_2M.npz',
    embeddings5=np.asarray(embeddings5, dtype=np.float32),
)

500000


In [13]:
# Embed the remaining texts, paper_texts[2000000:]; results accumulate in
# `embeddings6` as one vector per paper, in the same order as the texts.
embeddings6 = []
start = 2000000
batch_size = 256  # single source of truth for slicing, encoding and log text
samples = paper_texts[start:]
print(f"Processing batch size {batch_size} on samples {start}–{len(paper_texts)}...")

start_time = time.time()

for i in tqdm(range(0, len(samples), batch_size)):
    # batch_size must be passed explicitly: SentenceTransformer.encode
    # defaults to 32, so without it each 256-text slice was run in batches of
    # 32 even though the messages report a batch size of 256.
    batch_embeddings6 = model.encode(samples[i:i + batch_size], batch_size=batch_size)
    embeddings6.extend(batch_embeddings6)

    # Every 100 slices, log progress and the process memory footprint.
    batch_idx = i // batch_size
    if batch_idx % 100 == 0:
        print(f"Completed {batch_idx} batches...")
        print_memory_usage()
        print()

duration6 = time.time() - start_time
print(f"Batch size {batch_size} took {duration6:.2f} seconds")

Processing batch size 256 on samples 2000000–2393685...


  0%|          | 1/1538 [00:12<5:16:54, 12.37s/it]

Completed 0 batches...
Current RAM usage: 1385.68 MB



  7%|▋         | 101/1538 [19:23<4:32:25, 11.37s/it]

Completed 100 batches...
Current RAM usage: 1483.93 MB



 13%|█▎        | 201/1538 [46:08<7:03:49, 19.02s/it]

Completed 200 batches...
Current RAM usage: 285.93 MB



 20%|█▉        | 301/1538 [1:11:03<3:57:01, 11.50s/it]

Completed 300 batches...
Current RAM usage: 312.04 MB



 26%|██▌       | 401/1538 [1:30:45<3:44:34, 11.85s/it]

Completed 400 batches...
Current RAM usage: 401.62 MB



 33%|███▎      | 501/1538 [1:52:07<3:56:04, 13.66s/it]

Completed 500 batches...
Current RAM usage: 487.92 MB



 39%|███▉      | 601/1538 [2:13:39<3:34:55, 13.76s/it]

Completed 600 batches...
Current RAM usage: 574.04 MB



 46%|████▌     | 701/1538 [2:33:47<2:47:43, 12.02s/it]

Completed 700 batches...
Current RAM usage: 661.98 MB



 52%|█████▏    | 801/1538 [2:52:35<2:13:37, 10.88s/it]

Completed 800 batches...
Current RAM usage: 743.28 MB



 59%|█████▊    | 901/1538 [3:11:12<2:14:24, 12.66s/it]

Completed 900 batches...
Current RAM usage: 823.90 MB



 65%|██████▌   | 1001/1538 [3:33:25<1:34:46, 10.59s/it]

Completed 1000 batches...
Current RAM usage: 603.93 MB



 72%|███████▏  | 1101/1538 [3:50:56<1:18:40, 10.80s/it]

Completed 1100 batches...
Current RAM usage: 684.76 MB



 78%|███████▊  | 1201/1538 [4:10:04<1:10:12, 12.50s/it]

Completed 1200 batches...
Current RAM usage: 762.74 MB



 85%|████████▍ | 1301/1538 [4:30:34<46:54, 11.87s/it]  

Completed 1300 batches...
Current RAM usage: 690.11 MB



 91%|█████████ | 1401/1538 [4:48:19<25:10, 11.02s/it]

Completed 1400 batches...
Current RAM usage: 773.73 MB



 98%|█████████▊| 1501/1538 [5:08:37<06:29, 10.52s/it]

Completed 1500 batches...
Current RAM usage: 482.22 MB



100%|██████████| 1538/1538 [5:16:09<00:00, 12.33s/it]

Batch size 256 took 18969.82 seconds





In [14]:
print(len(embeddings6))

# Save as a dense float32 matrix (one row per paper) under the same key.
# dtype=object is deliberately not used: it boxes every value as a Python
# object, makes NumPy pickle the array (so np.load needs allow_pickle=True),
# and costs far more memory and time than a numeric array.
# Assumes all vectors have the same length; NumPy raises here otherwise.
np.savez_compressed(
    'embeddings_2.4M.npz',
    embeddings6=np.asarray(embeddings6, dtype=np.float32),
)

393685
