In [2]:
import os
import numpy as np
import pandas as pd
import random

# Working directory for the notebook: later cells open the embedding archives,
# the graph pickle and the query CSVs by bare file name, so they resolve
# against this directory. Set CAPSTONE_DATA_DIR to run on another machine;
# the default is the original local path.
DATA_DIR = os.environ.get(
    'CAPSTONE_DATA_DIR',
    '/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0',
)
os.chdir(DATA_DIR)
os.getcwd()

'/Users/nikhilisukapalli/Downloads/Capstone/mag_papers_0'

In [None]:
import openai
print(openai.__version__)

# Take the key from the environment so it is never saved inside the notebook.
# "API_KEY" is the original placeholder and is kept only as a fallback; requests
# made with it will be rejected by the API.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API_KEY")

1.82.1


In [4]:
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load archives you produced yourself.
# The context manager closes the .npz file handle once the arrays are read.
with np.load('embeddings_1M.npz', allow_pickle=True) as data:
    embeddings1 = data['embeddings']
    embeddings2 = data['embeddings2']
    embeddings3 = data['embeddings3']

print(len(embeddings1), len(embeddings2), len(embeddings3))

30000 470000 500000


In [5]:
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load archives you produced yourself.
# The context manager closes the .npz file handle once the array is read.
with np.load('embeddings_1.5M.npz', allow_pickle=True) as data:
    embeddings4 = data['embeddings4']

print(len(embeddings4))

500000


In [6]:
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load archives you produced yourself.
# The context manager closes the .npz file handle once the array is read.
with np.load('embeddings_2M.npz', allow_pickle=True) as data:
    embeddings5 = data['embeddings5']

print(len(embeddings5))

500000


In [7]:
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load archives you produced yourself.
# The context manager closes the .npz file handle once the array is read.
with np.load('embeddings_2.4M.npz', allow_pickle=True) as data:
    embeddings6 = data['embeddings6']

print(len(embeddings6))

393685


In [8]:
import pickle

import networkx as nx

# nx.read_gpickle was removed in NetworkX 3.0. The gpickle file is an ordinary
# pickle, so fall back to the pickle module when the helper is unavailable.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
if hasattr(nx, "read_gpickle"):
    G = nx.read_gpickle("mag_graph_cleaned.gpickle")
else:
    with open("mag_graph_cleaned.gpickle", "rb") as graph_file:
        G = pickle.load(graph_file)

print(f"Number of papers (nodes): {G.number_of_nodes()}")
print(f"Number of citations (edges): {G.number_of_edges()}")

Number of papers (nodes): 2393685
Number of citations (edges): 225071


In [9]:
import pickle

# Compute a PageRank score for every node of the citation graph G
# (alpha=0.85); the result is a dict keyed by node id.
page_ranks = nx.pagerank(G, alpha=0.85)
print(f"Number of pages: {len(page_ranks)}")

# Persist the scores to pageranks.pkl in the current working directory.
with open("pageranks.pkl", "wb") as f:
    pickle.dump(page_ranks, f)

Number of pages: 2393685


In [10]:
paper_ids = []
paper_texts = []

# Collect, in graph iteration order, every node that has both a title and an
# abstract. paper_ids[i] and paper_texts[i] always describe the same paper.
for node_id, data in G.nodes(data=True):
    title = data.get('title')
    abstract = data.get('abstract')
    if not (title and abstract):
        continue
    text = (title + ' ' + abstract).strip()
    if not text:
        continue
    paper_ids.append(node_id)
    paper_texts.append(text)

print(f"Parsed {len(paper_texts)} papers")

Parsed 2393685 papers


In [11]:
# Show the id and attribute dict of the first node only, to inspect the schema.
for node_id, data in G.nodes(data=True):
    print(node_id, data, sep="\n")
    break

100000002
{'title': 'Electron Spin Resonance Investigations of Oxygen-Centered Free Radicals in Biological Systems', 'year': 1988, 'authors': ['Ronald P. Mason', 'Kim M. Morehouse'], 'fos': ['Hydroxyl radical', 'Autoxidation', 'Oxygen', 'Hyperfine structure', 'Photochemistry', 'Nuclear magnetic resonance', 'Radical', 'Electron paramagnetic resonance', 'Superoxide', 'Xanthine oxidase', 'Chemistry'], 'abstract': 'Oxygen-centered free radicals have been detected directly with ESR in a variety of biological processes such as lipid autoxidation,1 the enzymatic formation of superoxide by xanthine oxidase,2 and hydroxyl radical formation by the Y-irradiation of ice.3 Since the common isotope of oxygen is spin-less, no nuclear hyperfine interaction is possible, and the g-value and any hydrogen hyperfine coupling provide the only criteria for distinguishing an oxygen-centered free radical from other free radicals. The greatest limitation on the direct detection of oxygen-centered free radicals,

In [12]:
# The embedding chunks are stored in the same order as paper_ids, so chunk n
# covers the ids that follow the rows of chunks 1..n-1. Offsets are derived
# from the chunk lengths instead of being hard-coded.
embedding_chunks = [
    embeddings1,
    embeddings2,
    embeddings3,
    embeddings4,
    embeddings5,
    embeddings6,
]

# zip() silently truncates on a length mismatch, which would shift or drop
# embeddings without any error -- fail loudly instead.
total_embeddings = sum(len(chunk) for chunk in embedding_chunks)
if total_embeddings != len(paper_ids):
    raise ValueError(
        f"Embedding count ({total_embeddings}) does not match "
        f"paper count ({len(paper_ids)})"
    )

all_embeddings = {}
offset = 0
for chunk in embedding_chunks:
    chunk_ids = paper_ids[offset:offset + len(chunk)]
    all_embeddings.update(zip(chunk_ids, chunk))
    offset += len(chunk)
print(len(all_embeddings))

2393685


In [13]:
# Pair each paper's embedding with its PageRank score; "pagerank" is None for
# any paper id that has no entry in page_ranks.
final_data = {
    paper_id: {"embedding": embedding, "pagerank": page_ranks.get(paper_id)}
    for paper_id, embedding in all_embeddings.items()
}
print(len(final_data))

2393685


In [14]:
# with open("final_data.pkl", "wb") as f:
#     pickle.dump(final_data, f)

In [15]:
import torch
import transformers
import sentence_transformers

from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by get_topk_papers to encode each query.
# NOTE(review): presumably the stored paper embeddings were produced with this
# same model -- confirm, otherwise the cosine similarities are not comparable.
model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilisukapalli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from collections import defaultdict

# Inverted index: lower-cased word found in any of a paper's 'fos' entries
# -> set of ids of the papers carrying that word.
pattern = re.compile(r'\w+')
inverted_index = defaultdict(set)
for paper_id, data in G.nodes(data=True):
    fos_words = (
        word
        for fos in data.get('fos', [])
        for word in pattern.findall(fos.lower())
    )
    for token in fos_words:
        inverted_index[token].add(paper_id)

In [18]:
stop_words = set(stopwords.words('english'))

def get_topk_papers(query, k=5, graph=G, all_embeddings=all_embeddings, page_ranks=page_ranks):
    """Return the ``k`` best-scoring papers for a free-text query.

    Candidates are the papers listed in ``inverted_index`` under at least one
    non-stop-word token of the query. Each candidate is scored as
    ``cosine_similarity(query_embedding, paper_embedding) * (pagerank + 1)``.

    Args:
        query: The question text.
        k: Number of results to keep.
        graph: Not used by the function body; kept so existing calls still work.
        all_embeddings: Mapping of paper id -> embedding vector.
        page_ranks: Mapping of paper id -> PageRank score; ids missing from it
            are treated as having a score of 0.

    Returns:
        A list of ``(paper_id, weighted_score)`` tuples sorted by descending
        score, or ``[]`` when no candidate with an embedding is found.
    """
    query_tokens = set(pattern.findall(query.lower())) - stop_words

    candidate_ids = set()
    for token in query_tokens:
        candidate_ids.update(inverted_index.get(token, ()))
    print(f"Found {len(candidate_ids)} relevant papers for query: {query}")

    # Freeze the candidates into a list so ids and similarity rows are paired
    # through one explicit ordering, and skip ids that have no embedding
    # instead of raising KeyError.
    scored_ids = [pid for pid in candidate_ids if pid in all_embeddings]
    if not scored_ids:
        return []

    query_emb = np.array(model.encode(query)).reshape(1, -1)
    relevant_embs = np.array([all_embeddings[pid] for pid in scored_ids])
    sim_scores = cosine_similarity(query_emb, relevant_embs)[0]

    # The "+ 1" keeps the similarity as the dominant term and lets PageRank
    # act as a multiplicative boost.
    weighted_scores = [
        (pid, sim_score * (page_ranks.get(pid, 0) + 1))
        for pid, sim_score in zip(scored_ids, sim_scores)
    ]

    topk_papers = sorted(weighted_scores, key=lambda item: item[1], reverse=True)[:k]
    return topk_papers

In [19]:
def _format_paper_context(paper_node):
    """Render one paper's node attributes as a text chunk for the prompt."""
    title = paper_node.get('title', '')
    year = paper_node.get('year', '')
    # Tolerate a missing/None author list and non-string author entries.
    authors = paper_node.get('authors') or []
    abstract = paper_node.get('abstract', '')
    author_list = ', '.join(str(author) for author in authors)
    return f"Title: {title}\nYear: {year}\nAuthors: {author_list}\nAbstract: {abstract}"


def process_queries(queries_df):
    """Answer every query in ``queries_df`` with retrieved paper context.

    For each row, the top 5 papers from ``get_topk_papers`` are formatted into
    a context block, sent to the OpenAI chat completions API together with the
    row's ``'Query'`` text, and the reply is stored in the row's ``'Output'``
    column.

    Args:
        queries_df: DataFrame with a ``'Query'`` column. It is modified in
            place: ``'Output'`` is written for every row.

    Returns:
        The same ``queries_df`` object, with ``'Output'`` filled in. When a
        request fails, the row's output is the string ``"Error: <message>"``.
    """
    total = len(queries_df)
    # Count progress by position rather than by index label, so the counter is
    # right even when the frame does not have a default 0..n-1 index.
    for position, (i, row) in enumerate(queries_df.iterrows(), start=1):
        query = row['Query']
        topk_papers = get_topk_papers(query, k=5)

        context = "\n\n".join(
            _format_paper_context(G.nodes[paper_id]) for paper_id, _ in topk_papers
        )
        prompt = f"Answer the following question. Below the question is context from several papers that you may use to guide your answer if relevant (cite the title and authors if relevant): {query}\n\n{context}"

        try:
            response = openai.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are an expert research assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            answer = response.choices[0].message.content
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the
            # batch, so the error is reported and recorded in the output.
            print(f"Error processing query {i}: {e}")
            answer = f"Error: {e}"

        queries_df.at[i, 'Output'] = answer

        print(f"Processed query {position}/{total}: {query}")
        print()

    return queries_df

In [57]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries1 = pd.read_csv(
    'test_queries1.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries1 = test_queries1.assign(Output=test_queries1['Output'].fillna(''))
test_queries1.head()

Unnamed: 0,Query,Label,Output
0,What are the key scalability trade-offs betwee...,"DHTs, such as Chord or Kademlia, distribute th...",
1,How can adversarial attacks exploit the latent...,Adversarial attacks manipulate the latent spac...,
2,Design an efficient algorithm to detect cycles...,A naive approach would recompute cycles after ...,
3,Explain how homomorphic encryption enables pri...,Homomorphic encryption (HE) allows computation...,
4,"In the context of cloud computing, compare and...",Strong consistency guarantees that all reads r...,


In [58]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries1 = process_queries(test_queries1)
test_queries1.to_csv('test_queries1.csv', index=False)

Found 40947 relevant papers for query: What are the key scalability trade-offs between distributed hash tables (DHTs) and centralized indexing systems in peer-to-peer networks?
Processed query 1/10: What are the key scalability trade-offs between distributed hash tables (DHTs) and centralized indexing systems in peer-to-peer networks?

Found 16746 relevant papers for query: How can adversarial attacks exploit the latent space of generative models like GANs or VAEs, and what defense strategies have been proposed?
Processed query 2/10: How can adversarial attacks exploit the latent space of generative models like GANs or VAEs, and what defense strategies have been proposed?

Found 50455 relevant papers for query: Design an efficient algorithm to detect cycles in a dynamic graph where edges are frequently added and removed. How would you ensure minimal recomputation?
Processed query 3/10: Design an efficient algorithm to detect cycles in a dynamic graph where edges are frequently added an

In [59]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries2 = pd.read_csv(
    'test_queries2.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries2 = test_queries2.assign(Output=test_queries2['Output'].fillna(''))
test_queries2.head()

Unnamed: 0,Query,Label,Output
0,How can machine learning accelerate the discov...,Machine learning (ML) models can rapidly scree...,
1,Why does chirality emerge spontaneously in cer...,Spontaneous chiral symmetry breaking during cr...,
2,Compare the roles of frontier molecular orbita...,"In nucleophilic aromatic substitution (NAS), t...",
3,What experimental techniques are most effectiv...,Techniques like femtosecond transient absorpti...,
4,How do solid-state defects contribute to the i...,"Defects such as vacancies, interstitials, and ...",


In [60]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries2 = process_queries(test_queries2)
test_queries2.to_csv('test_queries2.csv', index=False)

Found 151228 relevant papers for query: How can machine learning accelerate the discovery of new catalytic materials for CO₂ reduction?
Processed query 1/10: How can machine learning accelerate the discovery of new catalytic materials for CO₂ reduction?

Found 77793 relevant papers for query: Why does chirality emerge spontaneously in certain crystallization processes, and what factors control it?
Processed query 2/10: Why does chirality emerge spontaneously in certain crystallization processes, and what factors control it?

Found 34182 relevant papers for query: Compare the roles of frontier molecular orbitals in explaining both nucleophilic and electrophilic aromatic substitution reactions.
Processed query 3/10: Compare the roles of frontier molecular orbitals in explaining both nucleophilic and electrophilic aromatic substitution reactions.

Found 41997 relevant papers for query: What experimental techniques are most effective in characterizing ultrafast electron transfer processes 

In [20]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries3 = pd.read_csv(
    'test_queries3.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries3 = test_queries3.assign(Output=test_queries3['Output'].fillna(''))
test_queries3.head()

Unnamed: 0,Query,Label,Output
0,How is Building Information Modeling (BIM) int...,Building Information Modeling (BIM) integrates...,
1,Describe how real-time operating systems (RTOS...,Real-time operating systems provide determinis...,
2,Explain the role of nozzle design in optimizin...,Nozzles in supersonic engines (typically conve...,
3,How does modal analysis contribute to understa...,Modal analysis identifies the natural frequenc...,
4,Compare the effectiveness and limitations of r...,"Both types recover waste heat, but their mecha...",


In [21]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries3 = process_queries(test_queries3)
test_queries3.to_csv('test_queries3.csv', index=False)

Found 198752 relevant papers for query: How is Building Information Modeling (BIM) integrated with HVAC system simulation to optimize energy usage and airflow dynamics in large-scale commercial buildings?
Processed query 1/10: How is Building Information Modeling (BIM) integrated with HVAC system simulation to optimize energy usage and airflow dynamics in large-scale commercial buildings?

Found 104397 relevant papers for query: Describe how real-time operating systems (RTOS) manage task scheduling and interrupt handling in safety-critical embedded applications, such as in automotive systems.
Processed query 2/10: Describe how real-time operating systems (RTOS) manage task scheduling and interrupt handling in safety-critical embedded applications, such as in automotive systems.

Found 58030 relevant papers for query: Explain the role of nozzle design in optimizing thrust and efficiency in supersonic jet engines. How do engineers deal with flow separation and shock waves?
Processed quer

In [21]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries4 = pd.read_csv(
    'test_queries4.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries4 = test_queries4.assign(Output=test_queries4['Output'].fillna(''))
test_queries4.head()

Unnamed: 0,Query,Label,Output
0,How does dynamic programming improve upon the ...,Dynamic programming (DP) improves performance ...,
1,"Explain how a hash table works, including how ...",A hash table maps keys to indices in an array ...,
2,Describe the concept of deadlock in operating ...,Deadlock occurs when multiple processes are st...,
3,What is the difference between breadth-first s...,Breadth-first search (BFS) and depth-first sea...,
4,"How does the TCP three-way handshake work, and...",The TCP three-way handshake is crucial for est...,


In [22]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries4 = process_queries(test_queries4)
test_queries4.to_csv('test_queries4.csv', index=False)

Found 71613 relevant papers for query: How does dynamic programming improve upon the time complexity of a naive recursive solution? Provide an example problem and explain how overlapping subproblems are handled.
Processed query 1/10: How does dynamic programming improve upon the time complexity of a naive recursive solution? Provide an example problem and explain how overlapping subproblems are handled.

Found 11948 relevant papers for query: Explain how a hash table works, including how collisions are handled. What are the trade-offs between separate chaining and open addressing?
Processed query 2/10: Explain how a hash table works, including how collisions are handled. What are the trade-offs between separate chaining and open addressing?

Found 73458 relevant papers for query: Describe the concept of deadlock in operating systems. Provide a real-world analogy and explain two methods to prevent or avoid deadlocks.
Processed query 3/10: Describe the concept of deadlock in operating sy

In [49]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries5 = pd.read_csv(
    'test_queries5.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries5 = test_queries5.assign(Output=test_queries5['Output'].fillna(''))
test_queries5.head()

Unnamed: 0,Query,Label,Output
0,What geographic and environmental factors led ...,The Fertile Crescent and Yellow River Valley b...,
1,How did the administrative systems of the Roma...,The Roman Empire utilized a decentralized appr...,
2,In what ways did the economic structures estab...,European colonialism imposed extractive econom...,
3,How have modern historians re-evaluated the ca...,"Earlier narratives, particularly in Western Eu...",
4,What might have been the global political and ...,Had the Islamic Golden Age persisted beyond th...,


In [52]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries5 = process_queries(test_queries5)
test_queries5.to_csv('test_queries5.csv', index=False)

Found 61113 relevant papers for query: What geographic and environmental factors led to the simultaneous rise of agriculture in both the Fertile Crescent and the Yellow River Valley, and how did these shape their respective societies?
Processed query 1/10: What geographic and environmental factors led to the simultaneous rise of agriculture in both the Fertile Crescent and the Yellow River Valley, and how did these shape their respective societies?

Found 33930 relevant papers for query: How did the administrative systems of the Roman Empire and the Han Dynasty differ in terms of governance, military organization, and integration of conquered peoples?
Processed query 2/10: How did the administrative systems of the Roman Empire and the Han Dynasty differ in terms of governance, military organization, and integration of conquered peoples?

Found 101198 relevant papers for query: In what ways did the economic structures established during European colonialism in Sub-Saharan Africa continu

In [23]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries6 = pd.read_csv(
    'test_queries6.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries6 = test_queries6.assign(Output=test_queries6['Output'].fillna(''))
test_queries6.head()

Unnamed: 0,Query,Label,Output
0,How did the economic policies of the Gilded Ag...,During the Gilded Age (approximately 1870–1900...,
1,Evaluate the role of geography and disease in ...,Geography determined where Europeans settled a...,
2,To what extent did the Constitution of 1787 re...,The Constitution was a compromise between demo...,
3,How did the United States justify its expansio...,The U.S. framed westward expansion as a divine...,
4,Compare the strategies and limitations of the ...,The Civil Rights Movement used nonviolent prot...,


In [24]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries6 = process_queries(test_queries6)
test_queries6.to_csv('test_queries6.csv', index=False)

Found 47889 relevant papers for query: How did the economic policies of the Gilded Age contribute to both industrial growth and systemic inequality, and what were the long-term consequences for American labor movements?
Processed query 1/10: How did the economic policies of the Gilded Age contribute to both industrial growth and systemic inequality, and what were the long-term consequences for American labor movements?

Found 94147 relevant papers for query: Evaluate the role of geography and disease in shaping the outcome of European-Native American encounters during the 17th century colonization of North America.
Processed query 2/10: Evaluate the role of geography and disease in shaping the outcome of European-Native American encounters during the 17th century colonization of North America.

Found 139848 relevant papers for query: To what extent did the Constitution of 1787 reflect both the revolutionary ideals of the American colonists and the desire to maintain social order and el

In [25]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries7 = pd.read_csv(
    'test_queries7.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries7 = test_queries7.assign(Output=test_queries7['Output'].fillna(''))
test_queries7.head()

Unnamed: 0,Query,Label,Output
0,How do epigenetic modifications influence gene...,Epigenetic modifications regulate gene activit...,
1,Describe the mechanisms of horizontal gene tra...,Horizontal gene transfer (HGT) allows prokaryo...,
2,Compare and contrast the biochemical pathways ...,Photosynthesis and cellular respiration are co...,
3,Explain the significance of the Hardy-Weinberg...,The Hardy-Weinberg equilibrium provides a math...,
4,How do changes in biodiversity affect ecosyste...,Biodiversity enhances ecosystem resilience by ...,


In [26]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries7 = process_queries(test_queries7)
test_queries7.to_csv('test_queries7.csv', index=False)

Found 53416 relevant papers for query: How do epigenetic modifications influence gene expression without altering the DNA sequence, and what are the implications for inheritance and disease?
Processed query 1/10: How do epigenetic modifications influence gene expression without altering the DNA sequence, and what are the implications for inheritance and disease?

Found 38348 relevant papers for query: Describe the mechanisms of horizontal gene transfer in prokaryotes. How do these mechanisms contribute to antibiotic resistance and microbial evolution?
Processed query 2/10: Describe the mechanisms of horizontal gene transfer in prokaryotes. How do these mechanisms contribute to antibiotic resistance and microbial evolution?

Found 53246 relevant papers for query: Compare and contrast the biochemical pathways of photosynthesis and cellular respiration. How do their interdependencies shape ecosystem energy flow?
Processed query 3/10: Compare and contrast the biochemical pathways of photos

In [27]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries8 = pd.read_csv(
    'test_queries8.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries8 = test_queries8.assign(Output=test_queries8['Output'].fillna(''))
test_queries8.head()

Unnamed: 0,Query,Label,Output
0,Explain the concept of theory of mind in devel...,Theory of mind (ToM) is the ability to attribu...,
1,Discuss the role of implicit bias in social co...,Implicit bias refers to unconscious associatio...,
2,How has the field of positive psychology redef...,Positive psychology shifts the focus from trea...,
3,To what extent do genetic predispositions and ...,Twin studies show that schizophrenia has a her...,
4,What are the psychological and neurobiological...,Addiction arises from a complex interaction be...,


In [28]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries8 = process_queries(test_queries8)
test_queries8.to_csv('test_queries8.csv', index=False)

Found 143690 relevant papers for query: Explain the concept of theory of mind in developmental psychology. How is it assessed, and what are its implications for understanding autism spectrum disorders?
Processed query 1/10: Explain the concept of theory of mind in developmental psychology. How is it assessed, and what are its implications for understanding autism spectrum disorders?

Found 71295 relevant papers for query: Discuss the role of implicit bias in social cognition. How do researchers measure implicit attitudes, and what interventions have shown promise in reducing them?
Processed query 2/10: Discuss the role of implicit bias in social cognition. How do researchers measure implicit attitudes, and what interventions have shown promise in reducing them?

Found 136872 relevant papers for query: How has the field of positive psychology redefined traditional clinical approaches to mental health, and what critiques exist regarding its application in diverse populations?
Processed q

In [22]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries9 = pd.read_csv(
    'test_queries9.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries9 = test_queries9.assign(Output=test_queries9['Output'].fillna(''))
test_queries9.head()

Unnamed: 0,Query,Label,Output
0,How do restorative justice practices challenge...,"Restorative justice focuses on healing, accoun...",
1,Examine the role of stare decisis in Supreme C...,Stare decisis—Latin for “to stand by things de...,
2,To what extent does antitrust law effectively ...,"Traditional antitrust law, focused on price-fi...",
3,Evaluate the constitutional arguments for and ...,Executive orders allow U.S. presidents to mana...,
4,How do intellectual property laws handle deriv...,The legal treatment of derivative works produc...,


In [23]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries9 = process_queries(test_queries9)
test_queries9.to_csv('test_queries9.csv', index=False)

Found 70279 relevant papers for query: How do restorative justice practices challenge traditional retributive models in criminal law, and what legal frameworks are necessary to implement them more broadly within state justice systems?
Processed query 1/10: How do restorative justice practices challenge traditional retributive models in criminal law, and what legal frameworks are necessary to implement them more broadly within state justice systems?

Found 8429 relevant papers for query: Examine the role of stare decisis in Supreme Court rulings. Under what conditions should precedent be overturned, and how has this principle evolved in controversial decisions?
Processed query 2/10: Examine the role of stare decisis in Supreme Court rulings. Under what conditions should precedent be overturned, and how has this principle evolved in controversial decisions?

Found 117615 relevant papers for query: To what extent does antitrust law effectively regulate modern tech monopolies that rely on 

In [24]:
# Load the query set; rows without a stored answer get an empty-string 'Output'.
test_queries10 = pd.read_csv(
    'test_queries10.csv',
    usecols=['Query', 'Label', 'Output'],
)
test_queries10 = test_queries10.assign(Output=test_queries10['Output'].fillna(''))
test_queries10.head()

Unnamed: 0,Query,Label,Output
0,Discuss the methodological challenges of recon...,The comparative method reconstructs proto-lang...,
1,Explain how language acquisition differs for s...,Children acquire signed languages like ASL on ...,
2,How does Grice’s Cooperative Principle and its...,Grice’s Cooperative Principle posits that effe...,
3,To what extent can typological features be pre...,Typological features often correlate with gene...,
4,How does Optimality Theory differ from rule-ba...,Optimality Theory (OT) replaces sequential pho...,


In [25]:
# Generate an answer for every row, then write the frame back over the input
# CSV: only the Query/Label/Output columns that were loaded are saved.
test_queries10 = process_queries(test_queries10)
test_queries10.to_csv('test_queries10.csv', index=False)

Found 28618 relevant papers for query: Discuss the methodological challenges of reconstructing proto-languages using the comparative method. How do linguists address issues such as borrowing and convergence?
Processed query 1/10: Discuss the methodological challenges of reconstructing proto-languages using the comparative method. How do linguists address issues such as borrowing and convergence?

Found 45040 relevant papers for query: Explain how language acquisition differs for signed languages compared to spoken languages. What does this tell us about the modality-independence of linguistic structure?
Processed query 2/10: Explain how language acquisition differs for signed languages compared to spoken languages. What does this tell us about the modality-independence of linguistic structure?

Found 34524 relevant papers for query: How does Grice’s Cooperative Principle and its maxims help explain implicature in discourse, and what are the limitations of this model?
Processed query 3/