In [3]:
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
load_dotenv(dotenv_path=".env")
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import GraphCypherQAChain
# gemini-2.0-flash-exp

In [4]:
scholar_data = pd.read_csv("data/scholer_recommendation.csv")
scholar_data = scholar_data.drop(columns=["Abstract", "Keywords"], axis=1)
scholar_data = pd.concat([scholar_data.head(80), scholar_data.tail(20)], ignore_index=True)
scholar_data.rename(columns={'Fields of Study': 'Discipline', 'Category': 'Topic'}, inplace=True)
print(scholar_data.shape)
scholar_data.head()

(100, 9)


Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic
0,Fashion-MNIST: a Novel Image Dataset for Bench...,"Han Xiao, Kashif Rasul, Roland Vollgraf",2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,"Computer Science, Mathematics",8026,Machine Learning
1,TensorFlow: A system for large-scale machine l...,"Martín Abadi, P. Barham, Jianmin Chen, Z. Chen...",2016.0,USENIX Symposium on Operating Systems Design a...,https://www.semanticscholar.org/paper/4954fa18...,No,Computer Science,17652,Machine Learning
2,TensorFlow: Large-Scale Machine Learning on He...,"Martín Abadi, Ashish Agarwal, P. Barham, E. Br...",2016.0,arXiv.org,https://www.semanticscholar.org/paper/9c9d7247...,No,Computer Science,10819,Machine Learning
3,Stop explaining black box machine learning mod...,C. Rudin,2018.0,Nature Machine Intelligence,https://www.semanticscholar.org/paper/bc00ff34...,Yes,"Medicine, Computer Science",5152,Machine Learning
4,Convolutional LSTM Network: A Machine Learning...,"Xingjian Shi, Zhourong Chen, Hao Wang, D. Yeun...",2015.0,Neural Information Processing Systems,https://www.semanticscholar.org/paper/f9c990b1...,No,Computer Science,7266,Machine Learning


In [5]:
def duplicate_row_check(df):
    duplicate_list = []
    previous_row = 0
    D_count = 0
    for index, row in df.iterrows():
        if index > 0:
            is_match = (previous_row == row).all()
            if is_match:
                duplicate_list.append(index)
            D_count = D_count + is_match
        previous_row = row.copy()
    return D_count, duplicate_list

# Call the function with scholar_data
duplicate_count, duplicate_list = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [6]:
scholar_data['Authors_list'] = scholar_data['Authors'].str.split(',')
scholar_data = scholar_data.explode('Authors_list').reset_index(drop=True)
scholar_data["Authors"] = scholar_data["Authors_list"]
scholar_data.drop(["Authors_list"], axis=1, inplace=True)

scholar_data['Discipline_list'] = scholar_data['Discipline'].str.split(',')
scholar_data = scholar_data.explode('Discipline_list').reset_index(drop=True)
scholar_data["Discipline"] = scholar_data["Discipline_list"]
scholar_data.drop(["Discipline_list"], axis=1, inplace=True)

In [7]:
print(scholar_data.shape)
scholar_data.head()

(887, 9)


Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning


In [8]:
duplicate_count, duplicate_list = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [9]:
# Delete the specified rows
scholar_data.drop(duplicate_list, inplace=True)
scholar_data.reset_index(drop=True, inplace=True)
duplicate_count = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: (0, [])


In [10]:
scholar_data.dropna(inplace=True)
scholar_data.describe(include='all')

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic
count,849,849,849.0,849,849,849,849,849.0,849
unique,84,528,,65,84,2,18,,2
top,Accurate prediction of protein structures and ...,P. Laurienti,,arXiv.org,https://www.semanticscholar.org/paper/51362860...,Yes,Computer Science,,Machine Learning
freq,64,8,,87,64,553,378,,573
mean,,,2017.809187,,,,,4381.024735,
std,,,3.443117,,,,,10731.748135,
min,,,1997.0,,,,,9.0,
25%,,,2016.0,,,,,279.0,
50%,,,2019.0,,,,,1557.0,
75%,,,2021.0,,,,,2681.0,


In [11]:
duplicate_count = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: (0, [])


In [12]:
print(scholar_data.shape)
scholar_data.rename(columns={'Title': 'Paper Title', 'Authors': 'Author', 'Year': 'Year Published'}, inplace=True)

(849, 9)


In [13]:
def categorize_venue(venue):
    if 'conference' in venue.lower() or 'symposium' in venue.lower() or 'workshop' in venue.lower():
        return 'Conference'
    elif 'journal' in venue.lower() or 'transactions' in venue.lower() or 'letters' in venue.lower():
        return 'Journal'
    else:
        return 'Other'

scholar_data['Venue Type'] = scholar_data['Venue'].apply(categorize_venue)
scholar_data = scholar_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
duplicate_count = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: (1, [637])


  scholar_data = scholar_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [14]:
print(scholar_data.shape)
scholar_data.drop(duplicate_list, inplace=True)
scholar_data.reset_index(drop=True, inplace=True)
duplicate_count = duplicate_row_check(scholar_data)
print(f"Number of duplicate rows: {duplicate_count}")
print(scholar_data.shape)

(849, 10)
Number of duplicate rows: (1, [600])
(849, 10)


In [15]:
# Initialize Neo4j connection
gemini_api = os.getenv("GOOGLE_API_KEY")
neo4j_url = os.getenv("NEO4J_CONNECTION_URL")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")
neo4j_db = os.getenv("NEO4J_SCHOLAR")

# graph = Neo4jGraph(neo4j_url,neo4j_user,neo4j_password)
graph = Neo4jGraph(neo4j_url, neo4j_user, neo4j_password, database=neo4j_db)


In [16]:
query = """
MATCH (n)
DETACH DELETE n
"""
graph.query(query) 

[]

In [17]:
for index, row in scholar_data.iterrows():
    paper_title = row['Paper Title']
    author = row['Author']
    year_published = row['Year Published']
    citations = row['Citations']
    discipline = row['Discipline']
    venue = row['Venue']
    topic = row['Topic']
    
    # Create Paper node
    query = """
    MERGE (p:Paper {title: $paper_title, year: $year_published, citations: $citations, topic: $topic})
    """
    parameters = {
        'paper_title': paper_title,
        'year_published': year_published,
        'citations': citations,
        'topic': topic
    }
    graph.query(query, parameters)

    # Create Author node
    query = """
    MERGE (a:Author {name: $author})
    """
    parameters = {
        'author': author
    }
    graph.query(query, parameters)

    # Create AUTHORED relationship
    query = """
    MATCH (p:Paper {title: $paper_title, year: $year_published})
    MATCH (a:Author {name: $author})
    MERGE (a)-[:AUTHORED]->(p)
    """
    parameters = {
        'paper_title': paper_title,
        'year_published': year_published,
        'author': author
    }
    graph.query(query, parameters)
    
    # Create Field of Study node
    query = """
    MERGE (f:Discipline {name: $discipline})
    """
    parameters = {
        'discipline': discipline
    }
    graph.query(query, parameters)

    # Create BELONGS_TO relationship
    query = """
    MATCH (p:Paper {title: $paper_title, year: $year_published})
    MATCH (f:Discipline {name: $discipline})
    MERGE (p)-[:BELONGS_TO]->(f)
    """
    parameters = {
        'paper_title': paper_title,
        'year_published': year_published,
        'discipline': discipline
    }
    graph.query(query, parameters)
    
    # Create Venue node
    query = """
    MERGE (v:Venue {name: $venue})
    """
    parameters = {
        'venue': venue
    }
    graph.query(query, parameters)

    # Create PUBLISHED_IN relationship between Paper and Venue
    query = """
    MATCH (p:Paper {title: $paper_title, year: $year_published})
    MATCH (v:Venue {name: $venue})
    MERGE (p)-[:PUBLISHED_IN]->(v)
    """
    parameters = {
        'paper_title': paper_title,
        'year_published': year_published,
        'venue': venue
    }
    graph.query(query, parameters)

    

In [18]:
print(scholar_data["Discipline"].unique())
print(scholar_data["Topic"].unique())
print(scholar_data["Author"].unique()[:20])

['Computer Science' 'Mathematics' 'Medicine' 'No fields available'
 'Chemistry' 'Materials Science' 'Environmental Science' 'Physics'
 'Biology' 'Psychology' 'Sociology' 'Engineering']
['Machine Learning' 'Network Science']
['Han Xiao' 'Kashif Rasul' 'Roland Vollgraf' 'Martín Abadi' 'P. Barham'
 'Jianmin Chen' 'Z. Chen' 'Andy Davis' 'J. Dean' 'M. Devin'
 'Sanjay Ghemawat' 'G. Irving' 'M. Isard' 'M. Kudlur' 'J. Levenberg'
 'R. Monga' 'Sherry Moore' 'D. Murray' 'Benoit Steiner' 'P. Tucker']


In [19]:
scholar_data.columns

Index(['Paper Title', 'Author', 'Year Published', 'Venue', 'URL',
       'Open Access', 'Discipline', 'Citations', 'Topic', 'Venue Type'],
      dtype='object')

In [20]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Paper {citations: INTEGER, title: STRING, topic: STRING, year: FLOAT}
Author {name: STRING}
Discipline {name: STRING}
Venue {name: STRING}
Relationship properties:

The relationships:
(:Paper)-[:BELONGS_TO]->(:Discipline)
(:Paper)-[:PUBLISHED_IN]->(:Venue)
(:Author)-[:AUTHORED]->(:Paper)


In [21]:
example = [
    {
        "question": "List all papers authored by 'Han Xiao'.",
        "query": "MATCH (a:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper) RETURN p.title AS PapersAuthoredByHanXiao",
    },
    {
        "question": "Which papers belong to the 'Computer Science' discipline?",
        "query": "MATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline {name: 'Computer Science'}) RETURN p.title AS PapersInComputerScience Limit 5"
    },
    {
        "question": "What are the papers published in 'Nature' in the year 2018?",
        "query": "MATCH (p:Paper)-[:PUBLISHED_IN]->(v:Venue {name: 'Nature'}) WHERE p.year = 2018 RETURN p.title AS PapersPublishedInNature2018"
    },
    {
        "question": "How many papers did 'Jianmin Chen' author?",
        'query': "MATCH (a:Author {name: 'Jianmin Chen'})-[:AUTHORED]->(p:Paper) RETURN COUNT(p) AS NumberOfPapersAuthoredByJianminChen"
    },
    {
        "question": "List all authors who have published papers in the topic 'Machine Learning'.",
        "query": "MATCH (a:Author)-[:AUTHORED]->(p:Paper {topic: 'Machine Learning'}) RETURN DISTINCT a.name AS AuthorsInMachineLearning Limit 5"
    },
    {
        'question': "What are the most cited papers in 'Mathematics'?",
        'query': "MATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline {name: 'Mathematics'}) RETURN p.title AS PapersInComputerScience Limit 5"
    },
    {
        'question': "What are the most cited papers in 'Materials Science' discipline?",
        'query': "MATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline {name: 'Materials Science'}) RETURN p.title AS Paper, p.citations AS Citations ORDER BY Citations DESC LIMIT 5"
    },
    {
        'question': "Which venues have published papers in the 'Network Science' topic?",
        'query': "MATCH (p:Paper {topic: 'Network Science'})-[:PUBLISHED_IN]->(v:Venue) RETURN DISTINCT v.name AS VenuesForNetworkScience LIMIT 5"
    },
    {
        'question': "I am 'Han Xiao' conducts research in 'Computer Science' and 'Machine Learning'. Which professors should he collaborate with?",
        'query': "MATCH (a:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper)-[:BELONGS_TO]->(d:Discipline) WHERE d.name = 'Computer Science' OR p.topic = 'Machine Learning' WITH DISTINCT d AS Discipline, p.topic AS Topic MATCH (other:Author)-[:AUTHORED]->(:Paper)-[:BELONGS_TO]->(d) WHERE other.name <> 'Han Xiao' RETURN DISTINCT other.name AS PotentialCollaborators LIMIT 5"
    },
    {
        'question': "I am 'Han Xiao'. Which researchers I collaborated with before?",
        'query': "MATCH (a1:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper)<-[:AUTHORED]-(a2:Author) WHERE a1 <> a2 RETURN DISTINCT a2.name"
    },
    {
        'question': "I am 'Han Xiao'. Which new researchers should I collaborate with for future work?",
        'query': "MATCH (a1:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper)-[:BELONGS_TO]->(d:Discipline)<-[:BELONGS_TO]-(p2:Paper)<-[:AUTHORED]-(a2:Author) WHERE a1 <> a2 RETURN a2.name, COUNT(p2) AS collaborations ORDER BY collaborations DESC"
    },
    {
        'question': "I am 'Kashif Rasul'. I have some workes in 'Mathematics' and want to expand my research in this field. Which researchers should I collaborate with based on papers related to 'Mathematics'?",
        'query': "MATCH (a:Author {name: 'Kashif Rasul'})-[:AUTHORED]->(p:Paper)-[:BELONGS_TO]->(d:Discipline) WHERE d.name = 'Mathematics' WITH DISTINCT d AS Discipline MATCH (other:Author)-[:AUTHORED]->(:Paper)-[:BELONGS_TO]->(d) WHERE other.name <> 'Kashif Rasul' RETURN DISTINCT other.name AS PotentialCollaborators"
    }
    # {
    #     'question': "I am from 'Medicine' Discipline. But i want to expand my research in 'Machine Learning' topic. Which researchers should I collaborate with?",
    #     'query': "MATCH (a:Author)-[:AUTHORED]->(p:Paper)-[:BELONGS_TO]->(d:Discipline {name: 'Medicine'}) WHERE p.topic = 'Machine Learning' return DISTINCT a.name"
    # }
]


In [22]:
scholar_data.head()

Unnamed: 0,Paper Title,Author,Year Published,Venue,URL,Open Access,Discipline,Citations,Topic,Venue Type
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Other
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Other
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Other
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Other
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Other


In [23]:
# Updated Cypher Query Prompt Template
cypher_generation_prompt = PromptTemplate(
    template="""Based on the schema, write a Cypher query to answer the question.

    The question may ask about:
    - Authors and their research fields
    - Publication venues and trends
    - Paper citations and collaborations
    - Discipline for authors and papers
    - Recommendations for collaborations or venues

    Schema:
    {schema}

    Example questions and queries:
    {example}

    **Important**:
    - Always filter by specific properties in the question when provided, such as `category` for papers or `name` for authors.
    - Ensure the query aligns precisely with the requested category, author, or venue.
    - When counting or aggregating, provide meaningful aliases like `VenueName`, `PaperCount`, or `AuthorName`.
    - Do not include irrelevant nodes or relationships in the query.

    Question: {question}
    Query:""",
    input_variables=["schema", "question", "example"],
)


# Define the answer generation prompt template for the scholar knowledge graph
qa_prompt = PromptTemplate(
    template="""Based on the Cypher query results, answer the question.
    Question: {question}
    Results: {context}
    Give a clear, direct, and human-friendly answer using the data from the results. 
    If it's a list, combine all items and summarize. For example, for authors or papers, list them in a human-readable format.
    Answer:""",
    input_variables=["question", "context"],
)

In [32]:
# Initialize the LLM (Google Generative AI)
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro-preview-03-25", google_api_key=gemini_api, temperature=0)
# llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", google_api_key=gemini_api, temperature=0)

ALLOW_DANGEROUS_REQUEST = True

# Define the chain
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,  # Your Neo4j graph object
    verbose=True,
    cypher_generation_prompt=cypher_generation_prompt,
    qa_prompt=qa_prompt,
    allow_dangerous_requests=ALLOW_DANGEROUS_REQUEST,
)

In [33]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Paper {citations: INTEGER, title: STRING, topic: STRING, year: FLOAT}
Author {name: STRING}
Discipline {name: STRING}
Venue {name: STRING}
Relationship properties:

The relationships:
(:Paper)-[:BELONGS_TO]->(:Discipline)
(:Paper)-[:PUBLISHED_IN]->(:Venue)
(:Author)-[:AUTHORED]->(:Paper)


In [34]:
import io
import sys
import re

def clean_ansi(text):
    # Remove ANSI escape codes
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape.sub('', text).strip()

def test_query(question):
    try:
        # Create a string buffer to capture the output
        output_buffer = io.StringIO()
        original_stdout = sys.stdout
        # sys.stdout = output_buffer

        # Explicitly generate the Cypher query first using the prompt
        generated_query = cypher_generation_prompt.format(
            schema=graph.schema,  # Ensure dynamic schema usage
            question=question,
            example=example
        )

        # Run the chain with the generated query
        result = chain.run(query=generated_query, question=question)
        
        # Restore original stdout and get the captured output
        sys.stdout = original_stdout
        output = output_buffer.getvalue()
        
        # Extract Cypher query and context from the captured output
        cypher_query = None
        full_context = None
        
        if 'Generated Cypher:' in output:
            cypher_query = output.split('Generated Cypher:')[1].split('Full Context:')[0].strip()
            cypher_query = clean_ansi(cypher_query)
        
        if 'Full Context:' in output:
            full_context = output.split('Full Context:')[1].split('>')[0].strip()
            full_context = clean_ansi(full_context)
        
        print(f"Q: {question}")
        print(f"A: {result}\n")
        
        return {
            'result': result,
            'cypher_query': cypher_query,
            'full_context': full_context
        }
    except Exception as e:
        print(f"Error: {str(e)}")
        return {
            'result': None,
            'cypher_query': None,
            'full_context': None,
            'error': str(e)
        }

In [35]:
response = test_query("List all papers authored by 'Han Xiao'.")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (a:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper)
RETURN p
[0m
Full Context:
[32;1m[1;3m[{'p': {'citations': 8026, 'year': 2017.0, 'topic': 'Machine Learning', 'title': 'Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms'}}][0m

[1m> Finished chain.[0m
Q: List all papers authored by 'Han Xiao'.
A: Han Xiao authored the paper titled 'Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms'.



In [47]:
distinct_authors = scholar_data.groupby('Author', as_index=False).agg({
    'Discipline': lambda x: ', '.join(sorted(set(x))),
    'Topic': lambda x: ', '.join(sorted(set(x)))
})
distinct_authors.head()

Unnamed: 0,Author,Discipline,Topic
0,A. Bradley,Computer Science,Machine Learning
1,A. Esterline,Computer Science,Network Science
2,A. Galstyan,"Computer Science, Mathematics",Machine Learning
3,A. Gruber,Computer Science,Network Science
4,A. Harp,Computer Science,Machine Learning


In [48]:
distinct_authors.Topic.unique()

array(['Machine Learning', 'Network Science'], dtype=object)

In [49]:
response = test_query("i want to create a grsreach group on 'AI network optimization' with these reserachers: ['A. Bradley', 'A. Galstyan', 'A. Gruber']. who should be Principal Investigator?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (a:Author)-[:AUTHORED]->(p:Paper)
WHERE a.name IN ['A. Bradley', 'A. Galstyan', 'A. Gruber'] AND p.topic = 'AI network optimization'
WITH a, sum(p.citations) AS totalCitations
ORDER BY totalCitations DESC
LIMIT 1
RETURN a.name
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
Q: i want to create a grsreach group on 'AI network optimization' with these reserachers: ['A. Bradley', 'A. Galstyan', 'A. Gruber']. who should be Principal Investigator?
A: Based on the query results, none of the researchers ('A. Bradley', 'A. Galstyan', 'A. Gruber') have authored papers specifically on the topic 'AI network optimization' in the database. Therefore, we cannot determine a Principal Investigator from this list based on publication count in this specific topic.



In [32]:
response = test_query("List all papers authored by 'Benoit Steiner'.")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (author:Author {name: 'Benoit Steiner'})-[:AUTHORED]->(paper:Paper)
RETURN paper
[0m
Full Context:
[32;1m[1;3m[{'paper': {'citations': 10819, 'year': 2016.0, 'topic': 'Machine Learning', 'title': 'TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems'}}, {'paper': {'citations': 17652, 'year': 2016.0, 'topic': 'Machine Learning', 'title': 'TensorFlow: A system for large-scale machine learning'}}][0m

[1m> Finished chain.[0m
Q: List all papers authored by 'Benoit Steiner'.
A: The papers authored by 'Benoit Steiner' are: 'TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems' and 'TensorFlow: A system for large-scale machine learning'.




In [33]:
response = test_query("Which papers belong to the 'Computer Science' discipline?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline)
WHERE d.name = 'Computer Science'
RETURN p
[0m
Full Context:
[32;1m[1;3m[{'p': {'citations': 10819, 'year': 2016.0, 'topic': 'Machine Learning', 'title': 'TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems'}}, {'p': {'citations': 5152, 'year': 2018.0, 'topic': 'Machine Learning', 'title': 'Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead'}}, {'p': {'citations': 7266, 'year': 2015.0, 'topic': 'Machine Learning', 'title': 'Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting'}}, {'p': {'citations': 4017, 'year': 2017.0, 'topic': 'Machine Learning', 'title': 'An Introduction to Machine Learning'}}, {'p': {'citations': 3527, 'year': 2019.0, 'topic': 'Machine Learning', 'title': 'A Survey on Bias and Fairness in Machine Learning'}},

In [34]:
response = test_query("Which papers belong to the 'Sociology' discipline?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline)
WHERE d.name = 'Sociology'
RETURN p
[0m
Full Context:
[32;1m[1;3m[{'p': {'citations': 19, 'year': 2014.0, 'topic': 'Network Science', 'title': 'NetSci High: Bringing Network Science Research to High Schools'}}, {'p': {'citations': 11, 'year': 2013.0, 'topic': 'Network Science', 'title': 'Commentary: Teach network science to teenagers'}}][0m

[1m> Finished chain.[0m
Q: Which papers belong to the 'Sociology' discipline?
A: The papers belonging to the 'Sociology' discipline are: "NetSci High: Bringing Network Science Research to High Schools" and "Commentary: Teach network science to teenagers".




In [35]:
response = test_query("What are the papers published in 'Nature Microbiology' in the year 2021?")




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Paper)-[:PUBLISHED_IN]->(v:Venue)
WHERE v.name = 'Nature Microbiology' AND p.year = 2021.0
RETURN p
[0m
Full Context:
[32;1m[1;3m[{'p': {'citations': 63, 'year': 2021.0, 'topic': 'Network Science', 'title': 'The science of the host–virus network'}}][0m

[1m> Finished chain.[0m
Q: What are the papers published in 'Nature Microbiology' in the year 2021?
A: The paper titled "The science of the host–virus network" was published in 'Nature Microbiology' in the year 2021.




In [36]:
response = test_query("How many papers did 'Mathieu Blondel' author?")



[1m> Entering new GraphCypherQAChain chain...[0m


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Generated Cypher:
[32;1m[1;3mMATCH (a:Author {name: 'Mathieu Blondel'})-[:AUTHORED]->(p:Paper)
RETURN count(p)
[0m
Full Context:
[32;1m[1;3m[{'count(p)': 1}][0m
Error: 429 Resource has been exhausted (e.g. check quota).


In [39]:
response = test_query("List all authors who have published papers in the topic 'Machine Learning'.")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:Author)-[:AUTHORED]->(p:Paper) WHERE p.topic = 'Machine Learning' RETURN a.name
[0m
Full Context:
[32;1m[1;3m[{'a.name': 'Vijay Vasudevan'}, {'a.name': 'Pete Warden'}, {'a.name': 'M. Wicke'}, {'a.name': 'Yuan Yu'}, {'a.name': 'Ashish Agarwal'}, {'a.name': 'E. Brevdo'}, {'a.name': 'C. Citro'}, {'a.name': 'G. Corrado'}, {'a.name': 'I. Goodfellow'}, {'a.name': 'A. Harp'}][0m

[1m> Finished chain.[0m
Q: List all authors who have published papers in the topic 'Machine Learning'.
A: Vijay Vasudevan, Pete Warden, M. Wicke, Yuan Yu, Ashish Agarwal, E. Brevdo, C. Citro, G. Corrado, I. Goodfellow, and A. Harp have published papers in the topic 'Machine Learning'.




In [40]:
response = test_query("What are the most cited papers in 'Medicine'?")
print(f"Cypher Query: {response['cypher_query']}")
print(f"Full Context: {response['full_context']}")
print(f"Result: {response['result']}")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline)
WHERE d.name = 'Medicine'
RETURN p.title, p.citations
ORDER BY p.citations DESC
[0m
Full Context:
[32;1m[1;3m[{'p.title': 'Gaussian Processes For Machine Learning', 'p.citations': 16596}, {'p.title': 'Machine learning: Trends, perspectives, and prospects', 'p.citations': 5899}, {'p.title': 'Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead', 'p.citations': 5152}, {'p.title': 'SoilGrids250m: Global gridded soil information based on machine learning', 'p.citations': 2681}, {'p.title': 'Machine learning for molecular and materials science', 'p.citations': 2600}, {'p.title': 'Multimodal Machine Learning: A Survey and Taxonomy', 'p.citations': 2524}, {'p.title': 'Accurate prediction of protein structures and interactions using a 3-track neural network', 'p.citations': 2243}, {'p.title': 'Machine Le

In [41]:
response = test_query("Which venues have published papers in the 'Machine Learning' topic?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Paper)-[:PUBLISHED_IN]->(v:Venue)
WHERE p.topic = 'Machine Learning'
RETURN DISTINCT v.name
[0m
Full Context:
[32;1m[1;3m[{'v.name': 'arXiv.org'}, {'v.name': 'Nature Machine Intelligence'}, {'v.name': 'Neural Information Processing Systems'}, {'v.name': 'Cambridge International Law Journal'}, {'v.name': 'ACM Computing Surveys'}, {'v.name': 'Science'}, {'v.name': 'Introduction to AI Techniques for Renewable Energy Systems'}, {'v.name': 'Nature Reviews Physics'}, {'v.name': 'SN Computer Science'}, {'v.name': 'Hands-On Machine Learning with R'}][0m

[1m> Finished chain.[0m
Q: Which venues have published papers in the 'Machine Learning' topic?
A: The following venues have published papers in the 'Machine Learning' topic: arXiv.org, Nature Machine Intelligence, Neural Information Processing Systems, Cambridge International Law Journal, ACM Computing Surveys, Science, Introduction to AI Techniq

In [42]:
response = test_query("I am 'Han Xiao'. Which researchers I collaborated with before?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a1:Author {name: 'Han Xiao'})-[:AUTHORED]->(p:Paper)-[:AUTHORED]->(a2:Author)
WHERE a1 <> a2
RETURN DISTINCT a2.name
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
Q: I am 'Han Xiao'. Which researchers I collaborated with before?
A: Han Xiao has not collaborated with any other researchers.




In [43]:
response = test_query("I am 'Kashif Rasul'. Which researchers I collaborated with before?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:Author {name: 'Kashif Rasul'})-[:AUTHORED]->(p:Paper)<-[:AUTHORED]-(coauthor:Author)
WHERE coauthor <> a
RETURN DISTINCT coauthor.name
[0m
Full Context:
[32;1m[1;3m[{'coauthor.name': 'Han Xiao'}, {'coauthor.name': 'Roland Vollgraf'}][0m

[1m> Finished chain.[0m
Q: I am 'Kashif Rasul'. Which researchers I collaborated with before?
A: Kashif Rasul has collaborated with Han Xiao and Roland Vollgraf.




In [44]:
response = test_query("I am 'Han Xiao'. Which researcher should I collaborate with for future work?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a1:Author {name: "Han Xiao"})-[:AUTHORED]->(p:Paper)<-[:AUTHORED]-(a2:Author)
WHERE a1 <> a2
RETURN a2.name, COUNT(p) AS collaborations
ORDER BY collaborations DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'a2.name': 'Kashif Rasul', 'collaborations': 1}][0m

[1m> Finished chain.[0m
Q: I am 'Han Xiao'. Which researcher should I collaborate with for future work?
A: Han Xiao should consider collaborating with Kashif Rasul.




In [45]:
response = test_query("I am 'Kashif Rasul'. Which professors should I collaborate with for future work?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:Author {name: 'Kashif Rasul'})-[:AUTHORED]->(p:Paper)-[:BELONGS_TO]->(d:Discipline)<-[:BELONGS_TO]-(p2:Paper)<-[:AUTHORED]-(a2:Author)
WHERE a <> a2
RETURN a2.name, COUNT(p2) AS collaborations
ORDER BY collaborations DESC
[0m
Full Context:
[32;1m[1;3m[{'a2.name': 'I. Goodfellow', 'collaborations': 4}, {'a2.name': 'Dzmitry Bahdanau', 'collaborations': 4}, {'a2.name': 'Kyunghyun Cho', 'collaborations': 4}, {'a2.name': 'Yoshua Bengio', 'collaborations': 4}, {'a2.name': 'C. Rudin', 'collaborations': 3}, {'a2.name': 'L. Bottou', 'collaborations': 3}, {'a2.name': 'Hsin-Yuan Huang', 'collaborations': 3}, {'a2.name': 'Vijay Vasudevan', 'collaborations': 2}, {'a2.name': 'Pete Warden', 'collaborations': 2}, {'a2.name': 'M. Wicke', 'collaborations': 2}][0m

[1m> Finished chain.[0m
Q: I am 'Kashif Rasul'. Which professors should I collaborate with for future work?
A: Kashif Rasul could collaborate wi

In [46]:
response = test_query("I am from 'Medicine' Discipline. But i want to expand my research in 'Machine Learning' topic. Which researchers should I collaborate with?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p1:Paper)-[:BELONGS_TO]->(:Discipline {name: 'Medicine'})
MATCH (p2:Paper {topic: 'Machine Learning'})
MATCH (a:Author)-[:AUTHORED]->(p2)
RETURN DISTINCT a.name
[0m
Full Context:
[32;1m[1;3m[{'a.name': 'Vijay Vasudevan'}, {'a.name': 'Pete Warden'}, {'a.name': 'M. Wicke'}, {'a.name': 'Yuan Yu'}, {'a.name': 'Ashish Agarwal'}, {'a.name': 'E. Brevdo'}, {'a.name': 'C. Citro'}, {'a.name': 'G. Corrado'}, {'a.name': 'I. Goodfellow'}, {'a.name': 'A. Harp'}][0m

[1m> Finished chain.[0m
Q: I am from 'Medicine' Discipline. But i want to expand my research in 'Machine Learning' topic. Which researchers should I collaborate with?
A: Based on the provided data, you could collaborate with Vijay Vasudevan, Pete Warden, M. Wicke, Yuan Yu, Ashish Agarwal, E. Brevdo, C. Citro, G. Corrado, I. Goodfellow, and A. Harp.  These researchers are all working on topics related to Machine Learning.




In [20]:
TEST_SET = [
    {
        "question": "List the titles of papers authored by 'Jianmin Chen' in 2016?",
        "query": "",
        "answer": "Jianmin Chen authored the paper 'TensorFlow: A system for large-scale machine learning' in 2016.",
        "DF": "",
    },
    {
        "question": "What are the most cited papers in 'Physics'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data.Discipline == 'Physics'][['Paper Title', 'Citations']].drop_duplicates().sort_values(by='Citations', ascending=False).head(5)"
    },
    {
        "question": "Which authors have worked on the 'Network Science' topic?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data.Topic == 'Network Science']['Author'].drop_duplicates().head(5)"
    },
    {
        "question": "What venues have published papers in 'Environmental Science' Discipline?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data.Discipline == 'Environmental Science']['Venue'].drop_duplicates().head(5)"
    },
    {
        "question": "How many papers authored by 'Roland Vollgraf'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data.Author == 'Roland Vollgraf']['Paper Title'].drop_duplicates().shape[0]"
    },
    {
        "question": "I am 'Kashif Rasul'. Which researchers I collaborated with before?",
        "query": "",
        "answer": "Kashif Rasul has collaborated with Han Xiao and Roland Vollgraf.",
        "DF": ""
    },
    {
        "question": "I am 'Kashif Rasul'. Which professors should I collaborate with for future work?",
        "query": "",
        "answer": "Kashif Rasul could collaborate with Vijay Vasudevan, Pete Warden, M. Wicke, Yuan Yu, Ashish Agarwal, E. Brevdo, C. Citro, G. Corrado, I. Goodfellow, and A. Harp.",
        "GT_NDCG": ["Vijay Vasudevan", "Pete Warden", "M. Wicke", "Yuan Yu", "Ashish Agarwal", "E. Brevdo", "C. Citro", "G. Corrado", "I. Goodfellow", "A. Harp"],
        "DF": ""
    },
    {
        "question": "List all papers published in 'Biology' before 2015",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data.Discipline == 'Biology') & (scholar_data['Year Published'] < 2015)]['Paper Title'].drop_duplicates().to_list()"
    },
    {
        "question": "Find the authors who have contributed to papers in 'Chemistry' and 'Materials Science'",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data.Discipline.isin(['Chemistry', 'Materials Science'])]['Author'].drop_duplicates().head(5).to_list()"
    },
    {
        "question": "Which papers were published in the 'Medicine' discipline in venues of type 'Journal'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data.Discipline == 'Medicine') & (scholar_data['Venue Type'] == 'Journal')]['Paper Title'].drop_duplicates().to_list()"
    },
    {
        "question": "List papers with 'Machine Learning' as the topic and citations greater than 5000.",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data.Topic == 'Machine Learning') & (scholar_data.Citations > 5000)]['Paper Title'].drop_duplicates().head().to_list()"
    },
    {
        "question": "What are the venues where 'Jianmin Chen' has published papers",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data['Author'] == 'Jianmin Chen']['Venue'].drop_duplicates().to_list()"
    },
    {
        "question": "What are the papers authored by 'Roland Vollgraf' in 'Machine Learning'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data['Author'] == 'Roland Vollgraf') & (scholar_data['Topic'] == 'Machine Learning')]['Paper Title'].drop_duplicates().tolist()"
    },
    {
        "question": "Which authors have worked in both 'Computer Science' and 'Mathematics'?",
        "query": "",
        "answer": "",
        "DF": "cs_authors = scholar_data[scholar_data['Discipline'] == 'Computer Science']['Author'] / math_authors = scholar_data[scholar_data['Discipline'] == 'Mathematics']['Author']/ cs_authors[cs_authors.isin(math_authors)].drop_duplicates().to_list()[:5]"
    },
    {
        "question": "Which authors have published in 'Materials Science' and the venue 'Nature'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data['Discipline'] == 'Materials Science') & (scholar_data['Venue'] == 'Nature')]['Author'].drop_duplicates().to_list()"
    },
    {
        "question": "What are the least cited papers in the discipline 'Psychology'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data['Discipline'] == 'Psychology'][['Paper Title', 'Citations']].sort_values(by='Citations').drop_duplicates()['Paper Title'].to_list()"
    },
    {
        "question": "what are the top venues for papers on 'Network Science'?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[scholar_data['Topic'] == 'Network Science']['Venue'].value_counts().index.tolist()[:5]"
    },
    {
        "question": "Which authors published papers in 'Medicine' after 2015?",
        "query": "",
        "answer": "",
        "DF": "scholar_data[(scholar_data['Discipline'] == 'Medicine') & (scholar_data['Year Published'] > 2015)]['Author'].to_list()[:5]"
    },
    {
        "question": "which authors have collaborated with 'D. Davies' on papers in 'Chemistry'?",
        "query": "",
        "answer": "",
        "DF": "z_chen_papers = scholar_data[(scholar_data['Author'] == 'D. Davies') & (scholar_data['Discipline'] == 'Chemistry')]['Paper Title']/scholar_data[scholar_data['Paper Title'].isin(z_chen_papers) & (scholar_data['Author'] != 'Z. Chen')]['Author'].drop_duplicates().to_list()"
    }
]


19

In [48]:
c = 1
for test_data in TEST_SET:
    print(f"Test Case: {c}")
    question = test_data["question"]
    query = test_data["query"]
    # answer = test_data["answer"]
    test_query(question)
    print(test_data["DF"])
    c += 1
    print("-----------------------------------------------------------")

Test Case: 1


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (a:Author {name: 'Jianmin Chen'})-[:AUTHORED]->(p:Paper {year: 2016})
RETURN p.title
[0m
Full Context:
[32;1m[1;3m[{'p.title': 'TensorFlow: A system for large-scale machine learning'}][0m

[1m> Finished chain.[0m
Q: List the titles of papers authored by 'Jianmin Chen' in 2016?
A: Jianmin Chen authored the paper "TensorFlow: A system for large-scale machine learning" in 2016.



-----------------------------------------------------------
Test Case: 2


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Paper)-[:BELONGS_TO]->(d:Discipline {name: 'Physics'})
RETURN p
ORDER BY p.citations DESC
[0m
Full Context:
[32;1m[1;3m[{'p': {'citations': 1830, 'year': 2019.0, 'topic': 'Machine Learning', 'title': 'Machine Learning for Fluid Mechanics'}}, {'p': {'citations': 1557, 'year': 2017.0, 'topic': 'Machine Learning', 'title': 'MoleculeNet: a ben