In [1]:
import os

from neo4j import GraphDatabase

# Connection settings. Each value is read from the environment when the
# corresponding variable is set, so real credentials do not have to live in
# source control; the fallbacks are the local-development values that were
# previously hard-coded, so behaviour is unchanged when nothing is set.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "12345678"),
)

# Neo4j connection
driver = GraphDatabase.driver(URI, auth=AUTH)

# ----------- Common Queries for Project ------------------

# 1. Search protein by Entry ID or Name or Description (protein_names)
def search_protein(tx, search_term):
    """Find proteins whose identifying text contains ``search_term``.

    The match is a case-insensitive substring test against the ``entry``,
    ``entry_name`` and ``protein_names`` properties of ``Protein`` nodes; at
    most 10 rows are returned.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        search_term: Text to look for; passed to the query as ``$term``.

    Returns:
        A list with one ``record.data()`` dict per matching row, keyed by
        ``p.entry``, ``p.entry_name`` and ``p.protein_names``.
    """
    cypher = """
    MATCH (p:Protein)
    WHERE toLower(p.entry) CONTAINS toLower($term) 
       OR toLower(p.entry_name) CONTAINS toLower($term) 
       OR toLower(p.protein_names) CONTAINS toLower($term)
    RETURN p.entry, p.entry_name, p.protein_names LIMIT 10
    """
    # Convert each record to a plain dict while the transaction is still open.
    return [row.data() for row in tx.run(cypher, term=search_term)]

# 2. View neighbors and neighbors-of-neighbors
def get_protein_neighborhood(tx, entry_id):
    """Fetch the one- and two-hop neighbourhood of a protein.

    Starting from the ``Protein`` node whose ``entry`` equals ``entry_id``,
    the query follows 1 to 2 relationships in either direction and returns
    the start node, the traversed relationships and the node reached, capped
    at 50 rows.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entry_id: Value matched exactly against ``Protein.entry``.

    Returns:
        Whatever ``data()`` returns on the query result (one entry per row
        with the ``p``, ``r`` and ``n`` columns).
    """
    cypher = """
    MATCH (p:Protein {entry: $entry_id})-[r*1..2]-(n)
    RETURN p, r, n
    LIMIT 50
    """
    return tx.run(cypher, entry_id=entry_id).data()

# 3. Count labeled and unlabeled proteins
def get_label_stats(tx):
    """Count all proteins and split them by presence of ``ec_number``.

    A protein counts as ``labeled`` when its ``ec_number`` property is not
    null and as ``unlabeled`` otherwise.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        The value of ``single()`` on the query result, carrying the
        ``total``, ``labeled`` and ``unlabeled`` columns.
    """
    cypher = """
    MATCH (p:Protein)
    RETURN 
      COUNT(p) AS total,
      SUM(CASE WHEN p.ec_number IS NOT NULL THEN 1 ELSE 0 END) AS labeled,
      SUM(CASE WHEN p.ec_number IS NULL THEN 1 ELSE 0 END) AS unlabeled
    """
    stats = tx.run(cypher)
    return stats.single()

# 4. Count isolated proteins (no edges)
def get_isolated_nodes(tx):
    """Count ``Protein`` nodes that have no relationships at all.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        The value of ``single()`` on the query result, carrying the
        ``isolated_proteins`` column.
    """
    cypher = """
    MATCH (p:Protein)
    WHERE NOT (p)--()
    RETURN COUNT(p) AS isolated_proteins
    """
    outcome = tx.run(cypher)
    return outcome.single()

# 5. View predicted ECs for proteins
def get_predicted_ecs(tx):
    """List proteins that carry an ``ec_predictions`` property.

    Only nodes whose ``ec_predictions`` is not null are returned, capped at
    20 rows.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    Returns:
        A list with one ``record.data()`` dict per row, keyed by ``p.entry``
        and ``p.ec_predictions``.
    """
    cypher = """
    MATCH (p:Protein)
    WHERE p.ec_predictions IS NOT NULL
    RETURN p.entry, p.ec_predictions LIMIT 20
    """
    records = tx.run(cypher)
    return [rec.data() for rec in records]

# 6. Filter proteins by EC level prefix (e.g. level-2 = "3.1")
def filter_ec_predictions_by_prefix(tx, prefix):
    """List proteins having at least one predicted EC starting with ``prefix``.

    The query keeps a protein when any element of its ``ec_predictions``
    starts with the given prefix (for example ``"3.1"``), capped at 20 rows.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        prefix: Leading text an EC prediction must start with; passed to the
            query as ``$prefix``.

    Returns:
        A list with one ``record.data()`` dict per row, keyed by ``p.entry``
        and ``p.ec_predictions``.
    """
    cypher = """
    MATCH (p:Protein)
    WHERE ANY(ec IN p.ec_predictions WHERE ec STARTS WITH $prefix)
    RETURN p.entry, p.ec_predictions
    LIMIT 20
    """
    matches = tx.run(cypher, prefix=prefix)
    return [match.data() for match in matches]

# ----------------------------------------------------------

# Example usage:
# Run each query once inside a managed read transaction and print the result.
# The try/finally guarantees the driver (and its connections) is closed even
# when one of the queries raises; previously close() was skipped on error.
try:
    with driver.session() as session:
        print("1. Search example:")
        print(session.execute_read(search_protein, "kinase"))

        print("\n2. Neighborhood example:")
        print(session.execute_read(get_protein_neighborhood, "A0A024R6A3"))

        print("\n3. Label stats:")
        print(session.execute_read(get_label_stats))

        print("\n4. Isolated proteins:")
        print(session.execute_read(get_isolated_nodes))

        print("\n5. Predicted ECs:")
        print(session.execute_read(get_predicted_ecs))

        print("\n6. Filter ECs by prefix (e.g. level-2: '2.7'):")
        print(session.execute_read(filter_ec_predictions_by_prefix, "2.7"))
finally:
    driver.close()


1. Search example:
[{'p.entry': 'A0A087WZ06', 'p.entry_name': 'A0A087WZ06_HUMAN', 'p.protein_names': 'Serine/threonine-protein kinase 3 (EC 2.7.11.1)'}, {'p.entry': 'A0A087X0I6', 'p.entry_name': 'A0A087X0I6_HUMAN', 'p.protein_names': 'non-specific serine/threonine protein kinase (EC 2.7.11.1)'}, {'p.entry': 'A0A090N7W4', 'p.entry_name': 'A0A090N7W4_HUMAN', 'p.protein_names': 'Cell division protein kinase 5'}, {'p.entry': 'A0A0A0MRJ0', 'p.entry_name': 'A0A0A0MRJ0_HUMAN', 'p.protein_names': 'non-specific serine/threonine protein kinase (EC 2.7.11.1)'}, {'p.entry': 'A0A0A0MRJ1', 'p.entry_name': 'A0A0A0MRJ1_HUMAN', 'p.protein_names': 'non-specific serine/threonine protein kinase (EC 2.7.11.1)'}, {'p.entry': 'A0A0A6YYC0', 'p.entry_name': 'A0A0A6YYC0_HUMAN', 'p.protein_names': 'non-specific serine/threonine protein kinase (EC 2.7.11.1)'}, {'p.entry': 'A0A0D9SFP6', 'p.entry_name': 'A0A0D9SFP6_HUMAN', 'p.protein_names': 'Tyrosine-protein kinase receptor (EC 2.7.10.1)'}, {'p.entry': 'A0A0S2Z310