# Finding similar analyses based on brain region and cell type

This is the code to find all similar analyses that are from the same specie, brain region and cell types.

The result is also stored in a csv for use at the website.

If you have cloned this project, the generated similarity file is already created and all you need to run is the `1. Create the Rodent Basal Ganglia Graph` notebook on the start page.

In [46]:
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv
import os

load_dotenv()

neo4jUser = os.getenv("NEO4J_USER")
neo4jPwd = os.getenv("NEO4J_PASSWORD_DS")
neo4jUrl = os.getenv("NEO4j_BOLT_DS")

driver = GraphDatabase.driver(neo4jUrl,auth=basic_auth(neo4jUser, neo4jPwd))

## Create the graph projection

Create a projection of analyses connected with brain region and cell type, per specie

For simplicity, we first add a special named relationship between the analyzes and brain region, cell type and specie.
This relationship is cleaned up after the algorithm is complete.

In [65]:
with driver.session() as session:
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(:RegionRecord)-[:PRIMARY_REGION]->(b:BrainRegion)
        MERGE (n)-[:NODE_SIMILARITY]->(b)
    """)
    session.run("""
        MATCH (n:Analysis)-->(c:CellType)
        MERGE (n)-[:NODE_SIMILARITY]->(c)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Specimen)-->(s:Specie)
        MERGE (n)-[:NODE_SIMILARITY]->(s)
    """)



In [66]:
with driver.session() as session:
    res = session.run("""
        CALL gds.graph.create(
            'analyses', 
            ["CellType", "BrainRegion", "Specie", "Analysis"], 
            'NODE_SIMILARITY'
        )
    """)
    for rec in res:
        print(rec)
    old =""" CALL gds.graph.create.cypher(
            'analyses-rat',
            'MATCH (n:Analysis)-[:SIMILARITY_ALGORITHM]->(m) WHERE (n)-->(:Specie {id: "1"}) WITH collect(n)+collect(m) as nodes UNWIND nodes as x RETURN id(x) as id',
            'MATCH (n:Analysis)-[:SIMILARITY_ALGORITHM]->(m) WHERE (n)-->(:Specie {id: "1"}) RETURN id(n) AS source, id(m) AS target'
        )"""
    res = session.run("""
        CALL gds.graph.create.cypher(
            'analyses-mouse',
            'MATCH (n:Analysis)-[:NODE_SIMILARITY]->(m) WHERE (n)-->(:Specie {id: "2"}) WITH collect(n)+collect(m) as nodes UNWIND nodes as x RETURN id(x) as id',
            'MATCH (n:Analysis)-[:NODE_SIMILARITY]->(m) WHERE (n)-->(:Specie {id: "2"}) RETURN id(n) AS source, id(m) AS target'
        )
    """)
    for rec in res:
        print(rec)



<Record graphName='analyses' nodeProjection={'BrainRegion': {'properties': {}, 'label': 'BrainRegion'}, 'Specie': {'properties': {}, 'label': 'Specie'}, 'CellType': {'properties': {}, 'label': 'CellType'}, 'Analysis': {'properties': {}, 'label': 'Analysis'}} relationshipProjection={'NODE_SIMILARITY': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': 'NODE_SIMILARITY', 'properties': {}}} nodeCount=861 relationshipCount=2133 createMillis=5>
<Record graphName='analyses-mouse' nodeProjection={'*': {'properties': {}, 'label': '*'}} relationshipProjection={'*': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': '*', 'properties': {}}} nodeCount=2020 relationshipCount=1010 createMillis=76>


## Run the similarity algorithm and store the results

### Mouse

Runs node similarity on the projection `analyses-mouse` and stores the result in `Data/csvs/basal_ganglia/regions/analysis_similarity_mouse.csv`.

In [45]:
import pandas as pd

similarity_rows = []
with driver.session() as session:
    res = session.run("""
        CALL gds.nodeSimilarity.stream(
            'analyses-mouse',
            {
                degreeCutoff: 3,
                similarityCutoff: 1.0
            }
        )
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).id as id1, gds.util.asNode(node2).id as id2, similarity
    """)
    for record in res:
        similarity_rows.append([record["id1"], record["id1"], record["similarity"]])


    
pd.DataFrame(similarity_rows, columns = ["id1", "id2", "score"])

Unnamed: 0,id1,id2,score
0,616,616,1.0
1,616,616,1.0
2,616,616,1.0
3,616,616,1.0
4,616,616,1.0
...,...,...,...
2979,410,410,1.0
2980,410,410,1.0
2981,410,410,1.0
2982,410,410,1.0


In [38]:
# Stores the result in a CSV file for the genereal database import
_.to_csv("..\Data/csvs/basal_ganglia/regions/analysis_similarity_mouse.csv")

### Rat

Runs node similarity on the projection `analyses-rat` and stores the result in `Data/csvs/basal_ganglia/regions/analysis_similarity_rat.csv`.

In [70]:
import pandas as pd

similarity_rows = []
with driver.session() as session:
    res = session.run("""
        CALL gds.nodeSimilarity.stream(
            'analyses',
            {
                degreeCutoff: 3,
                similarityCutoff: 1.0,
                topK: 10
            }
        )
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).id as id1, gds.util.asNode(node2).id as id2, similarity
        ORDER BY id1
    """)
    for record in res:
        similarity_rows.append([record["id1"], record["id2"], record["similarity"]])


    
pd.DataFrame(similarity_rows, columns = ["id1", "id2", "score"])

Unnamed: 0,id1,id2,score
0,101,498,1.0
1,101,593,1.0
2,101,676,1.0
3,101,675,1.0
4,101,550,1.0
...,...,...,...
3863,91,291,1.0
3864,91,289,1.0
3865,91,287,1.0
3866,91,78,1.0


In [72]:
# Stores the result in a CSV file for the genereal database import
_.to_csv("..\Data/csvs/basal_ganglia/regions/analysis_similarity.csv")

## Clean-up
We remove the projected graphs, and delete the created relationship `SIMILARITY_ALGORITHM`

In [73]:
with driver.session() as session:
    session.run("call gds.graph.drop('analyses')")
    session.run("call gds.graph.drop('analyses-mouse')")
    session.run("""
        MATCH ()-[r:NODE_SIMILARITY]-()
        DETACH DELETE r
    """)

# Other stuff

In [4]:
  ### MATCH All these for part 2 (looking at method differences)
with driver.session() as session:   
    session.run("""
        MATCH (n:Analysis)-->(:ReporterIncubation)-->(r:Reporter)
        MERGE (n)-[:ANA_DATA {strength: 1}]->(r)
    """)
  
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(:RegionRecord)-->(b:BrainRegion)
        MERGE (n)-[:ANA_DATA {strength: 1}]->(b)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(r:CellularRegion)
        MERGE (n)-[:ANA_DATA {strength: 1}]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-[:ANAESTHETIC]->(r:Solution)
        MERGE (n)-[:ANESTHETIC]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-[:PERFUSION_FIX_MEDIUM]->(r:Solution)
        MERGE (n)-[:PERFUSION]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-->(:Specimen)-->(s:Specie)
        MERGE (n)-[:ANA_DATA]->(s)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(s:Software)
        MERGE (n)-[:ANA_DATA]->(s)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(s:RegionZone)
        MERGE (n)-[:ANA_DATA]->(s)
    """)

Failed to read from defunct connection Address(host='100.26.227.192', port=34124) (Address(host='100.26.227.192', port=34124))


ServiceUnavailable: Failed to read from defunct connection Address(host='100.26.227.192', port=34124) (Address(host='100.26.227.192', port=34124))

In [15]:
## Get "real" analysis similarity
# This is ONLY to see if we find anything interesting.
# The "real" similarities will be set on the website
# the "_all.csv" are based on Reporter, CellTYpe and BrainRegion only
import pandas as pd

df = pd.read_csv("Data/csvs/graph/analyses_similarity_mouse_all.csv")

new_table = []
for index, row in df.iterrows():
    if(pd.isnull(row["id1"])):
        continue
    
    name1 = row["analysis1"].split("_")[:2]
    name2 = row["analysis2"].split("_")[:2]
    if(name1 != name2 and float(row["similarity"] > 0.5)):
        new_table.append([row["id1"], row["id2"], row["analysis1"], row["analysis2"], row["similarity"]])

# remove duplicated
for row1 in new_table:
    for row2 in new_table:
        if(row1[0] == row2[0] and row1[1] == row2[1] and row1[4] == row2[4]):
            new_table.remove(row2)
            
        
new_df =  pd.DataFrame(new_table, columns=["id1", "id2", "analysis1", "analysis2", "similarity"])
new_df.to_csv("Data/csvs/graph/reffined_analyses_similarity_mouse_all.csv")


In [2]:
## Is there somehting in the methods used that make a difference in the results?

cell_ids = []
region_ids = []

with driver.session() as session:    
    res = session.run("""
        Match (b:BrainRegion)
        WHERE size((:Analysis)-[:DATA_TYPE]->(:Quantitation)-[:REGION_RECORD]->(:RegionRecord)-[:PRIMARY_REGION]->(b)) >=20
        return b.id
    
    """)
    for record in res:
        region_ids.append(record["b.id"])
    res = session.run("""
        Match (c:CellType)
        WHERE size((c)<-[:CELL_TYPE_PUTATIVE]-(:Analysis)) >=20
        return c.id
    """)
    for record in res:
        cell_ids.append(record["c.id"])

print(len(cell_ids), len(region_ids))
queries = []
        
for cell_id in cell_ids:
    for region_id in region_ids:
        q = """
        MATCH (c:CellType)<-[:CELL_TYPE_PUTATIVE]-(n:Analysis)-[:DATA_TYPE]->(:Quantitation)-[:REGION_RECORD]->(:RegionRecord)-[:PRIMARY_REGION]->(b:BrainRegion)
        WHERE c.id ="%s" AND b.id = "%s"
        return COUNT(DISTINCT n.id) as analysisCount, c.id, c.name, b.id, b.name
        
        """ % (cell_id, region_id)
        queries.append(q)
        
print(len(queries), queries[0])
results = []
for query in queries:
    with driver.session() as session:  
        print(".", end =" ")
        res = session.run(query)
        for rec in res:
            if(rec["analysisCount"] > 20):
                print(rec["analysisCount"], rec["c.id"], rec["c.name"], rec["b.id"], rec["b.name"])
                results.append((rec["analysisCount"], rec["c.id"], rec["c.name"], rec["b.id"], rec["b.name"]))
            
print(results)



8 11
88 
        MATCH (c:CellType)<-[:CELL_TYPE_PUTATIVE]-(n:Analysis)-[:DATA_TYPE]->(:Quantitation)-[:REGION_RECORD]->(:RegionRecord)-[:PRIMARY_REGION]->(b:BrainRegion)
        WHERE c.id ="8" AND b.id = "1"
        return COUNT(DISTINCT n.id) as analysisCount, c.id, c.name, b.id, b.name
        
        
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 72 14 Tyrosine hydroxylase expressing 10 Pars compacta
. . . . . 25 14 Tyrosine hydroxylase expressing 26 Substantia nigra
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 37 42 Neuron 1 Caudoputamen
. 29 42 Neuron 2 Caudoputamen
. . . . 27 42 Neuron 10 Pars compacta
. . . . . [(72, '14', 'Tyrosine hydroxylase expressing', '10', 'Pars compacta'), (25, '14', 'Tyrosine hydroxylase expressing', '26', 'Substantia nigra'), (37, '42', 'Neuron', '1', 'Caudoputamen'), (29, '42', 'Neuron', '2', 'Caudoputamen'), (27, '42', 'Neuron', '10', 'Pars compacta')]


In [None]:
results = [
    (72, '14', 'Tyrosine hydroxylase expressing', '10', 'Pars compacta'), 
    (25, '14', 'Tyrosine hydroxylase expressing', '26', 'Substantia nigra'), 
    (37, '42', 'Neuron', '1', 'Caudoputamen'), 
    (29, '42', 'Neuron', '2', 'Caudoputamen'), 
    (27, '42', 'Neuron', '10', 'Pars compacta')
]

analyses_groups = []

with driver.session() as session:    
    for res in results:
        _, cellId, _, regId, _ = res
        ## I'M Making these into subgraphs in neo3j directly instead and perform node similarity, as it's only really two..
        analyses_query = """
        MATCH (a:Analysis)-->(r:Reporter)
        MATCH (a)-->(s:Strain)
        MATCH (a)-->(v:VisualizationProtocol)
        MATCH (a)-->(soft:Software)
        MATCH (a)-->()-->(m:Microscope)
        MATCH (a)-[:PERFUSION]->(perf:Solution)
        MATCH (a)-[:ANESTHETIC]->(ane:Solution)
        WHERE (a)-->(:CellType {id: "%s"}) AND (a)-->(:BrainRegion {id: "%s"})
        RETURN a.id, a.name, s.name, r.name
        """ % (cellId, regId)
        neo_res = session.run(query)
        ana = []
        for rec in neo_res:
            ana.push((rec["a.id"], rec[a.name])
        analyses_groups.push(ana)
                     

for group in analyses_groups:
    
    




In [None]:
## Graph for networkx
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv
import os

load_dotenv()

neo4jUser = os.getenv("NEO4J_USER")
neo4jPwd = os.getenv("NEO4J_PASSWORD")
neo4jUrl = os.getenv("NEO4j_BOLT")

driver = GraphDatabase.driver(neo4jUrl,auth=basic_auth(neo4jUser, neo4jPwd))

  ### MATCH All these for part 2 (looking at method differences)
with driver.session() as session:   
    session.run("""
        MATCH (n:Analysis)-->(i:ReporterIncubation)-->(r:Reporter)
        MERGE (n)-[:ANA_DATA:USED {weight: 1, type: i.Order}]->(r)
    """)
  
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(:RegionRecord)-[:PRIMARY_REGION]->(b:BrainRegion)
        MERGE (n)-[:ANA_DATA:OBSERVED_REGION {weight: 1, type: "primary"}]->(b)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(:RegionRecord)-[:SECONDARY_REGION]->(b:BrainRegion)
        MERGE (n)-[:ANA_DATA:OBSERVED_REGION {weight: 1, type: "secondary"}]->(b)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(r:CellularRegion)
        MERGE (n)-[:ANA_DATA:CELLULAR_REGION {weight: 1}]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-[:ANAESTHETIC]->(r:Solution)
        MERGE (n)-[:ANA_DATA:ANESTHETIC]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-[:PERFUSION_FIX_MEDIUM]->(r:Solution)
        MERGE (n)-[:ANA_DATA:PERFUSION]->(r)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:Experiment)-->(:Specimen)-->(s:Specie)
        MERGE (n)-[:ANA_DATA]->(s)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(s:Software)
        MERGE (n)-[:ANA_DATA]->(s)
    """)
    session.run("""
        MATCH (n:Analysis)-->(:DataType)-->(s:RegionZone)
        MERGE (n)-[:ANA_DATA]->(s)
    """)