In [67]:
from neo4j import GraphDatabase 
import pandas as pd
from tqdm import tqdm
import ast

url = "bolt://localhost:7687" 

driver = GraphDatabase.driver(url, auth=("neo4j", "heart"))

# Graph Algorithms

Creating in-memory graph with cypher projection:

@cypher:

    CALL gds.graph.create(
    'OS KG',
        ["MeSH", "Protein", "Document", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

In [3]:
def count_nodes():
    query = "MATCH (n) RETURN COUNT(n)"
    with driver.session() as session:
        info = session.run(query)
    return info

count_nodes()


<neo4j.work.result.Result at 0x7fe2b0c07610>

# Node Similarity for OS and CVD

Creating in-memory graph with cypher projection:

USED IN CLASS:

@cypher:

    CALL gds.graph.create(
    'MeSH-PMID-Protein-Drug',
        ["MeSH", "Document", "Protein", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

UNUSED:

@cypher:

    CALL gds.graph.create(
    'OS KG',
        ["MeSH", "Protein", "Document", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

    CALL gds.graph.create(
    'MESH',
        ["MeSH"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

    CALL gds.graph.create(
    'MeSH-PMID-Protein',
        ["MeSH", "Document", "Protein"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )



In [102]:
class nodesim():
    """Class to run LinkPred"""
    def __init__(self) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))

    def close(self) -> None:
        self.driver.close()

    @classmethod
    def algo(cls, tx) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        CALL gds.nodeSimilarity.stream('MeSH-PMID-Protein-Drug')
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).name AS MeSH_Term1, gds.util.asNode(node1).type AS Type1, similarity, gds.util.asNode(node2).type AS Type2, gds.util.asNode(node2).name AS MeSH_Term2
        ORDER BY similarity DESCENDING, MeSH_Term1, MeSH_Term2
        """)
        result = tx.run(query)
        return result.data()


    def run_algo(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        result = self.driver.session().write_transaction(self.algo)
        result = pd.DataFrame(result)
        return result

In [103]:
#3 min runtime
algo = nodesim()
result = algo.run_algo()
#extract nodes whose similarity is 1
types = ["OS", "CVD"]
CVD_OS = result[result["Type1"].isin(types)]
CVD_OS = CVD_OS[result["Type2"].isin(types)]
#result.dropna()
CVD_OS = CVD_OS.reset_index(drop=True)
CVD_OS

  CVD_OS = CVD_OS[result["Type2"].isin(types)]


Unnamed: 0,MeSH_Term1,Type1,similarity,Type2,MeSH_Term2
0,brugada syndrome,CVD,0.321839,CVD,cardiac conduction system disease
1,cardiac conduction system disease,CVD,0.321839,CVD,brugada syndrome
2,dna-binding proteins,OS,0.251169,OS,transcription factors
3,transcription factors,OS,0.251169,OS,dna-binding proteins
4,bicuspid aortic valve disease,CVD,0.227848,CVD,heart valve diseases
...,...,...,...,...,...
1453,"tachycardia, atrioventricular nodal reentry",CVD,0.000364,CVD,myocardial infarction
1454,protein disulfide reductase (glutathione),OS,0.000323,OS,membrane proteins
1455,"aortic stenosis, subvalvular",CVD,0.000161,OS,membrane proteins
1456,glycogen storage disease type iib,CVD,0.000161,OS,membrane proteins


In [104]:
#filtering out CVD and OS to compare them:
compare = CVD_OS[CVD_OS["Type1"] != CVD_OS["Type2"]].reset_index(drop=True)
compare.head()
compare

Unnamed: 0,MeSH_Term1,Type1,similarity,Type2,MeSH_Term2
0,malondialdehyde,OS,0.030755,CVD,myocardial reperfusion injury
1,myocardial reperfusion injury,CVD,0.030755,OS,malondialdehyde
2,myocardial reperfusion injury,CVD,0.021693,OS,superoxide dismutase
3,isoenzymes,OS,0.021073,CVD,myocardial infarction
4,anterior wall myocardial infarction,CVD,0.016667,OS,lipid peroxides
5,lipid peroxides,OS,0.016667,CVD,anterior wall myocardial infarction
6,"cardiomyopathy, hypertrophic",CVD,0.013499,OS,carrier proteins
7,diabetic cardiomyopathies,CVD,0.01087,OS,protein carbonylation
8,protein carbonylation,OS,0.01087,CVD,diabetic cardiomyopathies
9,deoxyguanosine,OS,0.009804,CVD,ventricular dysfunction


In [7]:
#compare.to_csv("OS_CVD_Nodesim.csv")

# Link Prediction between OS and CVD

@cypher:

    CALL gds.graph.create(
    'stuff' ,
        ["MeSH", "Document", "Protein", "Drug", "Pathway"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}, 
            CANDIDATE: {orientation: 'UNDIRECTED'}
        }
    )

In [84]:
#GET A LIST OF ALL OS NODES
type = "OS"
query = "MATCH (m:MeSH{type: '" + type + "'}) RETURN m.name as Name, m.event as Event"
Result = []
with driver.session() as session:
    info = session.run(query)
    for item in info:
        Result.append({"OS Name":item.values()[0],
                      "OS Event":item.values()[1]})

OS = pd.DataFrame(Result)
OS

Unnamed: 0,OS Name,OS Event
0,superoxides,IOS
1,hydrogen peroxide,IOS
2,hydroxyl radical,IOS
3,singlet oxygen,IOS
4,peroxides,IOS
...,...,...
73,zeaxanthins,ROS
74,lycopene,ROS
75,protein carbonylation,OOS
76,hydroxylation,OOS


In [83]:
#GET A LIST OF ALL CVD NODES
type = "CVD"
query = "MATCH (m:MeSH{type: '" + type + "'}) RETURN m.name as Name, m.category as Event"
Result = []
with driver.session() as session:
    info = session.run(query)
    for item in info:
        Result.append({"CVD Name":item.values()[0],
                      "CVD Category":item.values()[1]})

CVD = pd.DataFrame(Result)
CVD

Unnamed: 0,CVD Name,CVD Category
0,cardiomyopathies,CM
1,arrhythmogenic right ventricular dysplasia,CHD
2,"cardiomyopathy, alcoholic",CM
3,"cardiomyopathy, dilated",OHD
4,"cardiomyopathy, hypertrophic",VD
...,...,...
171,cardiac papillary fibroelastoma,OHD
172,carney complex,OHD
173,pericarditis,OHD
174,"pericarditis, constrictive",OHD


In [49]:
class LinkPred():
    """Class to run LinkPred"""
    def __init__(self) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))

    def close(self) -> None:
        self.driver.close()

    @classmethod
    def LinkPred(cls, tx, name1, name2) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        MATCH (m:MeSH{name: $name1})
        MATCH (n:MeSH{name: $name2})
        RETURN gds.alpha.linkprediction.commonNeighbors(n, m) AS LinkPred_Score
        """)
        result = tx.run(query, name1 = name1, name2 = name2)
        return result.data()

    def run_LinkPred(self, df1 = OS, df2 = CVD) -> any:
        """
        @param self
        @param df1 is first df comparing
        @param df2 is second df comparing
        @return result is the dataframe from the pagerank
        """
        # create empty list
        df = pd.DataFrame()
        linkpred = []
        os = []
        cvd = []
        os_event = []
        cvd_cat = []
        for i in tqdm(range(len(df1['OS Name'])), desc= "Iterating: "):
            for j in range(len(df2["CVD Name"])):
                linkpred.append(self.driver.session().write_transaction(self.LinkPred, df1['OS Name'][i], df2['CVD Name'][j]))
                os.append(df1['OS Name'][i])
                cvd.append(df2['CVD Name'][j])
                os_event.append(df1["OS Event"][i])
                cvd_cat.append(df2["CVD Category"][j])                
        df["CVD Name"] = cvd
        df["CVD Category"] = cvd_cat
        df["LinkPred Score"] = linkpred
        df["OS Event"] = os_event
        df["OS Name"] = os
        return df

In [40]:
#3 min runtime
pred = LinkPred()
df = pred.run_LinkPred(OS, CVD)

Iterating: 100%|██████████| 78/78 [00:46<00:00,  1.66it/s]


In [41]:
for index, row in df.iterrows():
    row['LinkPred Score'] = row['LinkPred Score'][0]['LinkPred_Score']
df = df.sort_values(by = 'LinkPred Score', ascending= False)
df = df.reset_index(drop = True)
df = df[df["LinkPred Score"] != 0]
df

Unnamed: 0,CVD Name,CVD Category,LinkPred Score,OS Event,OS Name
0,myocardial infarction,IHD,117.0,ROS,isoenzymes
1,myocardial ischemia,IHD,52.0,ROS,isoenzymes
2,heart failure,CM,48.0,IOS,nitric oxide
3,myocardial infarction,IHD,41.0,IOS,nitric oxide
4,myocardial reperfusion injury,IHD,39.0,IOS,nitric oxide
...,...,...,...,...,...
592,cardiomegaly,OHD,1.0,IOS,hydrogen peroxide
593,long qt syndrome,CCS,1.0,IOS,hydrogen peroxide
594,"ventricular dysfunction, left",OHD,1.0,IOS,hydroxyl radical
595,cardiomegaly,OHD,1.0,IOS,superoxides


In [45]:
#df.to_csv("hello.csv", index = False)

## Link Prediction with Ion Channel Proteins and MeSH Terms

In [77]:
#GET A LIST OF ALL Proteins

query = "MATCH (m:Protein) RETURN m.name as Name, m.id as id"
Result = []
with driver.session() as session:
    info = session.run(query)
    for item in info:
        Result.append({"Protein Name":item.values()[0],
                      "UniProt ID":item.values()[1]})

protein = pd.DataFrame(Result)
protein.reset_index(drop=True)

Unnamed: 0,Protein Name,UniProt ID
0,Gap junction alpha-1 protein,P17302
1,Voltage-dependent L-type calcium channel subun...,Q13936
2,Voltage-dependent T-type calcium channel subun...,O95180
3,Kv channel-interacting protein 2,Q9NS61
4,Transient receptor potential cation channel su...,Q8TD43
...,...,...
426,,Q16515
427,,P63252
428,,Q9UF02
429,,Q9H244


In [102]:
#USING THE MESH QUERIES FROM THE OTHER LINKPRED TABLE

class LinkPred():
    """Class to run LinkPred"""
    def __init__(self) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))


    def close(self) -> None:
        self.driver.close()

    @classmethod
    def LinkPred(cls, tx, name1, name2) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        MATCH (m:MeSH {name: $name1})
        MATCH (n:Protein {id: $name2})
        RETURN gds.alpha.linkprediction.adamicAdar(n, m) AS LinkPred_Score
        """)
        result = tx.run(query, name1 = name1, name2 = name2)
        return result.data()

    def run_LinkPred(self, os, protein) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        # create empty list
        protname = []
        id = []
        linkpred = []
        os_list = []
        os_event = []
        for i in tqdm(range(len(os["OS Name"])), desc= "Iterating through df: "):
            for j in range(len(protein["UniProt ID"])):
                linkpred.append(self.driver.session().write_transaction(self.LinkPred, os["OS Name"][i], protein["UniProt ID"][j]))
                os_list.append(os["OS Name"][i])
                id.append(protein["UniProt ID"][j])
                protname.append(protein["Protein Name"][j])
                os_event.append(os["OS Event"][i])
        df = pd.DataFrame()
        df["Protein Name"] = protname
        df["UniProt ID"] = id
        df["LinkPred Score"] = linkpred
        df["OS Name"] = os_list
        df["OS Event"] = os_event
        return df

In [103]:
protein_mesh = LinkPred()
result = protein_mesh.run_LinkPred(OS, protein)
result

Iterating through df: 100%|██████████| 78/78 [02:34<00:00,  1.97s/it]


Unnamed: 0,Protein Name,UniProt ID,LinkPred Score,OS Name,OS Event
0,Gap junction alpha-1 protein,P17302,[{'LinkPred_Score': 4.316951657079506}],superoxides,IOS
1,Voltage-dependent L-type calcium channel subun...,Q13936,[{'LinkPred_Score': 0.0}],superoxides,IOS
2,Voltage-dependent T-type calcium channel subun...,O95180,[{'LinkPred_Score': 0.0}],superoxides,IOS
3,Kv channel-interacting protein 2,Q9NS61,[{'LinkPred_Score': 0.7213475204444817}],superoxides,IOS
4,Transient receptor potential cation channel su...,Q8TD43,[{'LinkPred_Score': 0.0}],superoxides,IOS
...,...,...,...,...,...
33613,,Q16515,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS
33614,,P63252,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS
33615,,Q9UF02,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS
33616,,Q9H244,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS


In [97]:
result = result[result['LinkPred Score'].map(lambda d: len(d)) > 0]
result = result.dropna().reset_index(drop=True)
result

Unnamed: 0,Protein Name,UniProt ID,LinkPred Score,OS Name,OS Event
0,Gap junction alpha-1 protein,P17302,[{'LinkPred_Score': 5.0}],superoxides,IOS
1,Voltage-dependent L-type calcium channel subun...,Q13936,[{'LinkPred_Score': 0.0}],superoxides,IOS
2,Voltage-dependent T-type calcium channel subun...,O95180,[{'LinkPred_Score': 0.0}],superoxides,IOS
3,Kv channel-interacting protein 2,Q9NS61,[{'LinkPred_Score': 1.0}],superoxides,IOS
4,Transient receptor potential cation channel su...,Q8TD43,[{'LinkPred_Score': 0.0}],superoxides,IOS
...,...,...,...,...,...
10837,Matrix metalloproteinase-9,P14780,[{'LinkPred_Score': 13.0}],deoxyguanosine,OOS
10838,Natural resistance-associated macrophage prote...,P49281,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS
10839,Leucine-rich repeat serine/threonine-protein k...,Q5S007,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS
10840,Calcium uniporter protein,S4R468,[{'LinkPred_Score': 0.0}],deoxyguanosine,OOS


In [98]:
for i in range(len(result["LinkPred Score"])):
    result["LinkPred Score"][i] = result["LinkPred Score"][i][0]["LinkPred_Score"]

result = result.sort_values(by = 'LinkPred Score', ascending= False)
result = result.dropna().reset_index(drop=True)
result = result[result["LinkPred Score"] > 0]
result

Unnamed: 0,Protein Name,UniProt ID,LinkPred Score,OS Name,OS Event
0,"Nitric oxide synthase, brain",P29475,2592.0,nitric oxide,IOS
1,Amyloid-beta precursor protein,P05067,1364.0,membrane proteins,ROS
2,Protein kinase C alpha type,P17252,1318.0,isoenzymes,ROS
3,Presenilin-1,P49768,985.0,membrane proteins,ROS
4,Dystrophin,P11532,670.0,membrane proteins,ROS
...,...,...,...,...,...
1930,"Nitric oxide synthase, brain",P29475,1.0,peroxiredoxin vi,ROS
1931,Protein kinase C alpha type,P17252,1.0,peroxynitrous acid,IOS
1932,Protein kinase C alpha type,P17252,1.0,protein carbonylation,OOS
1933,Dystrophin,P11532,1.0,protein carbonylation,OOS


In [99]:
result = result.dropna()
result

Unnamed: 0,Protein Name,UniProt ID,LinkPred Score,OS Name,OS Event
0,"Nitric oxide synthase, brain",P29475,2592.0,nitric oxide,IOS
1,Amyloid-beta precursor protein,P05067,1364.0,membrane proteins,ROS
2,Protein kinase C alpha type,P17252,1318.0,isoenzymes,ROS
3,Presenilin-1,P49768,985.0,membrane proteins,ROS
4,Dystrophin,P11532,670.0,membrane proteins,ROS
...,...,...,...,...,...
1930,"Nitric oxide synthase, brain",P29475,1.0,peroxiredoxin vi,ROS
1931,Protein kinase C alpha type,P17252,1.0,peroxynitrous acid,IOS
1932,Protein kinase C alpha type,P17252,1.0,protein carbonylation,OOS
1933,Dystrophin,P11532,1.0,protein carbonylation,OOS


In [101]:
result.to_csv("[REVISED]OS_Protein_LinkPred_commonNeighbors.csv", index = False) 