In [1]:
from neo4j import GraphDatabase 
import pandas as pd
from tqdm import tqdm
import ast

url = "bolt://localhost:7687" 

driver = GraphDatabase.driver(url, auth=("neo4j", "heart"))

### Link Prediction for OS and Ion Channel Proteins

Creating in-memory graph with cypher projection:

@cypher:

    CALL gds.graph.create(
    'OS KG',
        ["MeSH", "Protein", "Document", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

In [2]:
def count_nodes():
    query = "MATCH (n) RETURN COUNT(n)"
    with driver.session() as session:
        info = session.run(query)
    return info

count_nodes()


<neo4j.work.result.Result at 0x7f8271072130>

# Node Similarity for OS and CVD

Creating in-memory graph with cypher projection:

USED IN CLASS:

@cypher:

    CALL gds.graph.create(
    'MeSH-PMID-Protein-Drug',
        ["MeSH", "Document", "Protein", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

UNUSED:

@cypher:

    CALL gds.graph.create(
    'OS KG',
        ["MeSH", "Protein", "Document", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

    CALL gds.graph.create(
    'MESH',
        ["MeSH"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

    CALL gds.graph.create(
    'MeSH-PMID-Protein',
        ["MeSH", "Document", "Protein"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )



In [102]:
class nodesim():
    """Class to run LinkPred"""
    def __init__(self) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))

    def close(self) -> None:
        self.driver.close()

    @classmethod
    def algo(cls, tx) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        CALL gds.nodeSimilarity.stream('MeSH-PMID-Protein-Drug')
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).name AS MeSH_Term1, gds.util.asNode(node1).type AS Type1, similarity, gds.util.asNode(node2).type AS Type2, gds.util.asNode(node2).name AS MeSH_Term2
        ORDER BY similarity DESCENDING, MeSH_Term1, MeSH_Term2
        """)
        result = tx.run(query)
        return result.data()


    def run_algo(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        result = self.driver.session().write_transaction(self.algo)
        result = pd.DataFrame(result)
        return result

In [103]:
#3 min runtime
algo = nodesim()
result = algo.run_algo()
#extract nodes whose similarity is 1
types = ["OS", "CVD"]
CVD_OS = result[result["Type1"].isin(types)]
CVD_OS = CVD_OS[result["Type2"].isin(types)]
#result.dropna()
CVD_OS = CVD_OS.reset_index(drop=True)
CVD_OS

  CVD_OS = CVD_OS[result["Type2"].isin(types)]


Unnamed: 0,MeSH_Term1,Type1,similarity,Type2,MeSH_Term2
0,brugada syndrome,CVD,0.321839,CVD,cardiac conduction system disease
1,cardiac conduction system disease,CVD,0.321839,CVD,brugada syndrome
2,dna-binding proteins,OS,0.251169,OS,transcription factors
3,transcription factors,OS,0.251169,OS,dna-binding proteins
4,bicuspid aortic valve disease,CVD,0.227848,CVD,heart valve diseases
...,...,...,...,...,...
1453,"tachycardia, atrioventricular nodal reentry",CVD,0.000364,CVD,myocardial infarction
1454,protein disulfide reductase (glutathione),OS,0.000323,OS,membrane proteins
1455,"aortic stenosis, subvalvular",CVD,0.000161,OS,membrane proteins
1456,glycogen storage disease type iib,CVD,0.000161,OS,membrane proteins


In [104]:
#filtering out CVD and OS to compare them:
compare = CVD_OS[CVD_OS["Type1"] != CVD_OS["Type2"]].reset_index(drop=True)
compare.head()
compare

Unnamed: 0,MeSH_Term1,Type1,similarity,Type2,MeSH_Term2
0,malondialdehyde,OS,0.030755,CVD,myocardial reperfusion injury
1,myocardial reperfusion injury,CVD,0.030755,OS,malondialdehyde
2,myocardial reperfusion injury,CVD,0.021693,OS,superoxide dismutase
3,isoenzymes,OS,0.021073,CVD,myocardial infarction
4,anterior wall myocardial infarction,CVD,0.016667,OS,lipid peroxides
5,lipid peroxides,OS,0.016667,CVD,anterior wall myocardial infarction
6,"cardiomyopathy, hypertrophic",CVD,0.013499,OS,carrier proteins
7,diabetic cardiomyopathies,CVD,0.01087,OS,protein carbonylation
8,protein carbonylation,OS,0.01087,CVD,diabetic cardiomyopathies
9,deoxyguanosine,OS,0.009804,CVD,ventricular dysfunction


In [7]:
#compare.to_csv("OS_CVD_Nodesim.csv")

# Link Prediction between OS and CVD

@cypher:

    CALL gds.graph.create(
    'MeSH-PMID-Protein-Drug',
        ["MeSH", "Document", "Protein", "Drug"],
        {
            MENTIONS: {orientation: 'UNDIRECTED'},
            TARGET: {orientation: 'UNDIRECTED'},
            ASSIGNS: {orientation: 'UNDIRECTED'}
        }
    )

In [105]:
class LinkPred():
    """Class to run LinkPred"""
    def __init__(self, df) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))
        self.df = df

    def close(self) -> None:
        self.driver.close()

    @classmethod
    def LinkPred(cls, tx, name1, name2) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        MATCH (m:MeSH{name: $name1})
        MATCH (n:MeSH{name: $name2})
        RETURN gds.alpha.linkprediction.commonNeighbors(n, m) AS LinkPred_Score
        """)
        result = tx.run(query, name1 = name1, name2 = name2)
        return result.data()

    def run_LinkPred(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        # create empty list
        values = []
        for index, row in self.df.iterrows():
            values.append(self.driver.session().write_transaction(self.LinkPred, row['MeSH_Term1'], row['MeSH_Term2']))
        self.df["LinkPred Score"] = values
        return self.df

In [106]:
#3 min runtime
pred = LinkPred(compare)
df = pred.run_LinkPred()

#reorder columns
df = df.drop(axis=1, columns='similarity')[['MeSH_Term1', 'Type1', 'LinkPred Score', 'Type2', 'MeSH_Term2']]
for index, row in df.iterrows():
    row['LinkPred Score'] = row['LinkPred Score'][0]['LinkPred_Score']
df = df.sort_values(by = 'LinkPred Score', ascending= False)
df.head()

Unnamed: 0,MeSH_Term1,Type1,LinkPred Score,Type2,MeSH_Term2
3,isoenzymes,OS,117.0,CVD,myocardial infarction
6,"cardiomyopathy, hypertrophic",CVD,37.0,OS,carrier proteins
0,malondialdehyde,OS,33.0,CVD,myocardial reperfusion injury
1,myocardial reperfusion injury,CVD,33.0,OS,malondialdehyde
2,myocardial reperfusion injury,CVD,31.0,OS,superoxide dismutase


In [107]:
#df.to_csv("OS_CVD_Linkpred.csv")

# Using Pagerank as Ion Channel Proteins in Link Prediction for MeSH Terms

In [108]:
from neo4j import GraphDatabase

class pagerank:
    def __init__(self) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))

    def close(self) -> None:
        self.driver.close()

    #estimation for memory
    @classmethod
    def memory_estimation(cls, tx) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @returns the data for the memory
        """
        query = ("Call gds.pageRank.write.estimate('MeSH-PMID-Protein-Drug', {writeProperty: 'pageRank', maxIterations: 30, dampingFactor: 0.85}) YIELD nodeCount, relationshipCount, bytesMin, bytesMax, requiredMemory")
        result = tx.run(query)
        return result.single()

    @classmethod
    def pagerank(cls, tx) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the pagerank
        """
        query = ("Call gds.pageRank.stream('MeSH-PMID-Protein-Drug') YIELD nodeId, score RETURN gds.util.asNode(nodeId).id AS Name, score ORDER BY score DESC, Name ASC")
        result = tx.run(query)
        return result.data()


    def estimate_pagerank(self) -> any:
        """
        @param self
        @return result is the result of the memory estimation
        """
        result = self.driver.session().write_transaction(self.memory_estimation)
        return result


    def run_pagerank(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        result = self.driver.session().write_transaction(self.pagerank)
        result = pd.DataFrame(result)
        return result

In [109]:
import pandas as pd


driver = pagerank()
result = driver.estimate_pagerank()
print('Pagerank Info: ', result)
print('--------------------------------')

pagerank = driver.run_pagerank()

df_pr = pd.DataFrame(pagerank)
df_pr = df_pr.rename(columns={'score': 'PageRank Score'})
df_pr = df_pr.rename(columns={'Name': 'UniProt ID'})
df_pr.head(10)

Pagerank Info:  <Record nodeCount=95529 relationshipCount=316850 bytesMin=2305424 bytesMax=2305424 requiredMemory='2251 KiB'>
--------------------------------


Unnamed: 0,UniProt ID,PageRank Score
0,P05067,4771.292775
1,P14780,3611.396362
2,O00555,2614.134139
3,P41180,1953.900199
4,P29475,1858.685063
5,P11532,1723.231379
6,D12.776.543,1484.078699
7,P17302,1466.597295
8,Q03135,1338.287091
9,P14416,990.391326


In [118]:
class LinkPred():
    """Class to run LinkPred"""
    def __init__(self, mesh, protein) -> None:
        self.driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "heart"))
        self.Mesh_df = mesh
        self.Protein_df = protein


    def close(self) -> None:
        self.driver.close()

    @classmethod
    def LinkPred(cls, tx, name1, name2) -> any:
        """
        @param cls is the class
        @param tx is the transaction
        @return result.data() is the data of the cluster
        """
        query = ("""
        MATCH (m:MeSH {name: $name1})
        MATCH (n:Protein {id: $name2})
        RETURN gds.alpha.linkprediction.adamicAdar(n, m) AS LinkPred_Score
        """)
        result = tx.run(query, name1 = name1, name2 = name2)
        return result.data()

    def run_LinkPred(self) -> any:
        """
        @param self
        @return result is the dataframe from the pagerank
        """
        # create empty list
        values = []
        mesh = []
        protein = []
        type = []
        for i in tqdm(range(len(self.Mesh_df["MeSH_Term1"])), desc= "Iterating through df: "):
            for j in range(len(self.Protein_df["UniProt ID"][:20])):
                values.append(self.driver.session().write_transaction(self.LinkPred, self.Mesh_df["MeSH_Term1"][i], self.Protein_df["UniProt ID"][:20][j]))
                mesh.append(self.Mesh_df["MeSH_Term1"][i])
                protein.append(self.Protein_df["UniProt ID"][:20][j])
                type.append(self.Mesh_df["Type1"][i])
        hello = pd.DataFrame(columns=["MeSH Terms", "Type", "LinkPred Score", "UniProtID"])
        hello["MeSH Terms"] = mesh
        hello["Type"] = type
        hello["LinkPred Score"] = values
        hello["UniProtID"] = protein
        return hello

In [144]:
protein_mesh = LinkPred(df, df_pr)
result = protein_mesh.run_LinkPred()

Iterating through df: 100%|██████████| 60/60 [00:05<00:00, 10.84it/s]


In [145]:
result = result[result['LinkPred Score'].map(lambda d: len(d)) > 0]
result = result.dropna().reset_index(drop=True)
result

Unnamed: 0,MeSH Terms,Type,LinkPred Score,UniProtID
0,malondialdehyde,OS,[{'LinkPred_Score': 31.38912169269522}],P05067
1,malondialdehyde,OS,[{'LinkPred_Score': 91.74229888375838}],P14780
2,malondialdehyde,OS,[{'LinkPred_Score': 22.155828796983325}],O00555
3,malondialdehyde,OS,[{'LinkPred_Score': 26.711638554584805}],P41180
4,malondialdehyde,OS,[{'LinkPred_Score': 39.11175671966376}],P29475
...,...,...,...,...
895,heterotaxy syndrome,CVD,[{'LinkPred_Score': 0.0}],P19429
896,heterotaxy syndrome,CVD,[{'LinkPred_Score': 0.9102392266268373}],A0A654IBU3
897,heterotaxy syndrome,CVD,[{'LinkPred_Score': 0.0}],P49768
898,heterotaxy syndrome,CVD,[{'LinkPred_Score': 0.0}],P11021


In [146]:
for i in range(len(result["LinkPred Score"])):
    result["LinkPred Score"][i] = result["LinkPred Score"][i][0]["LinkPred_Score"]

result = result.sort_values(by = 'LinkPred Score', ascending= False)
result = result.dropna().reset_index(drop=True)
result = result[result["LinkPred Score"] > 0]

result = result.drop_duplicates(subset="UniProtID")

In [147]:
result

Unnamed: 0,MeSH Terms,Type,LinkPred Score,UniProtID
0,isoenzymes,OS,540.035157,P29475
1,myocardial reperfusion injury,CVD,282.899618,P19429
3,isoenzymes,OS,273.587538,P14780
5,isoenzymes,OS,170.131378,O00555
6,isoenzymes,OS,166.966789,P05067
9,myocardial reperfusion injury,CVD,88.61073,P17302
13,isoenzymes,OS,66.392272,P41180
14,myocardial reperfusion injury,CVD,62.152471,A0A654IBU3
30,isoenzymes,OS,39.333195,Q03135
39,isoenzymes,OS,30.657327,P11532


In [148]:
#result.to_csv("OS CVD and Protein LinkPred") 