In [2]:
import ast

import pandas as pd
import requests


In [8]:
# Settings for the SPOKE REST API. BASE_URI is the host every endpoint path is
# appended to; each cutoff_* entry is forwarded unchanged as a query parameter
# of the neighborhood request.
config_data = {
    "BASE_URI": "https://spoke.rbvi.ucsf.edu",
    "cutoff_Compound_max_phase": 3,
    "cutoff_Protein_source": ["SwissProt"],
    "cutoff_DaG_diseases_sources": ["knowledge", "experiments"],
    "cutoff_DaG_textmining": 3,
    "cutoff_CtD_phase": 3,
    "cutoff_PiP_confidence": 0.7,
    "cutoff_ACTeG_level": ["Low", "Medium", "High"],
}


In [9]:
def get_spoke_api_resp(base_uri, end_point, params=None):
    """Issue a GET request against an API endpoint and return the response.

    Args:
        base_uri: Scheme and host of the API; concatenated with ``end_point`` as-is.
        end_point: Path appended directly to ``base_uri``.
        params: Optional query parameters; forwarded to ``requests.get`` only
            when truthy.

    Returns:
        Whatever ``requests.get`` returns for the composed URI.
    """
    query_kwargs = {"params": params} if params else {}
    return requests.get(base_uri + end_point, **query_kwargs)

    
def _node_label(node_data):
    """Return the display text of a node: ``description`` for Protein nodes, ``name`` otherwise.

    Falls back to the ``identifier`` property when the preferred one is absent.
    """
    properties = node_data["properties"]
    preferred_key = "description" if node_data["neo4j_type"] == "Protein" else "name"
    try:
        return properties[preferred_key]
    except KeyError:
        return properties["identifier"]


def _edge_provenance(properties):
    """Derive a provenance string for an edge from its property mapping.

    Tried in order: ``sources`` (joined), ``source`` (joined when it is a
    list), ``preprint_list`` then ``pmid_list`` (both parsed with
    ``ast.literal_eval``). Falls back to ``"SPOKE-KG"`` when none is usable.
    """
    try:
        return ", ".join(properties["sources"])
    except (KeyError, TypeError):
        pass
    try:
        source = properties["source"]
        return ", ".join(source) if isinstance(source, list) else source
    except (KeyError, TypeError):
        pass
    try:
        preprints = ast.literal_eval(properties["preprint_list"])
        if len(preprints) > 0:
            return ", ".join(preprints)
        # Built as a list: a lazy map() object has no len() on Python 3, which
        # made the PMID and ISB outcomes below unreachable.
        pmids = ["pubmedId:" + str(pmid) for pmid in ast.literal_eval(properties["pmid_list"])]
        if pmids:
            return ", ".join(pmids)
        return "Based on data from Institute For Systems Biology (ISB)"
    except (KeyError, TypeError, ValueError, SyntaxError):
        return "SPOKE-KG"


def _label_endpoint(edges_df, nodes_df, column):
    """Inner-join edges to nodes on ``column`` and replace its ids with "<node_type> <node_name>"."""
    joined = pd.merge(edges_df, nodes_df, left_on=column, right_on="node_id")
    joined[column] = joined["node_type"] + " " + joined["node_name"]
    return joined.drop(["node_id", "node_type", "node_name"], axis=1)


def get_context_using_spoke_api(node_value):
    """Build a textual context for a Disease node from its SPOKE neighborhood.

    Fetches the available node/edge types, requests the neighborhood of the
    Disease whose ``name`` equals ``node_value`` (using the cutoffs in
    ``config_data``), and renders every returned edge as one sentence of the
    form "<source> <predicate> <target> and Provenance of this association is
    <provenance>. ".

    Args:
        node_value: Disease name inserted into the neighborhood endpoint path.

    Returns:
        str: The edge sentences joined by single spaces, followed by a sentence
        giving the source and identifier of the first returned item.

    Raises:
        IndexError: If the neighborhood response is an empty list.
    """
    base_uri = config_data['BASE_URI']
    spoke_types = get_spoke_api_resp(base_uri, "/api/v1/types").json()
    excluded_node_types = ("DatabaseTimestamp", "Version")
    api_params = {
        'node_filters': [t for t in spoke_types["nodes"] if t not in excluded_node_types],
        'edge_filters': list(spoke_types["edges"]),
    }
    cutoff_keys = (
        'cutoff_Compound_max_phase',
        'cutoff_Protein_source',
        'cutoff_DaG_diseases_sources',
        'cutoff_DaG_textmining',
        'cutoff_CtD_phase',
        'cutoff_PiP_confidence',
        'cutoff_ACTeG_level',
    )
    api_params.update((key, config_data[key]) for key in cutoff_keys)

    nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format("Disease", "name", node_value)
    node_context = get_spoke_api_resp(base_uri, nbr_end_point, params=api_params).json()

    # An underscore in neo4j_type marks the item as an edge; everything else is a node.
    nbr_nodes = []
    nbr_edges = []
    for item in node_context:
        data = item["data"]
        if "_" in data["neo4j_type"]:
            provenance = _edge_provenance(data.get("properties"))
            nbr_edges.append((data["source"], data["neo4j_type"], data["target"], provenance))
        else:
            nbr_nodes.append((data["neo4j_type"], data["id"], _node_label(data)))

    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
    nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance"])

    # Inner joins: edges whose endpoint is not among the returned nodes are dropped.
    labeled = _label_endpoint(nbr_edges_df, nbr_nodes_df, "source")
    labeled = _label_endpoint(labeled, nbr_nodes_df, "target")

    # The predicate is the part of the edge type before the first underscore.
    predicate = labeled["edge_type"].apply(lambda edge_type: edge_type.split("_")[0]).str.lower()
    sentences = (
        labeled["source"] + " " + predicate + " " + labeled["target"]
        + " and Provenance of this association is " + labeled["provenance"] + ". "
    )
    context = sentences.str.cat(sep=' ')

    # The first returned item is used as the queried node itself.
    queried_props = node_context[0]["data"]["properties"]
    context += node_value + " has a " + queried_props["source"] + " identifier of " + queried_props["identifier"] + " and Provenance of this association is " + queried_props["source"] + "."
    return context


In [161]:
%%time

# Exploratory copy of the request logic in get_context_using_spoke_api, run at
# notebook scope so the raw neighborhood (`node_context`) can be inspected by
# the cells below. Unlike the function, it also sends a 'depth' parameter.
node_value = 'giant cell glioblastoma'

# Fetch the node and edge types known to the API.
type_end_point = "/api/v1/types"
result = get_spoke_api_resp(config_data['BASE_URI'], type_end_point)
data_spoke_types = result.json()
node_types = list(data_spoke_types["nodes"].keys())
edge_types = list(data_spoke_types["edges"].keys())
# These two node types are excluded from the node filter.
node_types_to_remove = ["DatabaseTimestamp", "Version"]
filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]
# Query parameters: type filters plus the cutoffs from config_data.
api_params = {
    'node_filters' : filtered_node_types,
    'edge_filters': edge_types,
    'cutoff_Compound_max_phase': config_data['cutoff_Compound_max_phase'],
    'cutoff_Protein_source': config_data['cutoff_Protein_source'],
    'cutoff_DaG_diseases_sources': config_data['cutoff_DaG_diseases_sources'],
    'cutoff_DaG_textmining': config_data['cutoff_DaG_textmining'],
    'cutoff_CtD_phase': config_data['cutoff_CtD_phase'],
    'cutoff_PiP_confidence': config_data['cutoff_PiP_confidence'],
    'cutoff_ACTeG_level': config_data['cutoff_ACTeG_level'],
    # NOTE(review): presumably the number of hops to traverse from the node; confirm against the API docs.
    'depth' : 1
}
# Look the node up as a Disease by its name.
node_type = "Disease"
attribute = "name"
nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format(node_type, attribute, node_value)
result = get_spoke_api_resp(config_data['BASE_URI'], nbr_end_point, params=api_params)
node_context = result.json()
# Number of items in the parsed neighborhood response.
len(node_context)


CPU times: user 68.7 ms, sys: 6.93 ms, total: 75.6 ms
Wall time: 286 ms


124

In [143]:
# Turn the raw neighborhood response (`node_context`) into node/edge tables and
# one sentence per edge. Set edge_evidence to True to append each edge's raw
# property dict to its sentence.
edge_evidence = False

nbr_nodes = []
nbr_edges = []
for item in node_context:
    item_data = item["data"]
    item_props = item_data.get("properties")
    if "_" not in item_data["neo4j_type"]:
        # Node: Proteins are labelled by description, all others by name,
        # falling back to the identifier when that property is missing.
        label_key = "description" if item_data["neo4j_type"] == "Protein" else "name"
        try:
            node_label = item_props[label_key]
        except KeyError:
            node_label = item_props["identifier"]
        nbr_nodes.append((item_data["neo4j_type"], item_data["id"], node_label))
    else:
        # Edge: an underscore in neo4j_type marks a relationship type.
        try:
            provenance = ", ".join(item_props["sources"])
        except (KeyError, TypeError):
            try:
                provenance = item_props["source"]
                if isinstance(provenance, list):
                    provenance = ", ".join(provenance)
            except (KeyError, TypeError):
                try:
                    preprint_list = ast.literal_eval(item_props["preprint_list"])
                    if len(preprint_list) > 0:
                        provenance = ", ".join(preprint_list)
                    else:
                        # A list, not a lazy map(): len() on a map object
                        # raises TypeError on Python 3.
                        pmid_list = ["pubmedId:" + str(pmid) for pmid in ast.literal_eval(item_props["pmid_list"])]
                        if len(pmid_list) > 0:
                            provenance = ", ".join(pmid_list)
                        else:
                            provenance = "Based on data from Institute For Systems Biology (ISB)"
                except (KeyError, TypeError, ValueError, SyntaxError):
                    provenance = "SPOKE-KG"
        evidence = item_props
        nbr_edges.append((item_data["source"], item_data["neo4j_type"], item_data["target"], provenance, evidence))

# Built once, after the loop has collected every node and edge.
nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance", "evidence"])

# Replace the numeric source id with "<node_type> <node_name>".
merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on="source", right_on="node_id").drop("node_id", axis=1)
merge_1["node_name"] = merge_1.node_type + " " + merge_1.node_name
merge_1 = merge_1.drop(["source", "node_type"], axis=1).rename(columns={"node_name": "source"})

# Same replacement for the target id.
merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on="target", right_on="node_id").drop("node_id", axis=1)
merge_2["node_name"] = merge_2.node_type + " " + merge_2.node_name
merge_2 = merge_2.drop(["target", "node_type"], axis=1).rename(columns={"node_name": "target"})
merge_2 = merge_2[["source", "edge_type", "target", "provenance", "evidence"]].copy()

# The predicate is the part of the edge type before the first underscore.
merge_2["predicate"] = merge_2.edge_type.apply(lambda x: x.split("_")[0])
sentence_stem = merge_2.source + " " + merge_2.predicate.str.lower() + " " + merge_2.target + " and Provenance of this association is " + merge_2.provenance
if edge_evidence:
    merge_2["context"] = sentence_stem + " and attributes associated with this association is in the following JSON format:\n " + merge_2.evidence.astype('str') + "\n\n"
else:
    merge_2["context"] = sentence_stem + ". "
context = merge_2.context.str.cat(sep=' ')
# The first returned item is used as the queried node itself.
context += node_value + " has a " + node_context[0]["data"]["properties"]["source"] + " identifier of " + node_context[0]["data"]["properties"]["identifier"] + " and Provenance of this is from " + node_context[0]["data"]["properties"]["source"] + "."



In [154]:
# Sentences to look up in the per-edge context column (note the trailing space,
# which the generated sentences also carry).
high_similarity_context = [
    'Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS. ',
    'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES. ',
    'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS. ',
]

# Only the last expression of a cell is rendered, so this filtered frame is
# computed but not shown.
merge_2.loc[merge_2["context"].isin(high_similarity_context)]
merge_2["context"].iloc[0]


'Disease secondary progressive multiple sclerosis isa Disease multiple sclerosis and Provenance of this association is Disease Ontology. '

In [134]:
# Candidate context sentences; the last one is appended separately and uses a
# different wording ("associates with") from the other three.
high_similarity_context = ['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',
                          'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',
                          'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.']
high_similarity_context.append('Gene Xs sds associates with Disease multiple sclerosis and Provenance of this association is GWAS.')
node_name = 'multiple sclerosis'
node_types = nbr_nodes_df.node_type.unique()

# Split the raw response: an underscore in neo4j_type marks an edge.
nodes = [entry for entry in node_context if '_' not in entry['data']['neo4j_type']]
edges = [entry for entry in node_context if '_' in entry['data']['neo4j_type']]

# Id of the first node whose name matches, or None when no node matches.
# Scans every node rather than only inspecting the first one, and tolerates
# nodes that carry no 'name' property.
source_node_id = next(
    (entry['data']['id'] for entry in nodes if entry['data']['properties'].get('name') == node_name),
    None,
)


In [132]:
import numpy as np

# Take the first candidate sentence and, for every node type mentioned in it,
# print the position(s) of that type among the space-separated tokens.
sentence = high_similarity_context[0]

for node_type in node_types:
    if node_type not in sentence:
        continue
    print(np.where(node_type == np.array(sentence.split(' '))))

# Text following the last "Gene " marker in the sentence.
sentence.rsplit('Gene ', 1)[-1]

(array([0]),)
(array([4]),)


'HLA-DQA1 and Provenance of this association is GWAS.'

['Disease multiple sclerosis associates Gene HLA-DQA1 and Provenance of this association is GWAS.',
 'Disease multiple sclerosis associates Gene HLA-DRB1 and Provenance of this association is DISEASES.',
 'Disease multiple sclerosis associates Gene ATXN1 and Provenance of this association is GWAS.']

In [45]:
# Show the property mapping of `item`. NOTE(review): `item` is not defined in
# this cell; presumably it is the variable left behind by the last iteration of
# an earlier `for item in node_context` loop, so the result depends on run order.
item["data"]["properties"]

{'diseases_identifiers': ['https://diseases.jensenlab.org/Entity?documents=10&type1=9606&id1=ENSP00000369889&type2=-26&id2=DOID:0080044',
  'MedlinePlus'],
 'diseases_scores': ['6.503', 'CURATED'],
 'sources': ['DISEASES'],
 'diseases_sources': ['textmining', 'knowledge'],
 'diseases_confidences': [3.252, 5.0]}

In [104]:
# Names of the two endpoints of interest.
source = 'multiple sclerosis'
target = 'COL2A1'

# Partition the raw response: an underscore in neo4j_type marks an edge,
# everything else is a node.
nodes = [entry for entry in node_context if '_' not in entry['data']['neo4j_type']]
edges = [entry for entry in node_context if '_' in entry['data']['neo4j_type']]


In [37]:
# TODO: wrap this lookup in a get_node_id(inp_node) helper.
# Id of the first node whose name equals `source`, or None when none matches.
# Scans every node instead of only inspecting the first one, and tolerates
# nodes without a 'name' property.
next(
    (entry['data']['id'] for entry in nodes if entry['data']['properties'].get('name') == source),
    None,
)


152375