# Arachne Pipeline

This notebook creates a network using Arachne-generated data.

## Imports Python Packages Needed in the Pipeline

In [107]:
import ndex2.client as nc
import json
import pandas as pd
import sys
import jsonschema
import requests
import ndexutil.tsv.tsv2nicecx as t2n
from os.path import isfile, expanduser
from os import listdir

### NDEx Connection Function

In [108]:
def load_tutorial_config(connection_name):
    username = "foo"
    password = None
    config_file = expanduser("~/ndex_tutorial_config.json")
    server = "http://public.ndexbio.org"

    if isfile(config_file):
        file = open(config_file, "r")
        config = json.load(file)
        file.close()
        print(json.dumps(config))
        connections = config.get("connections")
        if connections:
            if connection_name:
                connection = connections.get(connection_name)
                if connection.get("password") and connection.get("username"):
                    if connection.get("server"):
                        server = connection.get("server")
                    username = connection.get("username")
                    password = connection.get("password")
                else:
                    print("Error: " + str(connection_name) + " does not define both username and password")
            else:
                print("Error: " + str(connection_name) + "connection name is not defined")
        else:
            print("Error: " + config_file + " does not define any connections")
    else:
        print("Error: " + config_file + " was not found")

    return server, username, password

### MyGene.info Access Functions

In [116]:

def query(q, tax_id='9606', entrezonly=True):
    if entrezonly:
        r = requests.get('http://mygene.info/v3/query?q='+q+'&species='+tax_id+'&entrezonly=true')
    else:
        r = requests.get('http://mygene.info/v3/query?q='+q+'&species='+tax_id)
    result = r.json()
    result['query'] = q
    return result

# run multiple queries and gather results
def query_list(queries, tax_id='9606', fields='sym'):
    results = []
    for q in queries:
        results.append(query(q, tax_id))
    return results

# use the mygene.info efficient batch query method
# to process multiple identifers in one operation
def query_batch(
        query_string,
        tax_id='9606',
        scopes="symbol, entrezgene, alias, uniprot",
        fields="symbol, entrezgene"):
    data = {'species': tax_id,
            'scopes': scopes,
            'fields': fields,
            'q': query_string}
    r = requests.post('http://mygene.info/v3/query', data)
    json = r.json()
    return json

def split_to_lists(input_list, target_length=5000):
    '''splits list of strings into sublists, each 
    having string length at most 5000'''
    out = [[]]
    while input_list:
        if len("', '".join(out[-1])) + len(input_list[0]) < target_length:
            out[-1].append(input_list.pop(0))
        else:
            if not out[-1]: # string longer than target_length
                out[-1] = [input_list.pop(0)]
            out.append([])
    return out

def query_large_batch(
        input_ids,
        scopes=["symbol", "entrezgene", "uniprot"],
        field ="entrezgene",
        tax_id='9606',
        prefix = "ncbigene"):
    all_query_results = []
    input_id_lists = split_to_lists(input_ids)
    for id_list in input_id_lists:
        query_string = ", ".join(id_list)
        scope_string = ", ".join(scopes)
        query_results = query_batch(
            query_string,
            scopes=scope_string,
            fields=field,
            tax_id=tax_id)
        print("query batch: ids: " + str(len(id_list)) + " -> results: " + str(len(query_results)))
        all_query_results.extend(query_results)
    return all_query_results

# --------------------------------------------------
#
#  translation methods
#
# --------------------------------------------------

# return:
# 1. a dictionary mapping identifiers in input_ids
# to identifiers types in scopes. When there are ambiguous
# mappings, the highest ranked mapping is chosen and the
# others are placed in an "alternatives" attribute.
# 2. a list of unmapped input_ids
#
# The mygene.info search is limited to the identifier types in scopes.
#
# The results are limited to the species specified by tax_id.

# Optionally, prefix_map may be set to a dictionary mapping
# identifier types to prefixes to prepend to mapped ids, e.g. <prefix>:<id>
#
def get_identifier_map(
        input_ids,
        scopes=["symbol", "entrezgene", "uniprot"],
        field ="entrezgene",
        tax_id='9606',
        prefix = "ncbigene"):
    query_results = query_large_batch(
        input_ids,
        scopes=scopes,
        field =field,
        tax_id=tax_id,
        prefix = prefix)
    map = {}
    for result in query_results:
        # print(json.dumps(result))
        output_id = result.get(field)
        input_id = result.get("query")
        if output_id:
            mapping = map.get(input_id)
            if mapping:
                # if there is already a mapping, 
                # we test to see the new mapping
                # has a better score
                if mapping.get("_score") < result.get("_score"):
                    map[input_id] = result
            else:
                map[input_id] = result

    # make output map, add prefixes
    output_map = {}
    for input_id, mapping in map.items():
        output_map[input_id] = prefix + ":" + str(mapping.get(field))
    
    # scan query results and populate unmmaped
    unmapped = []
    for result in query_results:
        input_id = result.get("query")
        if not output_map.get(input_id):
            unmapped.append(input_id)
    unmapped = list(set(unmapped))    
    return output_map, unmapped
                

### Set variables
Note: The load_tutorial_config() function uses a file in the user's home directory (**~/ndex_tutorial_config.json**).  A sample json file is included in the root of this project (also named "ndex_tutorial_config.json").  Edit this file and copy it to your user home directory.  Alternatively you can just use the hardcoded variables below.

In [111]:
my_server, my_username, my_password = load_tutorial_config("main")
# alternatively, edit and uncomment these lines to set the connection parameters manually
#my_server = "public.ndexbio.org"
#my_username = 'username'
#my_password = 'password'

if 'dev.ndexbio.org' in my_server:
    cytoscape_visual_properties_template_id = 'c7075eb1-231e-11e8-894b-525400c25d22' # DEV
else:
    cytoscape_visual_properties_template_id = '06afeea7-2e04-11e8-b939-0ac135e8bacf' # PUBLIC

my_ndex=nc.Ndex2(my_server, my_username, my_password)
print(my_server)
print(my_username)
print(my_password)

{"connections": {"main": {"username": "drh", "password": "drh"}}}
http://public.ndexbio.org
drh
drh


### Set load plan
Note: To see the field mapping (load plan) open **arachne_load_plan.json**. This json file resides in the same directory as this notebook

In [112]:
path_to_load_plan = 'load_plan.json'
load_plan = None
with open(path_to_load_plan, 'r') as lp:
    load_plan = json.load(lp)

### Process Arachne networks

In [117]:
def process_arachne_networks(load_plan):
    print('starting...')
    for file in listdir("data"):
        if file.endswith(".txt"):   
            print("processing " + file)
            process_arachne_network(load_plan, "data/" + file)
    print('finished...')
        
def process_arachne_network(load_plan, file):
    # @CONTEXT is set from the load plan
    with open(file, 'r') as tsvfile:
        header = [h.strip() for h in tsvfile.readline().split('\t')]
        df = pd.read_csv(
            tsvfile, 
            delimiter='\t', 
            na_filter=False, 
            engine='python', names=header)
    
    # ncbigene symbols are prefixed
    # create translation map for gene symbols
    node_identifiers = list(set(df["Gene1"].tolist() + df["Gene2"].tolist()))
    print(str(len(node_identifiers)) + " node_identifiers")
    identifier_map, unmapped_identifiers = get_identifier_map(
        node_identifiers,
        scopes = ["symbol", "alias"],
        field = "entrezgene",
        tax_id='9606',
        prefix = "ncbigene")    
    print(json.dumps(unmapped_identifiers))
    print(str(len(identifier_map.keys())) + " map entries")
    
    # for input_id, output_id in identifier_map.items():
    #    print(input_id + " -> " + output_id)
    
    # add columns Rep1 and Rep2 and translate Gene1 and Gene2
    df['Rep1'] = df['Gene1'].apply(lambda x: identifier_map.get(x)) # if identifier_map.get(x) else x)
    df['Rep2'] = df['Gene2'].apply(lambda x: identifier_map.get(x)) # if identifier_map.get(x) else x)
    
    print(df)

    # if the represents field is empty, substitute the gene name
#    df['Rep1'].replace('', df['Gene1'], inplace=True)
#    df['Rep2'].replace('', df['Gene2'], inplace=True)

#def convert_pandas_to_nice_cx_with_load_plan(pandas_dataframe, load_plan, max_rows=None,
#                                            name=None, description=None,
#                                            network_attributes=None, provenance=None):
    network = t2n.convert_pandas_to_nice_cx_with_load_plan(df, load_plan)

#    network.set_network_attribute("organism", "Human, 9606, Homo sapiens")
#    network.union_node_attributes('alias', 'alias2', 'alias')

    network.apply_template(username=my_username, password=my_password, server=my_server,
                           uuid=cytoscape_visual_properties_template_id)
    message = network.upload_to(my_server, my_username, my_password)


In [118]:
process_arachne_networks(load_plan)

starting...
processing BreastCancerMCF7-CMAP.txt
9571 node_identifiers
query batch: ids: 54 -> results: 60
query batch: ids: 54 -> results: 61
query batch: ids: 55 -> results: 64
query batch: ids: 54 -> results: 61
query batch: ids: 57 -> results: 62
query batch: ids: 53 -> results: 63
query batch: ids: 53 -> results: 60
query batch: ids: 54 -> results: 63
query batch: ids: 53 -> results: 60
query batch: ids: 55 -> results: 61
query batch: ids: 56 -> results: 59
query batch: ids: 55 -> results: 65
query batch: ids: 55 -> results: 59
query batch: ids: 55 -> results: 57
query batch: ids: 54 -> results: 59
query batch: ids: 54 -> results: 55
query batch: ids: 57 -> results: 77
query batch: ids: 55 -> results: 62
query batch: ids: 54 -> results: 60
query batch: ids: 55 -> results: 58
query batch: ids: 54 -> results: 63
query batch: ids: 55 -> results: 66
query batch: ids: 55 -> results: 63
query batch: ids: 54 -> results: 65
query batch: ids: 55 -> results: 70
query batch: ids: 54 -> resul

RuntimeError: Id value is missing.