# Steps which may best happen before you run the workflow locally
* be running python3.6+  (say, in a virtual enviroment)
* you will need to have  `pip install jupyter` into the venv.
* have cloned this repo using `--recursive`

This workflow invokes "owltools" (https://github.com/owlcollab/owltools/wiki/Install-OWLTools)
which is a 37M Java language tool. This needs has to be installed into a location visible to the Notebook (like ```${HOME}/bin```).  The 0.3.0 release application binary can be used (it is a bash wrapper around a Java JAR but your download tool (e.g. Mac OSX) may inadvertently give it the cryptic file extension '.dms' (No... it is **NOT** an Amiga DMS archive!)  Just rename the file (if necessary) to ```owltools```,  make it executable and move it into a suitable location like ```/anaconda3/bin``` or ```/usr/bin```.

Alternately,

```curl http://build.berkeleybop.org/userContent/owltools/owltools > ~/bin/owltools && chmod +x /usr/bin/owltools```

should do the job.

# Install Python Dependencies (including the NCATS MVP Module Library)

In [1]:
import sys
import shutil

pyptha = sys.executable.split('/')
pyptha[-2]= 'lib'
pypth='/'.join(pyptha) + '*/site-packages'

# Hack to get around problematic updating of distutils installed PyYAML and a slightly older pandas requiring a compatible numpy
shutil.rmtree(pypth + '/PyYAML*', ignore_errors=True)
shutil.rmtree(pypth + '/numpy*', ignore_errors=True)

sys.path.append("../mvp-module-library")
# Install pip requirements
!{sys.executable} -m pip install -r requirements.txt



In [2]:
from BioLink.biolink_client import BioLinkWrapper
import pandas as pd

In [3]:
def output_file(tag,title,ext):
    filename = title.replace(" ","_")
    output = open("./Tidbit/"+tag+"/"+filename+"."+ext,"w+")
    output.info = { 'tag' : tag, 'title' : title }
    return output

def dump_html(output,body):
    output.write("<html>\n")
    output.write("<head></head>\n")
    output.write("<body>\n")
    output.write("<h1>"+output.info['title']+" for "+output.info['tag']+"</h1>")
    output.write(body.to_html())
    output.write("</body>")
    output.write("</html>")

In [4]:
from Modules.Mod0_lookups import LookUp


def diseaseLookUp(input_disease_symbol, input_disease_mondo):
    
    # workflow input is a disease identifier
    lu = LookUp()
    
    input_object = {
        'input': input_disease_mondo,
        'parameters': {
            'taxon': 'human',
            'threshold': None,
        },
    }

    lu.load_input_object(input_object=input_object)
    
    # input_object = lu.input_object # not sure why this extra assignment is necessary
    # get genes associated with disease from Biolink
    disease_associated_genes = lu.disease_geneset_lookup()
    
    # create list of gene curies for downstream module input
    input_curie_set = disease_associated_genes[['hit_id', 'hit_symbol']].to_dict(orient='records')
    
    # show the disease associated genes
    disease_associated_genes['modules'] = 'Mod0'
    
    # save the seed gene definition and gene list to a 
    # file under the "Tidbit/<symbol>" subdirectory
    
    output = output_file(input_disease_symbol,"Definition","json")
    lu.echo_input_object(output)
    output.close()
    
    output = output_file(input_disease_symbol,"Disease Associated Genes","html")
    dump_html(output,disease_associated_genes)
    output.close()
    
    # genes to investigate
    return lu.input_object, disease_associated_genes, input_curie_set


In [5]:
input_disease_symbol = "VHL"
input_disease_mondo = 'MONDO:0008667'

input_object, disease_associated_genes, input_curie_set = diseaseLookUp(input_disease_symbol, input_disease_mondo)

#  Echo to console
disease_associated_genes

Unnamed: 0,input_id,input_symbol,hit_id,hit_symbol,relation,sources,modules
0,MONDO:0008667,von Hippel-Lindau disease,HGNC:12687,VHL,pathogenic_for_condition,"ctd, omim, orphane, clinvar",Mod0
1,MONDO:0008667,von Hippel-Lindau disease,HGNC:1582,CCND1,contributes to,"omim, ctd",Mod0
2,MONDO:0008667,von Hippel-Lindau disease,HGNC:23057,BRK1,pathogenic_for_condition,clinvar,Mod0


In [6]:
def load_genes(model,data,threshold):
    
    # Module specification
    inputParameters = {
        'input': data,
        'parameters': {
            'taxon': 'human',
            'threshold': threshold,
        },
    }
    
    # Load the computation parameters
    model.load_input_object(inputParameters)
    model.load_gene_set()
    
def similarity( model, data, threshold, input_disease_symbol, module, title ):

    # Initialize
    load_genes(model,data,threshold)
    model.load_associations()
    
    # Perform the comparison
    results = model.compute_similarity()
    
    # Process the results
    results_table = pd.DataFrame(results)
    results_table = results_table[~results_table['hit_id'].isin(disease_associated_genes['hit_id'].tolist())].sort_values('score', ascending=False)
    results_table['module'] = module
    
    # save the gene list to a file under the "Tidbit" subdirectory
    output = output_file(input_disease_symbol,title,"html")
    dump_html(results_table)
    
    return results_table

# Mod1A Functional Similarity
## Find similar genes based on GO functional annotations using OntoBio Jaccard similarity

In [7]:
from Modules.Mod1A_functional_sim import FunctionalSimilarity

func_sim_human = FunctionalSimilarity()

# Using Jaccard index threshold
Mod1A_results = similarity( func_sim_human, input_curie_set, 0.75, input_disease_symbol, 'Mod1A', "Functionally Similar Genes" )

Mod1A_results

TypeError: dump_html() missing 1 required positional argument: 'body'

# MOD1B Phenotype Similarity
## Find similar genes based on OwlSim calculated Phenotype Similarity

## Mod1B Human

In [8]:
from Modules.Mod1B1_phenotype_similarity import PhenotypeSimilarity

pheno_sim_human = PhenotypeSimilarity()

# Using OwlSim calculation threshold
Mod1B_results = similarity( pheno_sim_human, input_curie_set, 0.50, input_disease_symbol, 'Mod1B', "Phenotypically Similar Genes" )

Mod1B_results

TypeError: __init__() got an unexpected keyword argument 'verbose'

# Mod1E Protein Interaction

In [None]:
def gene_interactions( model, data, input_disease_symbol, module, title ):
    
    # Initialize
    load_genes(model,data,None)
        
    results = model.get_interactions()
    
    results_table = pd.DataFrame(results)
    
    counts = results_table['hit_symbol'].value_counts().rename_axis('unique_values').to_frame('counts').reset_index()
    high_counts = counts[counts['counts'] > 12]['unique_values'].tolist()
    
    final_results_table = pd.DataFrame(result_tables[result_tables['hit_symbol'].isin(high_counts)])
    
    final_results_table['module'] = module
    
    # save the gene list to a file under the "Tidbit" subdirectory
    dump(final_results_table.head(),input_disease_symbol,title)
    
    # Echo to console

    final_results_table.head()  

## Mod1E Human

In [None]:
from Modules.Mod1E_interactions import GeneInteractions

interactions_human = GeneInteractions()

gene_interactions( interactions_human, input_curie_set, input_disease_symbol, 'Mod1E', "Gene Interactions" )

# Publish Aggregate Results

In [None]:
from Modules.StandardOutput import StandardOutput

def aggegrate_results(resultsA,resultsB):
    
    # aggregate results
    all_results = pd.concat([resultsA,resultsB])
    
    so = StandardOutput(results=all_results.to_dict(orient='records'), input_object=input_object)
    std_api_response_json = so.output_object
    std_api_response_json
    
    return std_api_response_json

std_api_response_json = aggegrate_results(Mod1A_results_human, Mod1B_results)

In [None]:
import requests

def publish_to_rtx(std_api_response_json):
    
    # get the URL for these results displayed in the RTX UI
    RTX_UI_REQUEST_URL = "https://rtx.ncats.io/api/rtx/v1/response/process"
    to_post = {"options": ["Store", "ReturnResponseId"], "responses": [std_api_response_json]}
    ui_url = requests.post(RTX_UI_REQUEST_URL, json=to_post)
    print("Please visit the following website: https://rtx.ncats.io/?r=%s" % ui_url.json()['response_id'])
    
    # Retrieving Details

    print("Please visit the following link to retrieve JSON results: https://rtx.ncats.io/api/rtx/v1/response/%s" % ui_url.json()['response_id'])

publish_to_rtx(std_api_response_json)

In [None]:
# Read a table of diseases and process
with open("diseases.tsv","r") as diseases:
    for entry in diseases.readlines():
        field = entry.split("\t")
        continue if field[1] == "Disease"
        
        input_disease_symbol = field[1]
        input_disease_mondo  = field[3]
        
        # process
        