# Refresh ermineJ GO files and run it


## 1. Refresh ermineJ data

Download the ontology and annotation files and let Python decompress them. ErmineJ can read the gzipped files directly, but saving them decompressed lets us dig into them out of curiosity if we like.

In [1]:
""" Update these paths for fresh data before running GO.
    If you enter the URL up to the final '/', without the file, you can browse and see modified dates.
"""

# Gene Ontology term database: where to download it and where to save it.
ontology = dict(
    url='http://archive.geneontology.org/latest-termdb/go_daily-termdb.rdf-xml.gz',
    path='./2019-07-09-erminej_go.rdf-xml',
)

# Human gene annotation file: where to download it and where to save it.
annotation = dict(
    url='https://gemma.msl.ubc.ca/annots/Generic_human_ncbiIds_noParents.an.txt.gz',
    path='./2019-11-20-erminej_human_annotation_entrezid.txt',
)


In [2]:
""" Download a fresh version of each compressed file and save it decompressed. """

import urllib.request as request
import gzip

# Fetch each gzip archive and store its decompressed contents at the configured path.
for data_file in [ontology, annotation, ]:
    # Use the response as a context manager so the connection is always closed.
    with request.urlopen(data_file['url']) as response:
        compressed = response.read()

    # Decompress BEFORE opening the destination: opening with "wb" truncates,
    # so a corrupt download would otherwise wipe out a previously good file.
    decompressed = gzip.decompress(compressed)

    with open(data_file['path'], "wb") as f:
        f.write(decompressed)


## 2. Prepare a scored gene list

We have tables of probes with average rankings across 32 split-quarters. This is what we want to feed ermineJ, but they are indexed by probe_id values, and ermineJ needs gene identifiers. Re-index each table by its `entrez_id` column, keep only the mean ranking, then save it out as a tsv file for use by ermineJ.


In [3]:
""" Load four dataframes, one for each distance-mask, and save each in a format ermineJ can read. """

import os
import pandas as pd


base_dir = "/home/mike/ge_data"
masks = ["00", "16", "32", "64", ]
rank_files = []
os.makedirs("gene_scores", exist_ok=True)

dfs = {}
for mask in masks:
    # Locate the ranked csv belonging to this distance mask
    filename = "hcpww{}ss4peak_ranked_full.csv".format(mask)
    filepath = os.path.join(base_dir, "plots", filename)

    # Read the table and order it from lowest to highest mean ranking
    ranked = pd.read_csv(filepath)
    ranked = ranked.sort_values('raw_mean', ascending=True)

    # Keep just the mean ranking, keyed by entrez_id
    dfs[mask] = ranked.set_index('entrez_id')[['raw_mean']]

    # Remember where the scores go, then write them tab-separated
    rank_file_path = "gene_scores/mean_ranks_{}.tsv".format(mask)
    rank_files.append(rank_file_path)
    dfs[mask].to_csv(rank_file_path, sep="\t")


## 3. Execute ermineJ analyses

Perhaps the easiest way to do this is to use the Java GUI to select an analysis and work out which options suit it best. Then look at the text file it saves as results: near the top of that file is the full command, with every option used. Use those options to seed the command below, which can then be re-run or looped over with different inputs or settings.


In [6]:
""" Update paths to your own installation. Then this should just work. """

import subprocess
import time


os.makedirs("./results", exist_ok=True)
result_files = []

# Run one ermineJ gene-score analysis per rank file, logging its console output.
for rank_file in rank_files:
    start_time = time.time()
    result_file = rank_file.replace("gene_scores", "results").replace(".tsv", "_erminej_gsr_results.txt")
    log_file = result_file.replace(".txt", ".log")

    # Run ermineJ
    p = subprocess.run(
        [
            'ermineJ.sh',
            '-d', '/home/mike/Dropbox/Projects/GE-Conn/gene_ontology/ermineJ.data',
            '--annots', annotation['path'],
            '--classFile', ontology['path'],
            '--scoreFile', rank_file,
            '--test', 'GSR',                 # Method for computing significance. GSR best for gene scores
            '--mtc', 'FDR',                  # FDR indicates Benjamini-Hochberg corrections for false discovery rate
            '--reps', 'BEST',                # If a gene has multiple scores in input, use BEST
            '--genesOut',                    # Include gene symbols in output
            '--minClassSize', '5',           # smallest gene set size to be considered
            '--maxClassSize', '128',         # largest gene set size to be considered
            '-aspects', 'BCM',               # Test against all three GO components
            '-b', 'false',                   # Big is not better, rankings are low==good
            '--logTrans', 'false',           # If we fed p-values, we would set this to true
            '--output', result_file,
        ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    # Write the log file
    result_files.append(result_file)
    with open(log_file, "w") as f:
        f.write("STDOUT:\n")
        f.write(p.stdout.decode())
        f.write("STDERR:\n")
        f.write(p.stderr.decode())

    end_time = time.time()

    # subprocess.run does not raise on failure unless check=True, so surface a
    # non-zero exit status instead of reporting a failed run as a success.
    if p.returncode != 0:
        print("WARNING: ermineJ exited with status {} for {}; see {}".format(
            p.returncode, rank_file, log_file
        ))

    print("Gene ontology for {} took {:0.1f} seconds.".format(rank_file, end_time - start_time))
    

Gene ontology for gene_scores/mean_ranks_00.tsv took 24.7 seconds.
Gene ontology for gene_scores/mean_ranks_16.tsv took 24.5 seconds.
Gene ontology for gene_scores/mean_ranks_32.tsv took 25.2 seconds.
Gene ontology for gene_scores/mean_ranks_64.tsv took 24.6 seconds.


## 4a. Display results in this notebook

We can parse the result files as tsv data after stripping headers and footers.

In [None]:
""" Create functions to manipulate ermineJ output """

import re
import pandas as pd


def tsvify_erminej_result(result_file):
    """ Take in a result from ermineJ and write it out with headers stripped.

        Only two kinds of line are kept: the column-header line, which starts
        with '#!<tab>', and data lines, which start with '!<tab>'. The marker is
        removed, trailing whitespace is trimmed, and everything else is dropped.

        :param result_file: path to an ermineJ result file; must contain '.txt'
        :returns: path of the tsv written, i.e. result_file with '.txt' replaced by '.tsv'
        :raises ValueError: if result_file has no '.txt' to replace
    """

    tsv_file = result_file.replace(".txt", ".tsv")
    if tsv_file == result_file:
        # With nothing to replace, the output path IS the input path, and
        # opening it for writing would truncate the very file being read.
        raise ValueError(
            "Cannot derive a .tsv name from '{}'; expected a path containing '.txt'".format(result_file)
        )

    with open(result_file, "r") as f_in, open(tsv_file, "w") as f_out:
        for line in f_in:
            if line.startswith("#!\t"):
                # Column-header line: drop the three-character '#!<tab>' marker
                f_out.write(line[3:].rstrip() + "\n")
            elif line.startswith("!\t"):
                # Data line: drop the two-character '!<tab>' marker
                f_out.write(line[2:].rstrip() + "\n")
    return tsv_file


def describe_top_results(tsv_file, top_n=10):
    """ Read a tsv-based results file and print results legibly.

        Rows are ordered by ascending 'CorrectedPvalue'. Only rows with a
        corrected p-value below 0.05 are reported, and at most top_n of them.

        :param tsv_file: path to a tab-separated file with 'ID', 'Name', and 'CorrectedPvalue' columns
        :param top_n: maximum number of terms to print
        :returns: None; one formatted line per reported term is printed
    """

    def describe_go_term(row):
        """ Format one result row as 'ID: Name p=value', flooring tiny p-values at '<0.00001'. """
        p_value = row['CorrectedPvalue']
        p_text = "={:0.5f}".format(p_value) if p_value > 0.00001 else "<0.00001"
        return "{:<12}: {:<48} p{}".format(row['ID'], row['Name'], p_text)

    df = pd.read_csv(tsv_file, sep='\t').sort_values('CorrectedPvalue', ascending=True)
    significant = df.loc[df['CorrectedPvalue'] < 0.05, :].iloc[:top_n, :]

    # A plain loop, rather than DataFrame.apply, because printing is a side effect;
    # it also does nothing at all when no row passes the threshold.
    for _, row in significant.iterrows():
        print(describe_go_term(row))
    

In [5]:
"""
tsv_file = "results/mean_ranks_00_erminej_gsr_results.tsv"
df = pd.read_csv(tsv_file, sep='\t').sort_values('CorrectedPvalue', ascending=True)
df.loc[df['CorrectedPvalue'] < 0.5, :]
"""
# The snippet above sits inside a string literal, so it is never executed;
# `pass` makes this cell an explicit no-op. Copy the snippet into a live cell to run it.
pass

In [None]:
""" Interpret and report on results. """

# For each ermineJ result: strip it down to a tsv, then print its top GO terms.
for result_file in result_files:
    describe_top_results(tsvify_erminej_result(result_file))


## 4b. Explore results in ermineJ

In [None]:
""" You'll need to manually load the results into ermineJ to view them.
    You may even have to manually select annotation and GO files (it seems to ignore these when using --gui)
    After [start]ing, Ctrl-L to load results.
"""

# Assemble the GUI launch command first, then run it with both output streams captured.
gui_command = [
    'ermineJ.sh',
    '-d', '/home/mike/Dropbox/Projects/GE-Conn/gene_ontology/ermineJ.data',
    '--annots', annotation['path'],
    '--classFile', ontology['path'],
    '--gui',
]
p = subprocess.run(gui_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
