# Prepare per-observation details

In [2]:
import collections
import json
import bz2
import os
import configparser

import pandas
import numpy
from scipy.special import logit
#from neo4j.v1 import GraphDatabase

import hetio.readwrite
import hetio.neo4j

In [3]:
# Load the fitted model coefficients (one row per feature) and the transformed
# feature matrix (one row per compound-disease pair).
# NOTE(review): the saved output of this cell shows a CParserError raised while
# tokenizing the input; later cells use feature_df, so re-run this cell on a
# fresh kernel and confirm both files load before trusting downstream results.
coef_df = pandas.read_table('model/coefficient.tsv')
feature_df = pandas.read_table('features/transformed-features.tsv.bz2', low_memory=False)

CParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [3]:
# Keep only the informative feature terms: discard the prior/intercept terms
# and any feature whose coefficient is exactly zero, then order the remaining
# features by ascending coefficient.
drop = {'prior_logit', 'intercept'}
coef_df = (
    coef_df
    .query("feature not in @drop")
    .query("coef != 0")
    .sort_values('coef')
)

## Term contribution

In [4]:
# Map each retained feature name to its coefficient.
coef_series = pandas.Series(data=coef_df.coef.tolist(), index=coef_df.feature)
# Multiply every retained feature column by its coefficient, giving the
# contribution of each term for each compound-disease row.
contrib_df = feature_df[coef_df.feature].mul(coef_series, axis='columns')
# Prepend the identifier columns so each row can be traced to its pair.
contrib_df = feature_df[['compound_id', 'disease_id']].join(contrib_df)

In [5]:
# Preview the first two rows of the per-term contribution table.
contrib_df.head(2)

Unnamed: 0,compound_id,disease_id,degree_CrC,degree_CiPC,dwpc_CiPCiCdGaD,dwpc_CbGdDdGaD,dwpc_CrCuGaDrD,dwpc_CrCdGiGaD,dwpc_CrCuG<rGaD,dwpc_CrCdGeAlD,...,dwpc_CbGeAlD,dwpc_CbGpPWpGaD,dwpc_CpDpCtD,dwpc_CcSEcCtD,dwpc_CrCrCtD,dwpc_CtDrD,dwpc_CrCtD,dwpc_CiPCiCtD,dwpc_CbGaD,dwpc_CbGbCtD
0,DB01048,DOID:10652,0.111438,-0.102151,-0.144767,-0.166562,-0.020455,-0.032502,-0.032995,0.003374,...,-0.01148,0.04151,-0.005732,0.142106,-0.020583,-0.016584,-0.016985,-0.016315,-0.070953,-0.114095
1,DB05812,DOID:10652,0.203591,0.05518,0.020172,0.005132,0.005451,-0.039674,-0.016017,0.004326,...,0.001456,0.059534,-0.005732,0.137955,-0.020583,-0.016584,-0.016985,-0.016315,0.243182,0.296602


In [6]:
# Persist the per-term contributions as a bz2-compressed, tab-separated file.
# Floats are written with 5 significant digits and the row index is omitted.
with bz2.open('./predictions/term-contribution.tsv.bz2', 'wt') as write_file:
    contrib_df.to_csv(write_file, float_format='%.5g', sep='\t', index=False)

## Metapath contribution

In [7]:
# Select the DWPC features whose coefficient is positive, and derive the
# metapath abbreviation from the feature name (the text after 'dwpc_').
pos_dwpc_coef_df = coef_df.loc[
    coef_df.feature.str.startswith('dwpc_') & (coef_df.coef > 0)
].copy()
pos_dwpc_coef_df['metapath'] = [
    feature.split('_')[1] for feature in pos_dwpc_coef_df.feature
]
pos_dwpc_coef_df.head(2)

Unnamed: 0,feature,coef,zcoef,metapath
37,dwpc_CbGdCrCtD,0.000694,0.001141,CbGdCrCtD
101,dwpc_CrCbGaD,0.002035,0.002283,CrCbGaD


In [8]:
# Estimate the share of each observation's positive signal attributable to each
# positive-coefficient DWPC term. Negative contributions are zeroed, then every
# row is divided by its row total, so values are fractions in [0, 1] (not
# percentages). Rows whose total is zero become NaN after the division.
pos_contrib_df = contrib_df[pos_dwpc_coef_df.feature]
pos_contrib_df = pos_contrib_df.mask(pos_contrib_df < 0, 0)
observation_total = pos_contrib_df.sum(axis='columns')
pos_contrib_df = (
    pos_contrib_df
    .div(observation_total, axis='index')
    # Label columns by metapath abbreviation rather than by feature name.
    .rename(columns=lambda x: x.split('_')[1])
)
pos_contrib_df = contrib_df[['compound_id', 'disease_id']].join(pos_contrib_df)
pos_contrib_df.head(2)

Unnamed: 0,compound_id,disease_id,CbGdCrCtD,CrCbGaD,CbGeAlD,CbGpPWpGaD,CpDpCtD,CcSEcCtD,CrCrCtD,CtDrD,CrCtD,CiPCiCtD,CbGaD,CbGbCtD
0,DB01048,DOID:10652,0.0,0.0,0.0,0.22607,0.0,0.77393,0.0,0.0,0.0,0.0,0.0,0.0
1,DB05812,DOID:10652,0.0,0.0,0.001971,0.08059,0.0,0.186746,0.0,0.0,0.0,0.0,0.32919,0.401503


In [9]:
# Persist the per-metapath contribution fractions as a bz2-compressed,
# tab-separated file. Floats are written with 4 significant digits and the row
# index is omitted.
with bz2.open('./predictions/metapath-contribution.tsv.bz2', 'wt') as write_file:
    pos_contrib_df.to_csv(write_file, float_format='%.4g', sep='\t', index=False)

## Contribution by path

In [10]:
# Load the table of predictions, one row per compound-disease pair.
# For quick testing, shrink the set by appending: .query("prediction > 0.50")
predict_df = pandas.read_table('predictions/probabilities.tsv', low_memory=False)
# Report the number of predictions loaded.
len(predict_df)

209168

In [11]:
# Build a lookup of untransformed DWPC values keyed by
# (compound_id, disease_id, feature), where feature is the full column name
# including its 'dwpc_' prefix (the melted 'metapath' column holds column names).
untran_df = pandas.read_table('features/features.tsv.bz2', low_memory=False)

dwpc_melt_df = pandas.melt(
    untran_df,
    id_vars=['compound_id', 'disease_id'],
    value_vars=list(pos_dwpc_coef_df.feature),
    var_name='metapath',
    value_name='dwpc',
)
untran_dwpc_map = {
    (melted.compound_id, melted.disease_id, melted.metapath): melted.dwpc
    for melted in dwpc_melt_df.itertuples()
}

## Prepare a list of observations

In [12]:
# Build one ordered record per prediction: the non-null descriptive fields in
# a fixed order, followed by the non-zero, non-null metapath contributions.
obj = list()

for i, row in predict_df.merge(pos_contrib_df).iterrows():
    observation = collections.OrderedDict(
        (key, row[key])
        for key in ['compound_id', 'compound_name', 'disease_id', 'disease_name', 'category', 'status', 'prediction', 'training_prediction', 'compound_percentile', 'disease_percentile', 'n_trials']
        if pandas.notnull(row[key])
    )
    # Metapaths contributing nothing (zero or NaN) are left out of the record.
    observation['metapath_contribution'] = collections.OrderedDict(
        (metapath, row[metapath])
        for metapath in pos_dwpc_coef_df.metapath
        if not (row[metapath] == 0 or pandas.isnull(row[metapath]))
    )
    obj.append(observation)
len(obj)

209168

In [13]:
# Preview the first rows of the prediction table.
predict_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prediction,training_prediction,compound_percentile,disease_percentile,n_trials,status_trials,status_drugcentral
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,0.00093,0.001129,0.125,0.154746,0.0,0.0,0.0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,0.003795,0.004604,0.757353,0.842653,0.0,0.0,0.0
2,DB00659,Acamprosate,DOID:10652,Alzheimer's disease,,0,0.004753,0.01623,0.019638,0.985294,0.988296,0.0,0.0,0.0
3,DB00284,Acarbose,DOID:10652,Alzheimer's disease,,0,0.004753,0.001469,0.001783,0.595588,0.368661,0.0,0.0,0.0
4,DB01193,Acebutolol,DOID:10652,Alzheimer's disease,,0,0.004753,0.001774,0.002153,0.772059,0.472042,0.0,0.0,0.0


## Load metapath information

In [14]:
def create_path_template(metarels):
    """
    Build a Cypher path pattern template for a sequence of metarels.

    Each metarel is a (source_label, target_label, rel_type, direction) tuple.
    The returned string still contains one `{}` placeholder per node (inside
    escaped `{{ }}` braces), so a later `.format(*identifiers)` call yields a
    concrete pattern such as `(:Label {identifier: 1})-[:REL]-(...)`.

    A direction of 'forward' renders `-[..]->`, 'backward' renders `<-[..]-`,
    and any other value renders an undirected `-[..]-`.
    """
    # The first node's label comes from the source label of the first metarel.
    pieces = ['(:' + metarels[0][0] + ' {{identifier: {}}})']
    for _source_label, target_label, rel_type, direction in metarels:
        if direction == 'backward':
            left, right = '<-', '-'
        elif direction == 'forward':
            left, right = '-', '->'
        else:
            left, right = '-', '-'
        hop = '{dir0}[:{rel_type}]{dir1}({target_label} {{{{identifier: {{}}}}}})'.format(
            dir0=left,
            rel_type=rel_type,
            dir1=right,
            target_label=':{}'.format(target_label),
        )
        pieces.append(hop)
    return ''.join(pieces)

def get_paths(elem):
    """
    Query Neo4j for the individual paths behind one observation's prediction.

    For every metapath listed in `elem['metapath_contribution']`, runs the
    corresponding query from `metapath_to_query` with the compound, disease,
    untransformed DWPC and metapath contribution as parameters, and collects
    the returned rows into a DataFrame annotated with the metapath and with
    readable source/target edge labels.

    Returns a single DataFrame sorted by descending `percent_of_prediction`,
    or None when the observation has no contributing metapaths.

    Relies on the notebook-level `session`, `untran_dwpc_map`,
    `metapath_to_query`, `metapath_to_source_metaedge` and
    `metapath_to_target_metaedge`.
    """
    c_id = elem['compound_id']
    d_id = elem['disease_id']
    frames = []
    for metapath, contribution in elem['metapath_contribution'].items():
        parameters = {
            'source': c_id,
            'target': d_id,
            # Untransformed DWPC features are keyed by their full column name.
            'dwpc': untran_dwpc_map[(c_id, d_id, 'dwpc_' + metapath)],
            'metapath_contribution': contribution,
        }
        result = session.run(metapath_to_query[metapath], parameters)
        df = pandas.DataFrame((x.values() for x in result), columns=result.keys())

        def source_edge(names):
            # First node, the metapath's first metaedge kind, second node.
            return '—'.join([names[0], metapath_to_source_metaedge[metapath], names[1]])

        def target_edge(names):
            # Last node, the metapath's last metaedge kind, second-to-last node.
            return '—'.join([names[-1], metapath_to_target_metaedge[metapath], names[-2]])

        df['source_edge'] = df['nodes'].map(source_edge)
        df['target_edge'] = df['nodes'].map(target_edge)
        # Collapse the node-name list into a single display string last, since
        # the edge labels above index into the list form.
        df['nodes'] = df['nodes'].map('—'.join)
        df['metapath'] = metapath
        frames.append(df)
    if frames:
        return pandas.concat(frames).sort_values('percent_of_prediction', ascending=False)
    return None

def format_property(x):
    """
    Render a node identifier as a Cypher literal.

    Integers are rendered as bare digits; strings are wrapped in double quotes
    with embedded backslashes and double quotes backslash-escaped, so the
    resulting literal cannot terminate early and corrupt the query.

    Raises:
        TypeError: if `x` is neither an int nor a str. An explicit raise is
        used instead of `assert False`, which is removed under `python -O` and
        would make the function silently return None.
    """
    if isinstance(x, int):
        return str(x)
    if isinstance(x, str):
        # Escape backslashes first so the escapes added for quotes survive.
        escaped = x.replace('\\', '\\\\').replace('"', '\\"')
        return '"{}"'.format(escaped)
    raise TypeError(
        'cannot format {!r} of type {} as a Cypher property'.format(x, type(x).__name__)
    )

def get_summary_cypher(path_df, n_max = 5):
    """
    Compose a Cypher query that returns the first `n_max` paths of `path_df`.

    Each selected row becomes a `MATCH p<i> = <pattern>` clause, where the
    pattern is the row's metapath template from `metapath_to_cypher` filled
    with the row's `node_ids`; a final `RETURN [p0, p1, ...]` returns them all.

    Returns None when there are no rows to summarize.
    """
    top_paths = path_df.iloc[:n_max, :]
    if top_paths.empty:
        return None
    match_clauses = []
    for position, path_row in enumerate(top_paths.itertuples()):
        template = metapath_to_cypher[path_row.metapath]
        pattern = template.format(*map(format_property, path_row.node_ids))
        match_clauses.append('MATCH p{} = {}'.format(position, pattern))
    path_names = ['p{}'.format(position) for position in range(len(top_paths))]
    return_clause = 'RETURN [{}]'.format(', '.join(path_names))
    return '\n'.join(match_clauses) + '\n' + return_clause

def get_directory(compound_id, disease_id, base_path='../../het.io-rep-data/prediction-info'):
    """
    Return the output directory for a compound-disease pair, creating it if needed.

    The directory is `<base_path>/<compound_id>/<disease_id>`, with every ':'
    in the disease identifier replaced by '_'.

    Parameters:
        compound_id: compound identifier, used as-is for the first level.
        disease_id: disease identifier; ':' is replaced by '_'.
        base_path: root directory for all prediction info. Defaults to the
            location previously hard-coded here, so existing calls are
            unaffected; override it to write elsewhere (e.g. a scratch run).

    Returns:
        The directory path as a string. It is safe to call repeatedly because
        existing directories are left untouched.
    """
    directory = os.path.join(base_path, compound_id, disease_id.replace(':', '_'))
    os.makedirs(directory, exist_ok=True)
    return directory

In [15]:
# Read the project configuration to find which commit of the integrate
# repository to use for the metagraph.
# Note: ConfigParser.read silently skips files it cannot open, so a missing
# '../config.ini' surfaces as a KeyError on the lookup below.
config = configparser.ConfigParser()
config.read('../config.ini')
commit = config['hetnet']['integrate_commit']
# Fetch the metagraph definition pinned to that commit from GitHub.
url = 'https://github.com/dhimmel/integrate/raw/{}/data/metagraph.json'.format(commit)
metagraph = hetio.readwrite.read_metagraph(url)

In [16]:
# Load the metapath records. Later cells read the 'abbreviation' and
# 'dwpc_query' keys from each record.
with open('features/metapaths.json') as read_file:
    metapaths = json.load(read_file)

In [17]:
# Build per-metapath lookup tables keyed by metapath abbreviation:
#   metapath_to_cypher          -> Cypher path template for the metapath
#   metapath_to_source_metaedge -> kind of the metapath's first metaedge
#   metapath_to_target_metaedge -> kind of the metapath's last metaedge
# Each metapath record is also annotated in place with its metapath object,
# metarels and path template.
metapath_to_cypher = {}
metapath_to_source_metaedge = {}
metapath_to_target_metaedge = {}
for metapath in metapaths:
    abbrev = metapath['abbreviation']
    metapath_obj = metagraph.metapath_from_abbrev(abbrev)
    metapath['object'] = metapath_obj
    metapath['metarels'] = hetio.neo4j.metapath_to_metarels(metapath_obj)
    metapath['path_template'] = create_path_template(metapath['metarels'])
    metapath_to_cypher[abbrev] = metapath['path_template']
    metapath_to_source_metaedge[abbrev] = metapath_obj[0].kind
    metapath_to_target_metaedge[abbrev] = metapath_obj[-1].kind

In [18]:
# Map each metapath abbreviation to a Cypher query that returns the individual
# paths for a compound-disease pair along with their share of the prediction.
metapath_to_query = dict()

for metapath in metapaths:
    dwpc_query = metapath['dwpc_query']
    # Reuse everything before the first 'RETURN' of the DWPC query and replace
    # its RETURN clause with a per-path summary. The appended clause computes,
    # for each path, the product of its degrees raised to the power -0.4,
    # divided by the { dwpc } parameter (percent_of_DWPC), and scales that by
    # the { metapath_contribution } parameter (percent_of_prediction).
    # NOTE(review): `path` and `degrees` are presumably bound by the reused
    # dwpc_query prefix, which is not shown here -- confirm.
    pdp_query = dwpc_query.split('RETURN')[0] + \
    '''\
    WITH
    extract(n in nodes(path)| n.name) AS nodes,
    extract(n in nodes(path)| n.identifier) AS node_ids,
    sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -0.4)) / { dwpc } AS percent_of_DWPC
    WITH
    nodes, node_ids,
    percent_of_DWPC,
    percent_of_DWPC * { metapath_contribution } AS percent_of_prediction
    RETURN nodes, percent_of_prediction, percent_of_DWPC, node_ids
    '''
    metapath_to_query[metapath['abbreviation']] = pdp_query

In [19]:
# GraphDatabase is not imported anywhere else in this notebook (the import in
# the first cell is commented out), so this cell raised NameError on a fresh
# kernel. Import it here, preferring the current top-level location and falling
# back to the legacy `neo4j.v1` module used by older 1.x drivers.
try:
    from neo4j import GraphDatabase
except ImportError:
    from neo4j.v1 import GraphDatabase

# Open a Bolt connection and a session used by get_paths for all path queries.
driver = GraphDatabase.driver("bolt://neo4j.het.io")
session = driver.session()

In [20]:
%%time

# Export every observation to its own directory:
#   info.json      -- the observation record
#   paths.tsv      -- all contributing paths (when there are any)
#   highlights.cyp -- a Cypher query for the most influential paths
for elem in obj:
    directory = get_directory(elem['compound_id'], elem['disease_id'])

    path = os.path.join(directory, 'info.json')
    with open(path, 'wt', encoding='utf-8') as write_file:
        json.dump(elem, write_file, indent=2, sort_keys=True)

    # Save path_df; observations with no contributing metapaths have no paths.
    path_df = get_paths(elem)
    if path_df is None:
        continue

    path = os.path.join(directory, 'paths.tsv')
    path_df.drop('node_ids', axis='columns').to_csv(path, sep='\t', index=False, float_format='%.3g')

    # Create a cypher query with the most influential paths.
    # get_summary_cypher returns None when path_df has no rows (every query
    # came back empty). Writing None would raise TypeError and abort this
    # multi-hour loop, so build the query first and skip the file in that case.
    summary_cypher = get_summary_cypher(path_df, 10)
    if summary_cypher is None:
        continue
    path = os.path.join(directory, 'highlights.cyp')
    with open(path, 'wt', encoding='utf-8') as write_file:
        write_file.write(summary_cypher)

CPU times: user 4h 38min 19s, sys: 4min 13s, total: 4h 42min 32s
Wall time: 14h 3min 4s


In [21]:
# Display the last observation processed by the export loop (`elem` is the
# loop variable left over from the previous cell).
elem

OrderedDict([('compound_id', 'DB01624'),
             ('compound_name', 'Zuclopenthixol'),
             ('disease_id', 'DOID:1245'),
             ('disease_name', 'vulva cancer'),
             ('status', 0),
             ('prediction', 0.000932554855109785),
             ('compound_percentile', 0.411764705882353),
             ('disease_percentile', 0.140442132639792),
             ('n_trials', 0.0),
             ('metapath_contribution', OrderedDict())])

In [22]:
# Close the Neo4j session, then the driver that owns the underlying
# connections; closing only the session left the driver open.
session.close()
driver.close()