# Combine all features into a single matrix

In [1]:
import bz2
import os
import configparser
import itertools

import pandas

In [2]:
# Name of the unpermuted hetnet; used below to select its rows from the
# combined matrix and to name its hetnet-specific output directory.
unperm_name = 'rephetio-v2.0'

## Read partitions

In [3]:
# Load the tab-separated partition table and preview its last rows
part_df = pandas.read_csv('data/partitions.tsv', sep='\t')
part_df.tail(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary
31750,rephetio-v2.0_perm-4,DB08906,DOID:9970,0,0
31751,rephetio-v2.0_perm-5,DB08906,DOID:9970,0,0


## Create DWPC matrix

In [4]:
# Load the bzip2-compressed, tab-separated DWPC results and preview the last rows
dwpc_df = pandas.read_csv('data/dwpc.tsv.bz2', sep='\t')
dwpc_df.tail(2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
285766,rephetio-v2.0_perm-3,DB08906,DOID:9970,CbGaDaGaD,316,0.4,0.006338,0.01362
285767,rephetio-v2.0_perm-5,DB08906,DOID:9970,CbGaDaGaD,481,0.4,0.004732,0.01617


In [5]:
# Spread the DWPCs to wide format: one row per (hetnet, compound, disease)
# and one column per metapath, then turn the index levels back into columns.
dwpc_spread_df = dwpc_df.pivot_table(
    values='DWPC',
    index=['hetnet', 'compound_id', 'disease_id'],
    columns='metapath',
).reset_index()
dwpc_spread_df.head(2)

metapath,hetnet,compound_id,disease_id,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0,DB00014,DOID:10283,0,0.008211,0,0.001262,0.055663,0.039983,0.014743,0.011052,0.008672
1,rephetio-v2.0,DB00014,DOID:11934,0,0.001923,0,0.0,0.0,0.028904,0.006647,0.005894,0.02409


In [6]:
# Remove metapaths with missing DWPCs
# See https://github.com/dhimmel/learn/issues/1 for potential cause
# is_complete is a boolean Series indexed by column name: True when the
# column contains no nulls.
is_complete = pandas.isnull(dwpc_spread_df).sum() == 0

# Keep only the columns without missing data
dwpc_spread_df = dwpc_spread_df.loc[:, is_complete]

# The following columns have missing values and were removed.
# The mask is inverted with `~` (logical not); unary minus is an arithmetic
# operator and is not supported on NumPy boolean arrays.
is_complete.index[~is_complete].tolist()

[]

In [7]:
# Persist the wide DWPC table as a bzip2-compressed TSV (5 significant digits)
path = 'data/matrix/dwpc.tsv.bz2'
with bz2.open(path, mode='wt') as write_file:
    dwpc_spread_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

## Calculate Degree features

In [8]:
# The summary files are read from a local checkout of the integrate repository.
# They were previously downloaded from
#   https://github.com/dhimmel/integrate/raw/{commit}/data/summary/
# where {commit} was config['hetnet']['integrate_commit'] read from ../config.ini.
url = "../../integrate/data/summary/metaedge-styles.tsv"

# Lookup used to rename degree columns from metaedge to its abbreviation
metaedge_style_df = pandas.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

url = "../../integrate/data/summary/degrees.xlsx"


def read_degree_df(path, sheet, id_column):
    """
    Read one sheet of the degree workbook as a degree-feature table.

    Parameters
    ----------
    path : str
        Location of the degrees.xlsx workbook.
    sheet : str
        Name of the worksheet to read (e.g. 'Disease' or 'Compound').
    id_column : str
        New name for the sheet's `node_id` column.

    Returns
    -------
    pandas.DataFrame
        The sheet with `node_id` renamed to `id_column`, `node_name` dropped,
        and any column named after a metaedge renamed to its abbreviation.
    """
    # `sheet_name` replaces the `sheetname` keyword, which pandas deprecated
    # and later removed.
    degree_df = pandas.read_excel(path, sheet_name=sheet)
    degree_df = degree_df.rename(columns={'node_id': id_column}).drop('node_name', axis='columns')
    return degree_df.rename(columns=metaedge_to_abbreviation)


disease_degree_df = read_degree_df(url, 'Disease', 'disease_id')
compound_degree_df = read_degree_df(url, 'Compound', 'compound_id')

In [9]:
# Preview the compound degree features (one column per abbreviated metaedge)
compound_degree_df.head(2)

Unnamed: 0,compound_id,CbG,CtD
0,DB00014,2,2
1,DB00035,5,0


In [10]:
# Preview the disease degree features (one column per abbreviated metaedge)
disease_degree_df.head(2)

Unnamed: 0,disease_id,DaG,DtC
0,DOID:0050156,18,0
1,DOID:0050425,12,0


In [11]:
# Save both degree tables as tab-separated files without the row index
degree_outputs = [
    (compound_degree_df, 'data/matrix/compound_degree.tsv'),
    (disease_degree_df, 'data/matrix/disease_degree.tsv'),
]
for degree_df, degree_path in degree_outputs:
    degree_df.to_csv(degree_path, sep='\t', index=False)

## Compute prior dataset

In [12]:
# Load the compound and disease summary tables
compound_df = pandas.read_csv('../summary/compounds.tsv', sep='\t')
disease_df = pandas.read_csv('../summary/diseases.tsv', sep='\t')

# Number of all compound–disease pairs
total_pairs = len(compound_df) * len(disease_df)

# Number of pairs whose compound and disease both have `treats` > 0
treated_compound_count = sum(compound_df.treats > 0)
treated_disease_count = sum(disease_df.treats > 0)
nonzero_prior_pairs = treated_compound_count * treated_disease_count

total_pairs, nonzero_prior_pairs

(186662, 24674)

In [13]:
# Enumerate every compound–disease pair
rows = list(itertools.product(compound_df.compound_id, disease_df.disease_id))
all_pairs_df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id'])

# Left-join the `prior_perm` column onto all pairs (joined on the shared
# compound_id and disease_id columns); pairs without a match are filled with 0.
observation_df = pandas.read_table('../prior/data/observation-prior.tsv')
observation_df = observation_df[['compound_id', 'disease_id', 'prior_perm']]
prior_df = all_pairs_df.merge(observation_df, how='left').fillna(0)

prior_df = prior_df.rename(columns={'prior_perm': 'prior_prob'})
prior_df.head(2)

Unnamed: 0,compound_id,disease_id,prior_prob
0,DB01048,DOID:10652,0.004455
1,DB01048,DOID:9206,0.0


In [14]:
# Sanity check: total of prior_prob over all compound–disease pairs
sum(prior_df.prior_prob)

615.00003228999424

In [15]:
# Fraction of compound–disease pairs with a zero (False) versus nonzero (True) prior
has_nonzero_prior = prior_df.prior_prob > 0
has_nonzero_prior.value_counts(normalize=True)

False    0.867815
True     0.132185
Name: prior_prob, dtype: float64

In [16]:
# Save the prior for every compound–disease pair as a TSV (5 significant digits)
prior_df.to_csv(
    'data/matrix/prior.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)

## Create a single matrix-like dataframe

In [17]:
# Start from the partitions and successively join each feature table.
# Every merge uses pandas' defaults: an inner join on the columns that the
# two frames have in common.
tables_to_join = [
    disease_df.iloc[:, :2],   # first two columns of the disease summary
    compound_df.iloc[:, :2],  # first two columns of the compound summary
    prior_df,
    compound_degree_df,
    disease_degree_df,
    dwpc_spread_df,
]
matrix_df = part_df
for table_df in tables_to_join:
    matrix_df = matrix_df.merge(table_df)

In [18]:
# Preview the combined matrix (partition info, names, prior, degrees and DWPCs)
matrix_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CbG,CtD,...,DtC,CbGaD,CbGaDaGaD,CbGaDtCtD,CbGbCbGaD,CbGbCtD,CtDaGaD,CtDaGbCtD,CtDtCbGaD,CtDtCtD
0,rephetio-v2.0_perm-2,DB00014,DOID:1024,0,1,leprosy,Goserelin,0.006101,2,2,...,2,0,0.000545,0.000299,0.001267,0,0.001198,0.003134,0.0,0.014391
1,rephetio-v2.0_perm-1,DB00136,DOID:1024,0,1,leprosy,Calcitriol,0.002948,4,1,...,2,0,0.00251,8.1e-05,0.000907,0,0.004485,0.002194,0.000736,0.0


In [19]:
# Pair each feature type with the columns that belong to it.
# The degree tables start with their id column and the DWPC table starts
# with hetnet, compound_id and disease_id, so those leading columns are skipped.
feature_groups = [
    ('prior', ['prior_prob']),
    ('degree', compound_degree_df.columns[1:]),
    ('degree', disease_degree_df.columns[1:]),
    ('dwpc', dwpc_spread_df.columns[3:]),
]

# One small frame per group, stacked into a single feature table
feature_df = pandas.concat([
    pandas.DataFrame({'feature_type': feature_type, 'feature': features})
    for feature_type, features in feature_groups
])

In [20]:
# Keep only the rows of the unpermuted hetnet; the hetnet column is then redundant
is_unperm = matrix_df['hetnet'] == unperm_name
unperm_matrix_df = matrix_df[is_unperm].drop('hetnet', axis='columns')

# Mean and standard deviation of every feature on the unpermuted hetnet,
# in the same order as the rows of feature_df
unperm_feature_values = unperm_matrix_df[feature_df.feature]
feature_df['unperm_mean'] = list(unperm_feature_values.mean())
feature_df['unperm_sd'] = list(unperm_feature_values.std())
feature_df.head(2)

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.037868,0.066775
0,CbG,degree,11.54439,12.697428


In [21]:
# Save the feature metadata (type plus unpermuted mean and sd)
feature_df.to_csv('data/matrix/feature-type.tsv', sep='\t', index=False, float_format='%.5g')

# Save the full feature matrix as a bzip2-compressed TSV
path = 'data/matrix/features.tsv.bz2'
with bz2.open(path, mode='wt') as write_file:
    matrix_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

In [22]:
# Save hetnet specific feature files
# Save hetnet specific feature files
directory = os.path.join('data', 'matrix', unperm_name)
# makedirs with exist_ok=True creates missing parent directories and does not
# fail when the directory already exists, avoiding the check-then-create race.
os.makedirs(directory, exist_ok=True)
path = os.path.join(directory, 'features.tsv.bz2')
with bz2.open(path, 'wt') as wf:
    unperm_matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')