# Combine all features into a single matrix

In [1]:
import bz2
import os
import configparser
import itertools

import pandas

In [2]:
# Name of the unpermuted hetnet. It is compared against the `hetnet` column
# further down to select the unpermuted rows, and it also names the
# hetnet-specific output directory.
unperm_name = 'wikidata-v0.1'

## Read partitions

In [3]:
# Load the tab-separated partition table and preview its last two rows
part_df = pandas.read_csv('data/partitions.tsv', sep='\t')
part_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary
161283,wikidata-v0.1_perm-1,Q999805,Q911386,0,1
161284,wikidata-v0.1_perm-2,Q999805,Q954674,1,1


## Create DWPC matrix

In [4]:
# Load the long-format DWPC results (tab-separated; pandas infers the bz2
# compression from the file extension) and preview the last two rows
dwpc_df = pandas.read_csv('data/dwpc.tsv.bz2', sep='\t')
dwpc_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
47579073,wikidata-v0.1_perm-4,Q999805,Q907921,CduftDso>Dso>Dso>D,0,0.4,0,0.0212
47579074,wikidata-v0.1_perm-1,Q999805,Q911386,CduftDso>Dso>Dso>D,0,0.4,0,0.01918


In [5]:
# Spread the DWPC values to wide format: one row per (hetnet, compound, disease)
# and one column per metapath. pivot_table's default aggregation (mean) applies
# if a key/metapath combination occurs more than once.
dwpc_spread_df = dwpc_df.pivot_table(
    values='DWPC',
    index=['hetnet', 'compound_id', 'disease_id'],
    columns='metapath',
).reset_index()
dwpc_spread_df.head(2)

metapath,hetnet,compound_id,disease_id,C<hpC<hpC<hpCduftD,C<hpC<hpC<ioCduftD,C<hpC<hpCduftD,C<hpC<hpCduftD<soD,C<hpC<hpCduftDso>D,C<hpC<hpChp>CduftD,C<hpC<hpCio>CduftD,...,CsdiCpiwPeGgaD,CsdiCpiwPpiwCduftD,CsdiCsdiC<hpCduftD,CsdiCsdiC<ioCduftD,CsdiCsdiCduftD,CsdiCsdiCduftD<soD,CsdiCsdiCduftDso>D,CsdiCsdiChp>CduftD,CsdiCsdiCio>CduftD,CsdiCsdiCsdiCduftD
0,wikidata-v0.1,Q10354103,Q1435822,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.001942
1,wikidata-v0.1,Q10354103,Q177190,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [6]:
# Flag the columns (metapaths) that contain no missing DWPC values.
# See https://github.com/dhimmel/learn/issues/1 for potential cause
is_complete = dwpc_spread_df.notnull().all()

# Keep only the fully populated columns
dwpc_spread_df = dwpc_spread_df.loc[:, is_complete]

# Columns that had missing values and were therefore dropped
# (`~` is the conventional way to invert a boolean mask)
is_complete.index[~is_complete].tolist()

[]

In [7]:
# Write the wide DWPC matrix as bz2-compressed TSV (floats with 5 significant digits).
path = 'data/matrix/dwpc.tsv.bz2'
# This is the first write into the output directory; create it if it is
# missing so the notebook also runs on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)
with bz2.open(path, 'wt') as wf:
    dwpc_spread_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')

## Calculate degree features

In [8]:

# Map each metaedge to its abbreviation; degree columns are renamed with it
metaedge_style_df = pandas.read_table('../metaedge-styles.tsv')
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

file = '../degrees.xlsx'


def read_degree_df(path, sheet, id_column):
    """
    Read one sheet of the degree workbook as a per-node degree table.

    The `node_id` column is renamed to `id_column`, the `node_name` column is
    dropped, and any column whose name is a key of `metaedge_to_abbreviation`
    is renamed to the corresponding abbreviation.
    """
    # `sheet_name` replaces the `sheetname` keyword, which pandas deprecated
    # in 0.21 and has since removed.
    degree_df = pandas.read_excel(path, sheet_name=sheet)
    degree_df = degree_df.rename(columns={'node_id': id_column}).drop('node_name', axis='columns')
    return degree_df.rename(columns=metaedge_to_abbreviation)


disease_degree_df = read_degree_df(file, 'Disease', 'disease_id')
compound_degree_df = read_degree_df(file, 'Compound', 'compound_id')

In [9]:
# Preview the first two rows of the compound degree table
compound_degree_df.head(2)

Unnamed: 0,compound_id,CduftD,CpiwP,CsdiC,C<hpC,C<ioC,Chp>C,Cio>C
0,Q1002165,0,1,0,0,0,0,0
1,Q1014287,0,0,0,0,0,1,0


In [10]:
# Preview the first two rows of the disease degree table
disease_degree_df.head(2)

Unnamed: 0,disease_id,DduftC,DgaG,D<soD,Dso>D
0,Q1002195,0,0,0,1
1,Q1004647,1,0,0,4


In [11]:
# Persist both degree tables as tab-separated files, omitting the row index
for degree_df, out_path in [
    (compound_degree_df, 'data/matrix/compound_degree.tsv'),
    (disease_degree_df, 'data/matrix/disease_degree.tsv'),
]:
    degree_df.to_csv(out_path, sep='\t', index=False)

## Compute prior dataset

In [12]:
# Read the compound and disease summary tables (each has a `treats` column)
compound_df = pandas.read_table('../summary/compounds.tsv')
disease_df = pandas.read_table('../summary/diseases.tsv')

# Number of compound-disease pairs overall
total_pairs = len(compound_df) * len(disease_df)

# Number of pairs where both the compound and the disease have `treats` > 0.
# Series.sum() counts the True values vectorized instead of iterating the
# Series element by element with the builtin sum().
nonzero_prior_pairs = (compound_df.treats > 0).sum() * (disease_df.treats > 0).sum()
total_pairs, nonzero_prior_pairs

(36146736, 637894)

In [13]:
# Enumerate every compound-disease pair, attach the `prior_perm` value where
# the observation-prior table has one, and fill pairs without a match with 0.
pair_keys = ['compound_id', 'disease_id']
rows = list(itertools.product(compound_df.compound_id, disease_df.disease_id))
observation_prior_df = pandas.read_table('../prior/data/observation-prior.tsv')
prior_df = (
    pandas.DataFrame(rows, columns=pair_keys)
    .merge(observation_prior_df[pair_keys + ['prior_perm']], on=pair_keys, how='left')
    .fillna(0)
    .rename(columns={'prior_perm': 'prior_prob'})
)
prior_df.head(2)

Unnamed: 0,compound_id,disease_id,prior_prob
0,Q161459,Q4596888,0
1,Q161459,Q3335660,0


In [14]:
# Total of the prior probabilities over all compound-disease pairs.
# Series.sum() is vectorized; the builtin sum() would loop over every row in
# Python. The trailing digits can differ slightly from a naive running sum.
prior_df.prior_prob.sum()

2968.9997121526762

In [15]:
# Fraction of pairs with a zero versus a nonzero prior
prior_df.prior_prob.gt(0).value_counts(normalize=True)

False    0.982353
True     0.017647
Name: prior_prob, dtype: float64

In [16]:
# Save the prior table as TSV without the row index; floats are written
# with 5 significant digits
prior_df.to_csv('data/matrix/prior.tsv', index=False, sep='\t', float_format='%.5g')

## Create a single matrix-like dataframe

In [17]:
# Assemble the feature matrix through successive merges. Each merge uses
# pandas' defaults: an inner join on the columns the two frames share.
matrix_df = (
    part_df
    .merge(disease_df.iloc[:, :2])    # first two columns of the disease summary
    .merge(compound_df.iloc[:, :2])   # first two columns of the compound summary
    .merge(prior_df)
    .merge(compound_degree_df)
    .merge(disease_degree_df)
    .merge(dwpc_spread_df)
)

In [18]:
# Preview the first two rows of the combined matrix
matrix_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,disease_name,compound_name,prior_prob,CduftD,CpiwP,...,CsdiCpiwPeGgaD,CsdiCpiwPpiwCduftD,CsdiCsdiC<hpCduftD,CsdiCsdiC<ioCduftD,CsdiCsdiCduftD,CsdiCsdiCduftD<soD,CsdiCsdiCduftDso>D,CsdiCsdiChp>CduftD,CsdiCsdiCio>CduftD,CsdiCsdiCsdiCduftD
0,wikidata-v0.1_perm-1,Q10354103,Q1048084,1,1,opisthorchiasis,probucol,0.001999,3,1,...,0,0,0,0,0.010103,0,0,0,0,0.014973
1,wikidata-v0.1_perm-5,Q118551,Q1048084,1,1,opisthorchiasis,clarithromycin,0.006174,9,0,...,0,0,0,0,0.0,0,0,0,0,0.0


In [19]:
# Tabulate every feature column together with its feature type. The leading
# identifier columns of the degree and DWPC frames are skipped by the slices.
df_creators = [
    {'feature_type': feature_type, 'feature': features}
    for feature_type, features in (
        ('prior', ['prior_prob']),
        ('degree', compound_degree_df.columns[1:]),
        ('degree', disease_degree_df.columns[1:]),
        ('dwpc', dwpc_spread_df.columns[3:]),
    )
]
feature_df = pandas.concat([pandas.DataFrame(creator) for creator in df_creators])

In [20]:
# Keep only the rows of the unpermuted hetnet; the constant `hetnet` column is dropped
is_unperm = matrix_df['hetnet'] == unperm_name
unperm_matrix_df = matrix_df.loc[is_unperm].drop('hetnet', axis='columns')

# Per-feature mean and standard deviation on the unpermuted hetnet, in the
# same order as the rows of feature_df
unperm_feature_values = unperm_matrix_df[feature_df.feature]
feature_df['unperm_mean'] = unperm_feature_values.mean().tolist()
feature_df['unperm_sd'] = unperm_feature_values.std().tolist()
feature_df.head(2)

Unnamed: 0,feature,feature_type,unperm_mean,unperm_sd
0,prior_prob,prior,0.008933,0.020257
0,CduftD,degree,3.366723,3.1916


In [21]:
# Save the feature/type summary as plain TSV
feature_df.to_csv('data/matrix/feature-type.tsv', sep='\t', index=False, float_format='%.5g')

# Save the combined matrix (rows for every hetnet) as bz2-compressed TSV
path = 'data/matrix/features.tsv.bz2'
with bz2.open(path, mode='wt') as write_file:
    matrix_df.to_csv(write_file, sep='\t', index=False, float_format='%.5g')

In [22]:
# Save hetnet specific feature files
directory = os.path.join('data', 'matrix', unperm_name)
# makedirs(exist_ok=True) creates any missing parent directories and avoids
# the race between checking for the directory and creating it.
os.makedirs(directory, exist_ok=True)
path = os.path.join(directory, 'features.tsv.bz2')
# Write the unpermuted-hetnet matrix as bz2-compressed TSV
with bz2.open(path, 'wt') as wf:
    unperm_matrix_df.to_csv(wf, index=False, sep='\t', float_format='%.5g')