# Convert features into a matrix

In [1]:
import bz2

import pandas
import numpy
from scipy.special import logit

In [2]:
# Read compound-disease pairs (observations).
# NOTE(review): the stderr fragment recorded for this cell looks like the tail
# of pandas' mixed-type DtypeWarning, which is raised when chunked parsing
# infers different dtypes for different parts of a column -- confirm.
# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype (at the cost of a higher peak memory use while reading).
pair_df = pandas.read_table(
    'features/compound-disease-pairs.tsv.bz2',
    low_memory=False,
)
pair_df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,Q161459,"(+)-1,2-diaminocyclohexane",Q4596888,2-hydroxyglutaric aciduria,,0
1,Q161459,"(+)-1,2-diaminocyclohexane",Q3335660,3-M syndrome,,0


In [3]:
# Load the prior probability table (tab-separated) and peek at its last rows
prior_df = pandas.read_csv(
    '../all-features/data/matrix/prior.tsv',
    sep='\t',
)
prior_df.tail(2)

Unnamed: 0,compound_id,disease_id,prior_prob
36146734,Q11595687,Q23037766,0
36146735,Q11595687,Q970826,0


In [4]:
# Load the node-degree tables for compounds and diseases
def read_degree_table(path):
    """Read a degree table and prefix every column after the first with 'degree_'."""
    table = pandas.read_table(path)
    prefixed = {name: 'degree_' + name for name in table.columns[1:]}
    return table.rename(columns=prefixed)


compound_degree_df = read_degree_table('../all-features/data/matrix/compound_degree.tsv')
disease_degree_df = read_degree_table('../all-features/data/matrix/disease_degree.tsv')

disease_degree_df.head(2)

Unnamed: 0,disease_id,degree_DduftC,degree_DgaG,degree_D<soD,degree_Dso>D
0,Q1002195,0,0,0,1
1,Q1004647,1,0,0,4


In [5]:
# Load the long-format DWPC table: one row per (compound, disease, metapath)
dwpc_df = pandas.read_csv('features/dwpc.tsv.bz2', sep='\t')
dwpc_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,metapath,PC,w,DWPC,seconds
0,hetio-ind,Q1087499,Q1004647,CpiwPpiwCduftD,0,0.4,0,0.07579
1,hetio-ind,Q1188290,Q1004647,CpiwPpiwCduftD,0,0.4,0,0.06617


In [6]:
# Reshape DWPCs from long to wide: one row per compound-disease pair and one
# 'dwpc_'-prefixed column per metapath, with the pair keys as regular columns.
dwpc_mat_df = (
    dwpc_df
    .pivot_table(
        values='DWPC',
        index=['compound_id', 'disease_id'],
        columns='metapath',
    )
    .rename(columns=lambda metapath: 'dwpc_' + metapath)
    .reset_index()
)
dwpc_mat_df.head(2)

Unnamed: 0,compound_id,disease_id,dwpc_CduftDduftCduftD,dwpc_CduftDso>D<soD,dwpc_CduftDso>D<soD<soD,dwpc_CduftDso>DduftCduftD,dwpc_CduftDso>Dso>D<soD,dwpc_CpiwPpiwCduftD,dwpc_CsdiCsdiCduftD
0,Q1002165,Q1004647,0,0,0,0,0,0,0
1,Q1002165,Q1017169,0,0,0,0,0,0,0


### Combine all observation-by-feature matrices

In [7]:
# Columns whose missing values are later filled with 0: the DWPC features only.
# dwpc_mat_df also carries the pair keys (compound_id, disease_id) after
# reset_index(); those are identifiers and must never be zero-filled.
col = dwpc_mat_df.columns[dwpc_mat_df.columns.str.startswith('dwpc_')]

In [16]:
# Build the observation-by-feature matrix: start from the compound-disease
# pairs and attach each feature set. Join keys are spelled out so that an
# unexpected shared column cannot silently change what is joined on.
pair_keys = ['compound_id', 'disease_id']
feature_mat_df = (
    pair_df
    .merge(prior_df, on=pair_keys)
    .merge(compound_degree_df, on='compound_id')
    .merge(disease_degree_df, on='disease_id')
    # Left join: keep every pair even when it has no DWPC row, but do not add
    # rows for pairs that exist only in the DWPC table (an outer join would,
    # leaving them with missing status and prior).
    .merge(dwpc_mat_df, on=pair_keys, how='left')
)
# Pairs without a DWPC row come out of the join as NaN; treat them as 0
feature_mat_df[col] = feature_mat_df[col].fillna(0)

In [9]:
# Preview the first two rows of the combined feature matrix
feature_mat_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,degree_CduftD,degree_CpiwP,degree_CsdiC,...,degree_DgaG,degree_D<soD,degree_Dso>D,dwpc_CduftDduftCduftD,dwpc_CduftDso>D<soD,dwpc_CduftDso>D<soD<soD,dwpc_CduftDso>DduftCduftD,dwpc_CduftDso>Dso>D<soD,dwpc_CpiwPpiwCduftD,dwpc_CsdiCsdiCduftD
0,Q161459,"(+)-1,2-diaminocyclohexane",Q4596888,2-hydroxyglutaric aciduria,,0,0,0,0,0,...,0,2,1,0,0,0,0,0,0,0
1,Q15222640,(+)-3-carene,Q4596888,2-hydroxyglutaric aciduria,,0,0,0,0,0,...,0,2,1,0,0,0,0,0,0,0


In [10]:
# (rows, columns) of the combined feature matrix
feature_mat_df.shape

(36146736, 25)

In [11]:
# Save the untransformed feature matrix as a bz2-compressed TSV
# (no index column; floats written with 4 significant digits)
with bz2.open('features/features.tsv.bz2', 'wt') as features_out:
    feature_mat_df.to_csv(
        features_out,
        sep='\t',
        index=False,
        float_format='%.4g',
    )

## Transform

In [18]:
# Work on a copy so feature_mat_df keeps the untransformed values
trans_df = feature_mat_df.copy()
degree_features = list(trans_df.columns[trans_df.columns.str.startswith('degree_')])
dwpc_features = list(trans_df.columns[trans_df.columns.str.startswith('dwpc_')])

# Transform prior: add its logit immediately after the prior_prob column.
# The position is looked up rather than hard-coded so it stays correct if the
# column layout changes. logit(0) is -inf, so pairs with a zero prior get -inf.
prior_logit_position = trans_df.columns.get_loc('prior_prob') + 1
trans_df.insert(prior_logit_position, 'prior_logit', logit(trans_df.prior_prob))

# Transform degree features
for feature in degree_features:
    trans_df[feature] = numpy.arcsinh(trans_df[feature])

# Transform DWPC features: scale by the feature mean, then arcsinh
for feature in dwpc_features:
    x = trans_df[feature]
    scale = x.mean()
    if scale == 0:
        # A zero mean would make x / mean NaN (0 / 0) for the whole column;
        # leave the feature unscaled instead.
        scale = 1.0
    trans_df[feature] = numpy.arcsinh(x / scale)

# Standardize all features besides the prior (zero mean, unit variance)
for feature in degree_features + dwpc_features:
    x = trans_df[feature]
    centered = x - x.mean()
    spread = x.std()
    # A constant feature has no spread; keep it centered rather than dividing
    # by zero and turning the whole column into NaN.
    trans_df[feature] = centered / spread if spread > 0 else centered

trans_df.head(3)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prior_logit,degree_CduftD,degree_CpiwP,...,degree_DgaG,degree_D<soD,degree_Dso>D,dwpc_CduftDduftCduftD,dwpc_CduftDso>D<soD,dwpc_CduftDso>D<soD<soD,dwpc_CduftDso>DduftCduftD,dwpc_CduftDso>Dso>D<soD,dwpc_CpiwPpiwCduftD,dwpc_CsdiCsdiCduftD
0,Q161459,"(+)-1,2-diaminocyclohexane",Q4596888,2-hydroxyglutaric aciduria,,0,0,-inf,-0.49777,-0.594259,...,-0.142181,1.047884,-0.421521,-0.032861,-0.035169,-0.04985,-0.02129,-0.034444,-0.020873,-0.015785
1,Q15222640,(+)-3-carene,Q4596888,2-hydroxyglutaric aciduria,,0,0,-inf,-0.49777,-0.594259,...,-0.142181,1.047884,-0.421521,-0.032861,-0.035169,-0.04985,-0.02129,-0.034444,-0.020873,-0.015785
2,Q25366173,(+)-alpha-conhydrine,Q4596888,2-hydroxyglutaric aciduria,,0,0,-inf,-0.49777,-0.594259,...,-0.142181,1.047884,-0.421521,-0.032861,-0.035169,-0.04985,-0.02129,-0.034444,-0.020873,-0.015785


In [19]:
# Summary statistics of the transformed features
trans_df.describe()

Unnamed: 0,status,prior_prob,prior_logit,degree_CduftD,degree_CpiwP,degree_CsdiC,degree_C<hpC,degree_C<ioC,degree_Chp>C,degree_Cio>C,...,degree_DgaG,degree_D<soD,degree_Dso>D,dwpc_CduftDduftCduftD,dwpc_CduftDso>D<soD,dwpc_CduftDso>D<soD<soD,dwpc_CduftDso>DduftCduftD,dwpc_CduftDso>Dso>D<soD,dwpc_CpiwPpiwCduftD,dwpc_CsdiCsdiCduftD
count,36146736.0,36146736.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,...,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0,36146740.0
mean,8.2e-05,8.2e-05,-inf,-6.649934e-15,-1.501305e-16,-8.269699e-16,-1.02956e-16,-8.276359e-16,-3.193794e-15,-5.851839e-16,...,2.921622e-16,-3.995747e-15,2.400593e-14,8.348638000000001e-17,4.6362630000000004e-17,1.15688e-16,4.178998e-17,-4.96383e-17,-3.0962410000000005e-17,-6.005601e-17
std,0.009063,0.001509,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,-inf,-0.4977698,-0.5942587,-0.1692346,-0.1133185,-0.1005009,-0.9487562,-0.1473903,...,-0.1421814,-0.5464281,-3.483886,-0.03286104,-0.03516933,-0.04984999,-0.02129031,-0.0344444,-0.02087328,-0.01578532
25%,0.0,0.0,-inf,-0.4977698,-0.5942587,-0.1692346,-0.1133185,-0.1005009,-0.9487562,-0.1473903,...,-0.1421814,-0.5464281,-0.421521,-0.03286104,-0.03516933,-0.04984999,-0.02129031,-0.0344444,-0.02087328,-0.01578532
50%,0.0,0.0,-inf,-0.4977698,-0.5942587,-0.1692346,-0.1133185,-0.1005009,-0.9487562,-0.1473903,...,-0.1421814,-0.5464281,-0.421521,-0.03286104,-0.03516933,-0.04984999,-0.02129031,-0.0344444,-0.02087328,-0.01578532
75%,0.0,0.0,-inf,-0.4977698,0.5480529,-0.1692346,-0.1133185,-0.1005009,0.8613732,-0.1473903,...,-0.1421814,0.4269371,-0.421521,-0.03286104,-0.03516933,-0.04984999,-0.02129031,-0.0344444,-0.02087328,-0.01578532
max,1.0,0.51847,0.07391363,5.092336,4.623245,8.088882,13.15244,20.41815,2.785896,6.784708,...,16.76632,5.449616,5.703209,44.02062,42.77796,31.69092,69.28418,45.16288,65.47996,87.69619


In [20]:
# Save the transformed feature matrix as a bz2-compressed TSV
# (no index column; floats written with 4 significant digits)
with bz2.open('features/transformed-features.tsv.bz2', 'wt') as transformed_out:
    trans_df.to_csv(
        transformed_out,
        sep='\t',
        index=False,
        float_format='%.4g',
    )