
Accepted format for input data:

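It should be a CSV file, with rows as samples and columns as features.
The first two rows must be labelled `type` (the node type of each column) and `feature` (the feature name), in that order.
Sample normalization should be done beforehand, since no normalization step is
included in this script.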

Node types should be consistent with the "nointraedge" keys in the conf object.



In [13]:
import os
import networkx as nx
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api

Define Functions 

In [14]:

def nodetype_dict(df):
    """Map every feature name to its node type.

    Both values are read by position, one pair per column: row 0 of ``df``
    supplies the node type and row 1 supplies the feature name.

    Returns:
        dict: ``{feature_name: node_type}``. If the same feature name occurs
        in several columns, the right-most column wins.
    """
    return {df.iloc[1, col]: df.iloc[0, col] for col in range(df.shape[1])}

def exclude_redundant_samples(df, conf):
    """Keep only the samples of the group selected by ``conf['samples']``.

    A sample belongs to a group when the last character of its index label
    equals the code stored for that group in ``conf['sampletype']``.

    Args:
        df: DataFrame whose index holds the sample labels.
        conf: configuration dict providing ``'samples'`` (the selected group,
            or ``'all'``) and ``'sampletype'`` (group name -> one-character
            code).

    Returns:
        A new DataFrame with the rows of the selected group in their original
        order; every row when ``conf['samples']`` is ``'all'``. ``df`` itself
        is not modified.

    Raises:
        KeyError: if the selected group is not a key of ``conf['sampletype']``.
    """
    selected_group = conf['samples']
    if selected_group == 'all':
        # 'all' has no entry in conf['sampletype']; nothing to filter out.
        return df.copy()

    # Look the code up once instead of once per row.
    group_code = conf['sampletype'][selected_group]
    keep = np.asarray([label[-1] == group_code for label in df.index], dtype=bool)
    return df.loc[keep]

def preprocess_data(df):
    """Turn the raw table into a numeric samples-by-features DataFrame.

    The raw table is expected to carry two leading rows labelled ``'type'``
    and ``'feature'``; row 1 (by position) supplies the column names of the
    result, and both rows are removed from the data.

    Args:
        df: raw DataFrame as loaded from the CSV file. It is left unmodified,
            so the function can safely be called again on the same object.

    Returns:
        tuple: ``(data, nodetype)`` where ``data`` is a new float DataFrame
        whose columns are the feature names, and ``nodetype`` is the
        ``{feature_name: node_type}`` dict built by ``nodetype_dict``.

    Raises:
        KeyError: if the ``'type'`` or ``'feature'`` row label is missing.
        ValueError: if a remaining cell cannot be converted to float.
    """
    # create nodetype dictionary
    nodetype = nodetype_dict(df)

    # remove the two annotation rows and set the column names; drop() returns
    # a new frame, so the caller's object is never modified
    feature_names = df.iloc[1, :]
    data = df.drop(index=['type', 'feature'])
    data.columns = feature_names

    # convert every column at once instead of element by element
    data = data.astype(float)

    return data, nodetype

def _pairwise_correlation(values, corr_func):
    """Fill symmetric coefficient and p-value matrices by calling ``corr_func`` on each column pair."""
    n_features = values.shape[1]
    corr = np.empty((n_features, n_features), dtype=float)
    pvalues = np.empty((n_features, n_features), dtype=float)
    for i in range(n_features):
        # start at i: each pair is computed once and mirrored
        for j in range(i, n_features):
            coef, pval = corr_func(values[:, i], values[:, j])
            corr[i, j] = corr[j, i] = coef
            pvalues[i, j] = pvalues[j, i] = pval
    return corr, pvalues


def correlation(df, conf=None):
    """Compute the feature-by-feature correlation and p-value matrices.

    Args:
        df: numeric DataFrame with samples as rows and features as columns.
        conf: configuration dict; ``conf['correlation_function']`` must be
            ``'spearman'`` or ``'pearson'``. When omitted, the module-level
            ``conf`` object is used, which keeps ``correlation(df)`` working.

    Returns:
        tuple: ``(corr, pvalues, fea_names)`` where ``corr`` and ``pvalues``
        are square float arrays indexed in column order, and ``fea_names`` is
        the list of column names. NaN coefficients are replaced by 0 and NaN
        p-values by 1.

    Raises:
        NameError: if ``conf`` is omitted and no module-level ``conf`` exists.
        ValueError: if the correlation function is not supported.
    """
    if conf is None:
        # Backward compatibility with the original one-argument call, which
        # read the notebook-level configuration object.
        try:
            conf = globals()['conf']
        except KeyError:
            raise NameError("name 'conf' is not defined") from None

    method = conf['correlation_function']
    values = np.asarray(df.values, dtype=float)

    if method == 'spearman':
        corr, pvalues = stats.spearmanr(values)
        if np.ndim(corr) == 0:
            # spearmanr returns plain scalars rather than matrices for some
            # inputs (e.g. exactly two feature columns); rebuild the matrices
            # so callers can always index them as [i][j].
            corr, pvalues = _pairwise_correlation(values, stats.spearmanr)
    elif method == 'pearson':
        corr, pvalues = _pairwise_correlation(values, stats.pearsonr)
    else:
        raise ValueError(
            f"unsupported correlation_function {method!r}; expected 'spearman' or 'pearson'"
        )

    # correct nan values (the arrays must be float for nan_to_num to act on them)
    pvalues = np.nan_to_num(np.asarray(pvalues, dtype=float), nan=1)
    corr = np.nan_to_num(np.asarray(corr, dtype=float), nan=0)
    fea_names = df.columns.to_list()
    return corr, pvalues, fea_names

def correct_pvalues(pvalues, conf):
    """Adjust a matrix of pairwise p-values for multiple testing.

    For a square, symmetric matrix (the shape ``correlation`` returns) only
    the entries above the diagonal are corrected and the result is mirrored
    below it. Each feature pair appears twice in such a matrix and the
    diagonal holds self-correlations, so correcting all ``n * n`` entries
    would count ``n * n`` tests instead of the ``n * (n - 1) / 2`` that were
    actually performed. Diagonal entries are returned unchanged.

    Any other input is corrected entry by entry, as before.

    Args:
        pvalues: 2-D array-like of raw p-values.
        conf: configuration dict; ``conf['correction_method']`` is passed to
            ``multipletests`` as ``method`` (e.g. ``'fdr_bh'``, ``'bonferroni'``).

    Returns:
        numpy.ndarray: corrected p-values with the same shape as ``pvalues``.
    """
    method = conf['correction_method']
    pvals = np.asarray(pvalues, dtype=float)

    def adjust(flat_pvalues):
        # Only the corrected p-values (element 1) are used; the alpha-dependent
        # reject mask is ignored.
        return statsmodels.stats.multitest.multipletests(
            flat_pvalues, alpha=0.05, method=method, is_sorted=False, returnsorted=False
        )[1]

    is_symmetric_square = (
        pvals.ndim == 2
        and pvals.shape[0] == pvals.shape[1]
        and np.allclose(pvals, pvals.T, equal_nan=True)
    )
    if not is_symmetric_square:
        return adjust(pvals.ravel()).reshape((len(pvals), -1))

    corrected = pvals.copy()
    rows, cols = np.triu_indices(pvals.shape[0], k=1)
    if rows.size == 0:
        # fewer than two features: there is no pairwise test to correct
        return corrected

    adjusted = adjust(pvals[rows, cols])
    corrected[rows, cols] = adjusted
    corrected[cols, rows] = adjusted
    return corrected

def construct_network(corr,corrected_pvalues,fea_names,nodetype,conf):
    """Build the correlation network and save it as a GEXF file.

    Two features are connected when their corrected p-value is at most
    ``conf['pvalue_th']`` and the absolute correlation is at least
    ``conf['corr_coef_th']``. Pairs of the same node type are skipped when
    ``conf['nointraedge']`` is true for that type. Only features that take
    part in at least one edge become nodes.

    Args:
        corr: square matrix of correlation coefficients, indexable as ``[i][j]``.
        corrected_pvalues: matching matrix of corrected p-values.
        fea_names: feature names in matrix order.
        nodetype: dict mapping feature name -> node type.
        conf: configuration dict; reads ``'network_name'``, ``'pvalue_th'``,
            ``'corr_coef_th'``, ``'correlation_function'`` and ``'nointraedge'``.

    Returns:
        networkx.Graph: nodes carry a ``type`` attribute; edges carry
        ``weight`` (absolute correlation, 2 decimals), ``corr`` (signed
        correlation, 2 decimals), ``pvalue`` (3 decimals) and ``sign``.

    Raises:
        KeyError: if two connected features share a node type that is missing
            from ``conf['nointraedge']``.

    Side effects:
        Creates the ``networks`` directory in the current working directory
        if needed and writes the graph there.
    """
    name = conf['network_name']
    pvalue_th, corr_coef_th = conf['pvalue_th'], conf['corr_coef_th']
    g = nx.Graph()
    for i in range(len(corr)):
        # j starts after i: each unordered pair is visited once, no self loops
        for j in range(i + 1, len(corrected_pvalues)):
            significant = corrected_pvalues[i][j] <= pvalue_th
            strong = abs(corr[i][j]) >= corr_coef_th
            if not (significant and strong):
                continue

            source, target = fea_names[i], fea_names[j]
            source_type, target_type = nodetype[source], nodetype[target]

            # ignore if the nointraedge is True for this nodetype
            if source_type == target_type and conf['nointraedge'][source_type]:
                continue

            # add_node only (re)sets the attribute when the node already exists
            g.add_node(source, type=source_type)
            g.add_node(target, type=target_type)

            g.add_edge(
                source,
                target,
                weight=np.around(abs(corr[i][j]), decimals=2),
                corr=np.around(corr[i][j], decimals=2),
                pvalue=np.around(corrected_pvalues[i][j], decimals=3),
                sign=np.sign(corr[i][j]),
            )

    # create the output directory without a separate existence check
    os.makedirs("networks", exist_ok=True)
    nx.write_gexf(g,f"networks/{name}_pval-{conf['pvalue_th']}-corr{conf['corr_coef_th']}-{conf['correlation_function']}.gexf")
    return g


Run the code below after specifying the options in the conf dictionary object.

In [15]:

# Run configuration. Options written as [choices][i] are selected by editing
# the trailing index.
conf = {
    # CSV file loaded with pd.read_csv
    'datapath':'test.csv',
    # sample group -> last character of the sample labels in that group
    'sampletype':{'Healthy': 'B', 'CVD': 'A'},
    # group to analyse; 'all' skips the sample filtering step
    'samples':['Healthy','CVD','all'][2],
    # minimum absolute correlation required for an edge
    'corr_coef_th': 0.3,
    # maximum corrected p-value allowed for an edge
    'pvalue_th':0.05,
    'correction_method' : 'fdr_bh', ## or 'bonferroni'; passed to multipletests as `method`
    'network_name':'test2', ## prefix of the output file; specify the feature types in the network name, e.g. otu_bile_blood_Healthy
    'correlation_function':['spearman','pearson'][0],
     ## for each node type, specify whether intra-type edges should be ignored: True = ignore intra edges
    'nointraedge':
        {
         'blood': [True,False][0],
         'stool':  [True,False][1],
         'out': [True,False][0],
         'bileblood':  [True,False][0], 
         'bilestool':  [True,False][0],
         'bile':  [True,False][0],
         'sial':  [True,False][0], 
         'SCFA':  [True,False][0],
         'path':  [True,False][0],
         'meta':  [True,False][0],

        }

}

# Load the raw table without a header row so the 'type' and 'feature' rows
# stay in the data; the first CSV column becomes the row index.
df = pd.read_csv(conf['datapath'],header = None, index_col=0)

# build the feature -> node type mapping and convert the measurements to floats
df, nodetype = preprocess_data(df)

# remove redundant samples
if conf['samples']!='all':
    df = exclude_redundant_samples(df, conf)

# calculate correlations (the method is taken from conf['correlation_function'])
corr, pvalues, fea_names = correlation(df)

# Pvalue correction
corrected_pvalues = correct_pvalues(pvalues, conf)

# Create network (also writes the graph to the networks/ directory as .gexf)
g = construct_network(corr,corrected_pvalues,fea_names,nodetype,conf)


In [16]:
# show the preprocessed table used for the correlations
df


feature,Cholesterol,Coprostanol,7alphaHydroxycholesterol,7dehydrocholesterol,betaSitosterol,Campesterol,Cholestanol,Desmosterol,Lanosterol,Lathosterol,...,fRuminococcaceaeg,gOscillospira,gAkkermansia,fRuminococcaceae,fChristensenellaceaeg,fLachnospiraceaeg,fEnterobacteriaceae,gBacteroides,gRuminococcus,gClostridium
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P001B,0.348868,0.499076,7.2e-05,0.000313,0.082372,0.035032,0.016038,0.000494,0.005281,0.005718,...,0.038431,0.008462,0.002521,0.012032,0.000293,0.015901,0.001236,0.174693,0.022861,0.005717
P002B,0.392886,0.287443,8.8e-05,0.000244,0.185754,0.092733,0.010891,0.000436,0.004622,0.005142,...,0.00462,0.024967,0.001629,0.002626,0.000412,0.014894,0.0,0.101612,0.038768,0.002204
P003B,0.587921,0.111228,9.8e-05,0.00029,0.221185,0.041377,0.009597,0.000737,0.005816,0.00586,...,0.024802,0.010618,0.000288,0.006931,0.000221,0.006643,0.0,0.176456,0.017594,0.012445
P004B,0.745734,0.027748,0.000174,0.000542,0.131371,0.067203,0.009842,0.001279,0.002402,0.003256,...,0.003149,0.00482,2.3e-05,0.003213,7.6e-05,0.026828,0.021811,0.271224,0.002937,0.021705
P005B,0.323546,0.478779,8.3e-05,0.000119,0.115116,0.048576,0.014738,0.000327,0.006994,0.004971,...,0.005046,0.00348,7.6e-05,0.002221,8.5e-05,0.029329,0.000375,0.151217,0.002151,0.0247
P006B,0.601248,0.029374,0.00011,0.000739,0.215715,0.112906,0.007986,0.001804,0.004484,0.004062,...,0.000377,0.013376,6.3e-05,0.006912,0.000725,0.018161,0.010887,0.423606,0.008282,0.014496
P007B,0.136815,0.74385,2.6e-05,8.2e-05,0.041045,0.017534,0.016338,0.000137,0.020921,0.019858,...,0.023907,0.035768,0.010816,0.005501,0.001755,0.015697,0.000118,0.112555,0.057204,0.01606
P008B,0.117078,0.598,3.6e-05,0.00072,0.196331,0.032782,0.019993,0.001124,0.013761,0.010195,...,0.056868,0.032487,0.003762,0.015219,0.003887,0.015845,0.000924,0.126942,0.021516,0.005749
P009B,0.358711,0.155118,4.8e-05,0.00033,0.307813,0.127488,0.008071,0.000538,0.003757,0.003708,...,0.0,0.000577,4.4e-05,0.002967,0.0,0.074726,0.001324,0.234034,0.000157,0.016319
P010B,0.218823,0.466268,3.3e-05,0.000164,0.179582,0.103872,0.011818,0.00048,0.008079,0.007156,...,0.025784,0.023195,0.0,0.016422,0.001554,0.011313,0.0,0.039214,0.009391,0.003527
