In [22]:
import pandas as pd
import numpy as np

from causallearn.search.ConstraintBased.CDNOD import cdnod
from causallearn.utils.GraphUtils import GraphUtils
from causallearn.utils.cit import CIT
from causallearn.utils.PCUtils import SkeletonDiscovery, UCSepset, Meek
from causallearn.utils.PCUtils.BackgroundKnowledge import BackgroundKnowledge
from causallearn.utils.PCUtils.BackgroundKnowledgeOrientUtils import \
    orient_by_background_knowledge

# Feature Pruning via Logistic Regression LASSO
`rpy2` could not download the required R packages from CRAN, so the LASSO step is not run from this notebook; please see the `src/LASSO.R` file for the implementation.

In [2]:
# Load the table of LASSO covariates (see src/LASSO.R for how LASSO was implemented)
lasso_covariates_path = "../src/LASSO_covariates.txt"
LASSO_covariates = pd.read_csv(lasso_covariates_path)
LASSO_covariates

Unnamed: 0,(Intercept),region,Acetitomaculum,Adlercreutzia,Aerococcus,Agathobaculum,Alistipes,Allisonella,Anaerosphaera,Angelakisella,...,Sphingobium,Subdoligranulum,Succinivibrio,TM7a,Tyzzerella,UCG.002,UCG.005,Vicinamibacteraceae,Victivallaceae,Zoogloea
0,-0.121412,0.32459,-5.794715,-0.4663,-2.618474,-0.401735,0.075194,0.414123,0.393944,-1.157109,...,0.335083,-0.027555,-0.01716,1.185681,0.053192,-0.008564,-0.080577,8.561456,7.784932,62.418771


In [3]:
# Show the column labels of the LASSO covariates table (the variable names used by the R fit)
LASSO_covariates.columns

Index(['(Intercept)', 'region', 'Acetitomaculum', 'Adlercreutzia',
       'Aerococcus', 'Agathobaculum', 'Alistipes', 'Allisonella',
       'Anaerosphaera', 'Angelakisella', 'Aureispira', 'Bacteroides',
       'Blautia', 'Brevundimonas', 'Burkholderia',
       'Burkholderia.Caballeronia.Paraburkholderia', 'Butyrivibrio', 'CAG.873',
       'Catenisphaera', 'Chelonobacter', 'Christensenellaceae',
       'Clostridia_UCG.014', 'Clostridia_vadinBB60_group', 'Coprobacillus',
       'Coriobacteriaceae_UCG.002', 'Coriobacteriaceae_UCG.003',
       'Corynebacterium', 'Croceifilum', 'Cryptanaerobacter',
       'Cryptobacterium', 'Desulfovibrio', 'Duganella', 'Escherichia.Shigella',
       'Eubacterium', 'Eubacterium_eligens_group', 'Faecalibacterium',
       'Flintibacter', 'Gottschalkia', 'Halomonas', 'Holdemanella',
       'Incertae_Sedis', 'Irregularibacter', 'Kineothrix', 'Knoellia',
       'Lachnoclostridium', 'Lachnospiraceae_UCG.003', 'Massiliprevotella',
       'Mesoplasma', 'Methanospha

In [7]:
# Load the cleaned data set and report which of its column names already
# contain a '.', so that these can be told apart from names altered by R.
data = pd.read_csv("../data/clean.csv")
print(data.shape)

dotted_columns = [name for name in data.columns if '.' in name]
for name in dotted_columns:
    print(name)

(948, 1130)
env.OPS_17


The only column in the original data frame whose name contains a '.' is 'env.OPS_17', so it is special-cased by name in the check below.

In [8]:
# Map the column names of the LASSO covariates table back to the names used in
# `data`: a '.' in a LASSO column name stands for a '-' in the original name
# (renaming introduced when the table was read into R).
#
# 'env.OPS_17' is the one original column that genuinely contains a '.', so it
# must be kept unchanged. Previously this name fell through the nested `if`
# without being appended at all, silently dropping the column.
ORIGINAL_DOTTED_COLUMNS = {'env.OPS_17'}

LASSO_ogcolumns = [
    col if col in ORIGINAL_DOTTED_COLUMNS else col.replace('.', '-')
    for col in LASSO_covariates.columns
]

In [13]:
# Keep only 'group' plus the LASSO-selected covariates; the first entry of
# LASSO_ogcolumns (the '(Intercept)' term) is skipped because it is not a data column.
selected_columns = ['group', *LASSO_ogcolumns[1:]]
pcos_short = data[selected_columns]
pcos_short

Unnamed: 0,group,region,Acetitomaculum,Adlercreutzia,Aerococcus,Agathobaculum,Alistipes,Allisonella,Anaerosphaera,Angelakisella,...,Sphingobium,Subdoligranulum,Succinivibrio,TM7a,Tyzzerella,UCG-002,UCG-005,Vicinamibacteraceae,Victivallaceae,Zoogloea
0,0,0,0.0,0.033847,0.0,0.0,9.552015,0.000000,0.0,0.000000,...,0.00000,2.705316,0.0,0.000000,0.140222,3.503131,0.834079,0.0,0.0,0.0
1,0,0,0.0,0.000000,0.0,0.0,3.172147,0.000000,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.000000,2.961799,0.041103,0.0,0.0,0.0
2,0,0,0.0,0.000000,0.0,0.0,1.526039,0.000000,0.0,0.007288,...,0.00000,0.421228,0.0,0.000000,0.000000,0.918247,0.033523,0.0,0.0,0.0
3,0,0,0.0,0.000000,0.0,0.0,1.700405,0.000000,0.0,0.000000,...,0.00000,0.893744,0.0,0.000000,0.516385,0.863188,0.200137,0.0,0.0,0.0
4,0,0,0.0,0.034935,0.0,0.0,6.038828,0.000000,0.0,0.000000,...,0.00000,1.664421,0.0,0.007486,0.000000,2.450467,1.766732,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,1,1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.00000,3.224689,0.0,0.000000,0.000000,0.267280,0.009899,0.0,0.0,0.0
944,1,1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.00000,0.554893,0.0,0.000000,0.050800,0.000000,0.000000,0.0,0.0,0.0
945,1,1,0.0,0.155638,0.0,0.0,0.000000,0.426662,0.0,0.000000,...,0.00805,8.090485,0.0,0.000000,0.000000,0.966028,0.037568,0.0,0.0,0.0
946,1,1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.00000,0.104998,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


# Causal Discovery via CD-NOD

In [16]:
# Run CD-NOD using 'group' as the c_indx variable.
# 'group' must be the first column of pcos_short: everything after it is
# treated as the observed variables.
observed_values = pcos_short.iloc[:, 1:].values
group_values = pcos_short[['group']].values
cg = cdnod(observed_values, group_values)

  0%|          | 0/79 [00:00<?, ?it/s]

In [17]:
# Label the nodes in the order the data was given to CD-NOD
# (observed columns first, 'group' last) and save the graph as a PNG.
node_labels = [*pcos_short.columns[1:], pcos_short.columns[0]]
pyd = GraphUtils.to_pydot(cg.G, labels=node_labels)
pyd.write_png('cdnod_nobk.png')

In [23]:
# we can't have edges pointing to the 'region' variable
# rerun CD-NOD with some background knowledge
# essentially, tweak the CD-NOD algorithm so that 'region' gets the same
# "edges only point away" treatment that is applied to the c_indx variable

ALPHA = 0.05     # significance level passed to skeleton discovery
UC_PRIORITY = 2  # priority argument passed to UCSepset.uc_sepset

if 'region' in pcos_short.columns:

    # repeat the first part of CD-NOD: observed columns first, 'group' appended last
    observed_columns = list(pcos_short.columns[1:])
    data_aug = np.concatenate((pcos_short.iloc[:, 1:].values, pcos_short[['group']].values), axis=1)
    indep_test = CIT(data_aug, 'fisherz')
    cg_1 = SkeletonDiscovery.skeleton_discovery(data_aug, ALPHA, indep_test, stable=True)

    # node positions follow the column order of data_aug; look 'region' up by
    # name instead of assuming it is the first observed column
    c_indx_id = data_aug.shape[1] - 1
    region_id = observed_columns.index('region')

    group_node = cg_1.G.nodes[c_indx_id]
    region_node = cg_1.G.nodes[region_id]

    # orient every edge touching the c_indx ('group') node away from it
    for neighbour in cg_1.G.get_adjacent_nodes(group_node):
        cg_1.G.add_directed_edge(group_node, neighbour)

    # add assumption to the 'region' node

    # 'region' and 'group' cannot be connected: forbid the edge in both directions
    bk = BackgroundKnowledge() \
        .add_forbidden_by_node(region_node, group_node) \
        .add_forbidden_by_node(group_node, region_node)

    # all edges from 'region' must be pointing away
    # NOTE(review): if the skeleton contains a region-group edge, this loop
    # re-orients it as region -> group, which the background knowledge above
    # forbids. Confirm whether orient_by_background_knowledge removes an edge
    # that is already directed; if not, it needs explicit handling.
    for neighbour in cg_1.G.get_adjacent_nodes(region_node):
        cg_1.G.add_directed_edge(region_node, neighbour)

    # remaining steps of CD-NOD
    orient_by_background_knowledge(cg_1, bk)

    cg_2 = UCSepset.uc_sepset(cg_1, UC_PRIORITY)
    cg_withbk = Meek.meek(cg_2)

  0%|          | 0/79 [00:00<?, ?it/s]

In [24]:
# Label the nodes in the order the data was given to the algorithm
# (observed columns first, 'group' last) and save the constrained graph as a PNG.
node_labels = [*pcos_short.columns[1:], pcos_short.columns[0]]
pyd = GraphUtils.to_pydot(cg_withbk.G, labels=node_labels)
pyd.write_png('cdnod_withbk.png')

In [27]:
# print the microbes directly linked to 'group'
# Nodes are looked up by name as 'X<k>'; the 'group' node carries the highest
# number, which equals the number of columns in pcos_short.
group_node = cg_withbk.G.get_node(f'X{pcos_short.shape[1]}')
adj_nodes = [
    int(neighbour.get_name().replace('X', ''))
    for neighbour in cg_withbk.G.get_adjacent_nodes(group_node)
]

# the node number is used directly as a position in pcos_short.columns
linked_names = list(pcos_short.columns[adj_nodes])
print('The following microbes are directly linked to the \'group\' node:')
print('\n'.join(linked_names))

The following microbes are directly linked to the 'group' node:
Alistipes
Burkholderia
Holdemanella
Knoellia
Prevotellaceae_NK3B31_group
Ruminococcus
Ruminococcus_gnavus_group


In [28]:
# print the microbes directly linked to 'region'
# Node 'X<k>' is the k-th entry (1-based) of the label list used when drawing
# the graph: the observed columns of pcos_short followed by 'group'.
node_labels = list(pcos_short.columns[1:]) + [pcos_short.columns[0]]

# look the 'region' node up by its position instead of hard-coding 'X1'
region_node = cg_withbk.G.get_node(f"X{node_labels.index('region') + 1}")

# adj_nodes keeps the 1-based node numbers of the neighbours
adj_nodes = []
for node in cg_withbk.G.get_adjacent_nodes(region_node):
    adj_nodes.append(int(node.get_name().replace('X', '')))

# Translate through node_labels rather than indexing pcos_short.columns with the
# node number: the two agree for every observed column, but the last node
# ('group') would be one past the end of pcos_short.columns and raise IndexError.
print('The following microbes are directly linked to the \'region\' node:')
print('\n'.join(node_labels[k - 1] for k in adj_nodes))

The following microbes are directly linked to the 'region' node:
Adlercreutzia
Alistipes
Allisonella
Anaerosphaera
Blautia
Desulfovibrio
Escherichia-Shigella
Eubacterium_eligens_group
Holdemanella
Incertae_Sedis
Ruminococcus
Ruminococcus_gnavus_group
Sellimonas
Solobacterium
Succinivibrio
Tyzzerella
UCG-002


In [30]:
# Column names of pcos_short with one extra trailing label, 'study_site'.
# NOTE(review): presumably the extra entry keeps the highest node number a valid
# position when indexing with adj_nodes; confirm that 'study_site' is the
# intended label for that position.
test_col = [*pcos_short.columns, 'study_site']
test_col

['group',
 'region',
 'Acetitomaculum',
 'Adlercreutzia',
 'Aerococcus',
 'Agathobaculum',
 'Alistipes',
 'Allisonella',
 'Anaerosphaera',
 'Angelakisella',
 'Aureispira',
 'Bacteroides',
 'Blautia',
 'Brevundimonas',
 'Burkholderia',
 'Burkholderia-Caballeronia-Paraburkholderia',
 'Butyrivibrio',
 'CAG-873',
 'Catenisphaera',
 'Chelonobacter',
 'Christensenellaceae',
 'Clostridia_UCG-014',
 'Clostridia_vadinBB60_group',
 'Coprobacillus',
 'Coriobacteriaceae_UCG-002',
 'Coriobacteriaceae_UCG-003',
 'Corynebacterium',
 'Croceifilum',
 'Cryptanaerobacter',
 'Cryptobacterium',
 'Desulfovibrio',
 'Duganella',
 'Escherichia-Shigella',
 'Eubacterium',
 'Eubacterium_eligens_group',
 'Faecalibacterium',
 'Flintibacter',
 'Gottschalkia',
 'Halomonas',
 'Holdemanella',
 'Incertae_Sedis',
 'Irregularibacter',
 'Kineothrix',
 'Knoellia',
 'Lachnoclostridium',
 'Lachnospiraceae_UCG-003',
 'Massiliprevotella',
 'Mesoplasma',
 'Methanosphaera',
 'Mitsuokella',
 'Mobilitalea',
 'NK4A214_group',
 'Oxal

In [37]:
# Look up the labels at the positions stored in adj_nodes
test_col_array = np.array(test_col)
test_col_array[adj_nodes]

array(['Adlercreutzia', 'Alistipes', 'Allisonella', 'Anaerosphaera',
       'Blautia', 'Desulfovibrio', 'Escherichia-Shigella',
       'Eubacterium_eligens_group', 'Holdemanella', 'Incertae_Sedis',
       'Ruminococcus', 'Ruminococcus_gnavus_group', 'Sellimonas',
       'Solobacterium', 'Succinivibrio', 'Tyzzerella', 'UCG-002'],
      dtype='<U42')