In [1]:
import sys
import json
import pandas as pd
import numpy as np
import itertools
import glob, os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2_unweighted
from biom.table import Table

sys.path.append('../')

from src.etl import *
from src.eda import *
from src.graph import *
from src.sparsify import *

In [2]:
# sam-t2d config
disease = 'sam-t2d'
otu_table_fp = '../data/sam-t2d/otu_table.csv'
metadata_fp = '../data/sam-t2d/metadata.csv'
rare_otu_threshold = 1
transformation = 'norm'
disease_col = 't2d'
group0 = 'No'
group1 = 'Yes'

In [3]:
otu_table = pd.read_csv(otu_table_fp, index_col=0)
filtered_otu_table = filter_rare_otus(otu_table, rare_otu_threshold)
metadata = pd.read_csv(metadata_fp, index_col=0)

if disease == 'sam-t2d':
    merged = metadata.merge(filtered_otu_table, how='inner', left_index=True, right_index=True)
else: 
    merged = pd.concat([metadata, filtered_otu_table], axis=1)
healthy = merged[merged[disease_col] == 0] #.drop(columns=[disease_col])
diseased = merged[merged[disease_col] == 1] #.drop(columns=[disease_col])

In [4]:
data_loglasso = prune_lasso(merged, metadata, f'../data/{disease}/lasso_covariates_{transformation}.txt')
samt2dcdnod = run_cdnod(data_loglasso, disease, f'{disease}/cdnod_{transformation}')

The pruned dataset has the following dimensions:  (811, 77)


  0%|          | 0/77 [00:00<?, ?it/s]

The following genera are directly linked to the 't2d' node: 
 region_num
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella
d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium
d__Bacteria;p__Actinomycetota;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Alistipes_A_871400
d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Christensenellales;f__CAG-138;g__Aphodomorpha
d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Pseudobutyrivibrio
d__Bacteria;p__Pseudomonadota;c__Alphaproteobacteria;o__RF32;f__CAG-239;g__CAG-267
d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Monoglobales;f__Monoglobaceae;g__
The following genera are directly linked to the 'study' node: 
 region_num
d__Bacteria;p__Actinomycetota;c__Actinomycetes;o__A

In [26]:
adj_nodes = []
for node in samt2dcdnod.get_adjacent_nodes(samt2dcdnod.get_node('X73')):
    adj_nodes.append(int(node.get_name().replace('X', '')))

data_aug_col = np.array(['placeholder'] + list(data_loglasso.columns))

[x.replace(';', '.') for x in list(data_aug_col[adj_nodes])]

['t2d',
 'd__Bacteria.p__Pseudomonadota.c__Gammaproteobacteria.o__Enterobacterales_737866.f__Enterobacteriaceae_A_725029.g__Escherichia',
 'd__Bacteria.p__Bacillota_A_368345.c__Clostridia_258483.o__Oscillospirales.f__Acutalibacteraceae.g__CAG-488',
 'd__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinifilaceae.g__Butyricimonas']

In [13]:
adj_nodes

[3, 4, 5, 10, 25, 45, 51, 71, 73]

In [11]:
samt2dcdnod.get_node('X1')

<causallearn.graph.GraphNode.GraphNode at 0x15d6567c0>

In [7]:
samt2dcdnod.get_adjacent_nodes('region_num')

KeyError: 'region_num'