In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from causallearn.search.ConstraintBased.PC import pc
from causallearn.utils.GraphUtils import GraphUtils
from causallearn.utils.PCUtils.BackgroundKnowledge import BackgroundKnowledge
from causallearn.graph.GraphNode import GraphNode
import pydot
from IPython.display import Image, display

In [3]:
# Gut microbiome 16S abundances: tab-separated table with one row per sample
# (SampleID) and one column per taxon. Preview the first rows.
gut_microbes = pd.read_csv(
    "../datasets/gut_16s_abundance.txt",
    sep="\t",
)
gut_microbes.head()

Unnamed: 0,SampleID,phylum_Actinobacteria,phylum_Bacteroidetes,phylum_Firmicutes,phylum_Proteobacteria,phylum_Verrucomicrobia,phylum_unclassified_Bacteria,class_Actinobacteria,class_Bacilli,class_Bacteroidia,...,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Clostridiales_Incertae.Sedis.XIII,genus_unclassified_Coriobacteriaceae,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
0,ZOZOW1T-1010,0.000449,0.650866,0.22508,0.007364,0.015835,0.097431,0.000449,0.000138,0.650866,...,0.0,0.097431,0.007883,3.5e-05,0.000415,0.002386,0.015697,0.027452,0.0,0.074128
1,ZOZOW1T-1011,0.000176,0.730518,0.184808,0.0039,0.008186,0.070902,0.000176,7e-05,0.730518,...,3.5e-05,0.070902,0.008713,0.000141,0.000176,0.000808,0.008608,0.011805,0.0,0.069215
2,ZOZOW1T-1012,0.000597,0.178364,0.796861,0.000358,0.002788,0.020911,0.000597,0.000119,0.178364,...,8e-05,0.020911,0.022903,0.000518,0.000398,0.001713,0.000876,0.018362,0.0,0.615351
3,ZOZOW1T-1015,5.8e-05,0.740726,0.226516,0.005977,0.001207,0.024971,5.8e-05,2.9e-05,0.740726,...,2.9e-05,0.024971,0.013046,8.6e-05,2.9e-05,0.004109,0.00773,0.043849,2.9e-05,0.067096
4,ZOZOW1T-1021,0.001113,0.55458,0.364195,0.038358,5.9e-05,0.041286,0.001113,5.9e-05,0.55458,...,0.00041,0.041286,0.006208,0.000117,0.000176,0.000468,0.000996,0.132584,0.015753,0.044624


In [13]:
# Per-subject metadata (one row per SubjectID). Preview the first rows.
subject_info = pd.read_csv(
    "../datasets/subject_file.csv",
)
subject_info.head()

Unnamed: 0,SubjectID,Study,Race,Sex,Age,BMI,SSPG,IR_IS_classification
0,ZIS22OE,HMP,C,F,,,,Unknown
1,ZJBOZ2X,Exercise,C,F,65.3,19.82,,Unknown
2,ZJOSZHK,HMP,C,M,41.43,19.42,,Unknown
3,ZJTKAE3,HMP,C,F,58.65,31.24,162.0,IR
4,ZJXC41N,"HMP, Exercise",B,F,49.69,28.24,75.0,IS


In [6]:
# Derive the subject key for each sample: SubjectID is the part of SampleID
# before the first '-' (e.g. 'ZOZOW1T-1010' -> 'ZOZOW1T').
gut_microbes['SubjectID'] = gut_microbes['SampleID'].str.split('-').str.get(0)

In [19]:
# Report how many distinct subjects each table contains before joining them.
print(
    'Gut Microbes:', gut_microbes['SubjectID'].nunique(), '\n',
    'Subject Info:', subject_info['SubjectID'].nunique(),
)

# Data cleaning: an inner join on SubjectID keeps only the subjects present in
# both tables (their intersection), dropping the few that appear in just one.
merged_df = gut_microbes.merge(subject_info, on='SubjectID', how='inner')
merged_df.head()

Gut Microbes: 96 
 Subject Info: 107


Unnamed: 0,SampleID,phylum_Actinobacteria,phylum_Bacteroidetes,phylum_Firmicutes,phylum_Proteobacteria,phylum_Verrucomicrobia,phylum_unclassified_Bacteria,class_Actinobacteria,class_Bacilli,class_Bacteroidia,...,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae,SubjectID,Study,Race,Sex,Age,BMI,SSPG,IR_IS_classification
0,ZOZOW1T-1010,0.000449,0.650866,0.22508,0.007364,0.015835,0.097431,0.000449,0.000138,0.650866,...,0.0,0.074128,ZOZOW1T,"HMP, Exercise",C,M,59.48,21.47,91.5,IS
1,ZOZOW1T-1011,0.000176,0.730518,0.184808,0.0039,0.008186,0.070902,0.000176,7e-05,0.730518,...,0.0,0.069215,ZOZOW1T,"HMP, Exercise",C,M,59.48,21.47,91.5,IS
2,ZOZOW1T-1012,0.000597,0.178364,0.796861,0.000358,0.002788,0.020911,0.000597,0.000119,0.178364,...,0.0,0.615351,ZOZOW1T,"HMP, Exercise",C,M,59.48,21.47,91.5,IS
3,ZOZOW1T-1015,5.8e-05,0.740726,0.226516,0.005977,0.001207,0.024971,5.8e-05,2.9e-05,0.740726,...,2.9e-05,0.067096,ZOZOW1T,"HMP, Exercise",C,M,59.48,21.47,91.5,IS
4,ZOZOW1T-1021,0.001113,0.55458,0.364195,0.038358,5.9e-05,0.041286,0.001113,5.9e-05,0.55458,...,0.015753,0.044624,ZOZOW1T,"HMP, Exercise",C,M,59.48,21.47,91.5,IS


In [29]:
# Number of abundance columns per taxonomic rank. Author's recorded counts:
# 6 phyla: Actinobacteria, Bacteroidetes, Firmicutes, Proteobacteria, Verrucomicrobia, Unclassified
# 12 classes
# 12 orders
# 21 families (includes unclassified)
# 45 genera (includes unclassified)

taxonomy = ['phylum', 'class', 'order', 'family', 'genus']

# Abundance columns are named "<rank>_<taxon>", so match on the "<rank>_"
# prefix. A plain substring test (`tax in col`) over-counts: "class" is a
# substring of "unclassified" (e.g. phylum_unclassified_Bacteria and every
# genus_unclassified_* column) and of the metadata column IR_IS_classification.
taxonomy_counts = {
    tax: sum(col.startswith(tax + '_') for col in merged_df.columns)
    for tax in taxonomy
}

taxonomy_counts