In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from causallearn.search.ConstraintBased.PC import pc
from causallearn.utils.cit import fastkci

In [2]:
# Subject-level table; a later cell encodes it and drops its covariate columns.
clean = pd.read_csv(filepath_or_buffer="../data/clean.csv")

In [2]:
# Raw inputs: tab-separated gut 16S abundance table and the sample list.
gut_microbes = pd.read_csv("../data/raw/gut_16s_abundance.txt", sep="\t")
samplelist = pd.read_csv("../data/raw/S3_SampleList.csv")

# Keep only rows flagged Gut_16S == 1 whose CL4 value is 'Healthy'.
samplelist_gut_healthy = samplelist.loc[
    lambda rows: rows['Gut_16S'].eq(1) & rows['CL4'].eq('Healthy')
]

In [3]:
# Inner join on SampleID: only samples present in both tables survive.
merged_df = gut_microbes.merge(samplelist_gut_healthy, how='inner', on='SampleID')
merged_df

Unnamed: 0,SampleID,phylum_Actinobacteria,phylum_Bacteroidetes,phylum_Firmicutes,phylum_Proteobacteria,phylum_Verrucomicrobia,phylum_unclassified_Bacteria,class_Actinobacteria,class_Bacilli,class_Bacteroidia,...,CL3,CL4,Cytokines,ClinicLabs,Metabolites,Proteins,Transcripts,Gut_16S,Nasal_16S,Num_Type
0,ZOZOW1T-4026,0.018972,0.210724,0.576439,0.048086,0.004521,0.124651,0.018972,0.000042,0.210682,...,,Healthy,1,1,1,1,1,1,0,6
1,ZOZOW1T-44,0.005518,0.481819,0.324148,0.057698,0.000092,0.130191,0.005518,0.000107,0.481819,...,,Healthy,0,0,0,0,0,1,1,2
2,ZOZOW1T-59,0.003718,0.480834,0.470759,0.003606,0.000000,0.036807,0.003718,0.001970,0.480537,...,,Healthy,0,1,1,0,1,1,1,5
3,ZOZOW1T-6021,0.000338,0.637629,0.308550,0.003753,0.002365,0.046652,0.000338,0.000000,0.637629,...,,Healthy,1,1,1,1,1,1,0,6
4,ZOZOW1T-6031,0.001135,0.452997,0.484910,0.013626,0.002988,0.042610,0.001135,0.000000,0.452997,...,,Healthy,1,1,1,1,1,1,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,ZJTKAE3-01,0.013645,0.582611,0.298357,0.104188,0.000128,0.000663,0.013645,0.000969,0.582611,...,,Healthy,0,0,0,0,1,1,1,3
447,ZJTKAE3-04,0.013592,0.276709,0.644225,0.019779,0.044381,0.001072,0.013592,0.001803,0.276709,...,,Healthy,1,1,1,1,1,1,1,7
448,ZJTKAE3-06,0.004519,0.554200,0.420132,0.019591,0.000951,0.000303,0.004519,0.000843,0.554157,...,,Healthy,1,1,1,1,1,1,1,7
449,ZJTKAE3-6011,0.002607,0.776424,0.204213,0.002080,0.013300,0.001318,0.002607,0.000293,0.774578,...,,Healthy,1,1,1,0,0,1,1,5


In [4]:
# Names of every merged_df column containing "genus", in original column order.
genus_col = [name for name in merged_df.columns if "genus" in name]

In [5]:
# Restrict the merged table to the genus-level columns collected in genus_col.
X = merged_df[genus_col]
X

Unnamed: 0,genus_Akkermansia,genus_Alistipes,genus_Anaerotruncus,genus_Anaerovorax,genus_Bacteroides,genus_Barnesiella,genus_Bilophila,genus_Blautia,genus_Butyricicoccus,genus_Butyricimonas,...,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Clostridiales_Incertae.Sedis.XIII,genus_unclassified_Coriobacteriaceae,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
0,0.004521,0.010944,0.000296,0.000211,0.161709,0.003465,0.000042,0.012296,0.000634,0.003085,...,0.000084,0.124651,0.018254,0.000296,0.000423,0.002197,0.008409,0.067143,0.000000,0.057297
1,0.000092,0.077644,0.000917,0.000413,0.320556,0.016140,0.000825,0.028765,0.003577,0.002247,...,0.000015,0.130191,0.022987,0.000550,0.001849,0.017409,0.010561,0.074174,0.000046,0.048894
2,0.000000,0.046771,0.000149,0.000186,0.265494,0.007770,0.000037,0.022419,0.000781,0.001115,...,0.000149,0.036807,0.024092,0.000446,0.001153,0.004536,0.171134,0.027959,0.000000,0.102428
3,0.002365,0.007882,0.000000,0.000300,0.581182,0.001276,0.000037,0.012573,0.000488,0.000751,...,0.000000,0.046652,0.040422,0.000263,0.000188,0.003040,0.008482,0.121603,0.001239,0.028149
4,0.002988,0.010518,0.000359,0.000299,0.400227,0.002988,0.000060,0.007769,0.001195,0.002630,...,0.000000,0.042610,0.012431,0.000120,0.000239,0.001494,0.024861,0.167454,0.002331,0.027789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,0.000128,0.013492,0.000281,0.000179,0.535579,0.000000,0.013518,0.019690,0.001709,0.000025,...,0.000102,0.000663,0.003902,0.000408,0.008952,0.008697,0.000025,0.030657,0.003698,0.013798
447,0.044381,0.008818,0.000390,0.003118,0.258440,0.000049,0.001072,0.161007,0.003118,0.000000,...,0.000000,0.001072,0.003167,0.006577,0.009938,0.064111,0.000244,0.046670,0.002485,0.077118
448,0.000951,0.028284,0.000216,0.000389,0.499816,0.000043,0.000195,0.036826,0.000800,0.000022,...,0.000043,0.000303,0.010769,0.000692,0.001189,0.019094,0.001038,0.043940,0.002703,0.025387
449,0.013300,0.023699,0.000498,0.001611,0.742764,0.000000,0.000029,0.010898,0.000439,0.000000,...,0.000000,0.001318,0.038903,0.003281,0.000205,0.018016,0.000029,0.009784,0.001728,0.021649


In [4]:
def numerical_encoding(data):
    '''
    Replaces the categorical subject columns with integer category codes.

    The encoding is applied to a copy, so the input dataframe is left unchanged.
    Values that have no entry in a column's code table come back as NaN, because
    the lookup is done with ``Series.map``.

    :param: data: subject dataframe with 'Ethnicity', 'Gender' and 'IRIS' columns
    :return: copy of ``data`` in which those three columns hold numeric codes
    '''
    # One lookup table per categorical column; applied in this order.
    category_codes = {
        'Ethnicity': {'C': 0, 'A': 1, 'B': 2, 'H': 3, 'unknown': 4},
        'Gender': {'M': 0, 'F': 1},
        'IRIS': {'IR': 0, 'IS': 1, 'Unknown': 2},
    }
    encoded = data.copy()
    for column, codes in category_codes.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

In [8]:
# Encode the categorical columns, then discard all six subject covariates so
# that only the remaining (non-covariate) columns of `clean` are kept.
data = numerical_encoding(clean).drop(
    labels=["Ethnicity", "Gender", "Adj.age", "BMI", "SSPG", "IRIS"],
    axis="columns",
)
data

Unnamed: 0,genus_Akkermansia,genus_Alistipes,genus_Anaerotruncus,genus_Anaerovorax,genus_Bacteroides,genus_Barnesiella,genus_Bilophila,genus_Blautia,genus_Butyricicoccus,genus_Butyricimonas,...,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Clostridiales_Incertae.Sedis.XIII,genus_unclassified_Coriobacteriaceae,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
0,0.004521,0.010944,0.000296,0.000211,0.161709,0.003465,0.000042,0.012296,0.000634,0.003085,...,0.000084,0.124651,0.018254,0.000296,0.000423,0.002197,0.008409,0.067143,0.000000,0.057297
1,0.000092,0.077644,0.000917,0.000413,0.320556,0.016140,0.000825,0.028765,0.003577,0.002247,...,0.000015,0.130191,0.022987,0.000550,0.001849,0.017409,0.010561,0.074174,0.000046,0.048894
2,0.000000,0.046771,0.000149,0.000186,0.265494,0.007770,0.000037,0.022419,0.000781,0.001115,...,0.000149,0.036807,0.024092,0.000446,0.001153,0.004536,0.171134,0.027959,0.000000,0.102428
3,0.002365,0.007882,0.000000,0.000300,0.581182,0.001276,0.000037,0.012573,0.000488,0.000751,...,0.000000,0.046652,0.040422,0.000263,0.000188,0.003040,0.008482,0.121603,0.001239,0.028149
4,0.002988,0.010518,0.000359,0.000299,0.400227,0.002988,0.000060,0.007769,0.001195,0.002630,...,0.000000,0.042610,0.012431,0.000120,0.000239,0.001494,0.024861,0.167454,0.002331,0.027789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0.000128,0.013492,0.000281,0.000179,0.535579,0.000000,0.013518,0.019690,0.001709,0.000025,...,0.000102,0.000663,0.003902,0.000408,0.008952,0.008697,0.000025,0.030657,0.003698,0.013798
327,0.044381,0.008818,0.000390,0.003118,0.258440,0.000049,0.001072,0.161007,0.003118,0.000000,...,0.000000,0.001072,0.003167,0.006577,0.009938,0.064111,0.000244,0.046670,0.002485,0.077118
328,0.000951,0.028284,0.000216,0.000389,0.499816,0.000043,0.000195,0.036826,0.000800,0.000022,...,0.000043,0.000303,0.010769,0.000692,0.001189,0.019094,0.001038,0.043940,0.002703,0.025387
329,0.013300,0.023699,0.000498,0.001611,0.742764,0.000000,0.000029,0.010898,0.000439,0.000000,...,0.000000,0.001318,0.038903,0.003281,0.000205,0.018016,0.000029,0.009784,0.001728,0.021649


In [9]:
# PC search over the columns of `data` at significance level 0.05 with the
# fastkci independence test. NOTE(review): the recorded run of this cell ended
# in KeyboardInterrupt at 0/45, so expect it to be slow on the full table.
pc_result = pc(data.to_numpy(), alpha=0.05, indep_test=fastkci)

  0%|          | 0/45 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
# Render the learned graph. Rendering shells out to the Graphviz `dot`
# executable; without it on PATH this call fails with
# FileNotFoundError: "dot" not found in path.
# Label each node with its column name from `data` (the table passed to pc)
# so the plot shows genus names instead of generic node names.
pc_result.draw_pydot_graph(labels=list(data.columns))

FileNotFoundError: [Errno 2] "dot" not found in path.