In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys

sys.path.append('../')

from src.etl import *
from src.eda import *
from src.graph import *
from src.sparsify import *

In [None]:
import os


In [5]:
subject_fp, sample_fp, gut_fp = ['../data/raw/S1_Subjects.csv', '../data/raw/S3_SampleList.csv', '../data/raw/gut_16s_abundance.txt']
subjects = pd.read_csv(subject_fp)
samples = pd.read_csv(sample_fp)
gut_16s = pd.read_csv(gut_fp, sep='\t')

# filter out subjects who had an unknown "disease" (IR/IS) status
subjects_IRIS_known = subjects[subjects['IRIS'] != 'Unknown'][['IRIS', 'Gender', 'Ethnicity', 'SubjectID']]

# filter out samples that were not on a "Healthy" visit
samples_healthy = samples[(samples['Gut_16S'] == 1) & (samples['CL4'] == 'Healthy')][['SubjectID', 'SampleID']]

# only get genera, transform into percentages
genera = []
for col in gut_16s.columns:
    if 'genus_' in col:
        genera.append(col)
gut_16s_genera = gut_16s[genera] * 100
gut_16s_genera = pd.concat([gut_16s[['SampleID']], gut_16s_genera], axis=1)


# merge three dataframes
merged_df = pd.merge(gut_16s_genera, samples_healthy, on='SampleID', how='inner')
merged_df = pd.merge(subjects_IRIS_known, merged_df, on='SubjectID', how='inner')
merged_df = merged_df.set_index('SampleID')

In [19]:
raw_fp = '../data/raw/pcosyang2024.xlsx'
pcosyang2024 = pd.read_excel(raw_fp, engine='openpyxl')
pcosyang2024 = pcosyang2024.T
pcosyang2024.columns = pcosyang2024.iloc[0, :]
pcosyang2024 = pcosyang2024.iloc[1:, :]

unclass_col = ['uclassified', 'unclassified', 'unidentified']
pcosyang2024['Unclassified'] = pcosyang2024[unclass_col[0]] + pcosyang2024[unclass_col[1]] + pcosyang2024[unclass_col[2]]
pcosyang2024 = pcosyang2024.drop(columns = unclass_col)

pcosyang2024['region'] = pcosyang2024['region'].map({'Europe': 0, 'Asia': 1})
pcosyang2024['group'] = pcosyang2024['group'].map({'HC': 0, 'PCOS': 1})
pcosyang2024 = pcosyang2024.drop(columns = ['T'])

for col in pcosyang2024.columns[2:]:
    pcosyang2024[col] = pcosyang2024[col].astype(float)

# study assigned by order presented in paper and self-matching sample size numbers & region
study_site = {
    1: [0, 19+24],
    2: [19+24, 19+24+48+73],
    3: [19+24+48+73, 19+24+48+73+12+14],
    4: [19+24+48+73+12+14, 19+24+48+73+12+14+12],
    5: [19+24+48+73+12+14+12, 19+24+48+73+12+14+12+20+20],
    6: [19+24+48+73+12+14+12+20+20, 19+24+48+73+12+14+12+20+20+131+68],
    7: [19+24+48+73+12+14+12+20+20+131+68, 19+24+48+73+12+14+12+20+20+131+68+41+47],
    8: [19+24+48+73+12+14+12+20+20+131+68+41+47, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24],
    9: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98],
    10: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20],
    11: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45],
    12: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45+15+18],
    13: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45+15+18, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45+15+18+15+33],
    14: [19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45+15+18+15+33, 19+24+48+73+12+14+12+20+20+131+68+41+47+24+24+36+98+20+20+37+45+15+18+15+33+17+17]
}

pcosyang2024['study_site'] = np.full(pcosyang2024.shape[0], 0)
for i in range(1, 15):
    pcosyang2024.iloc[study_site[i][0]:study_site[i][1], -1] = i

otu_table = pcosyang2024.loc[:, [x for x in pcosyang2024.columns if (x != 'group') & (x != 'region') & (x !='study_site')]]
otu_table

  pcosyang2024['study_site'] = np.full(pcosyang2024.shape[0], 0)


sample,01D2Z36,0319-6G20,0319-7L14,27F-1492R,37-13,67-14,A2,Abiotrophia,Absconditabacteriales_(SR1),Abyssivirga,...,WWH38,Xanthomonas,Xenorhabdus,Xylanibacillus,Yersinia,Youngiibacter,Zavarzinia,Zoogloea,ZOR0006,Unclassified
SRR4457864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.019341,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.640935
SRR4457873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.630561
SRR4457876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.448002
SRR4457877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.010694,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.459400
SRR4457878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.031991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR24874401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009899,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.423813
SRR24874402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.704500
SRR24874404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.894488
SRR24874405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050805,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111772


In [None]:
pcosyang2024

In [8]:
data = pd.read_csv('../data/t2d/clean.csv', index_col=0)
data

Unnamed: 0,IRIS,Gender,Ethnicity,genus_Akkermansia,genus_Alistipes,genus_Anaerotruncus,genus_Anaerovorax,genus_Bacteroides,genus_Barnesiella,genus_Bilophila,...,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Clostridiales_Incertae.Sedis.XIII,genus_unclassified_Coriobacteriaceae,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
0,0,0,0,0.452125,1.094397,0.029578,0.021127,16.170878,0.346489,0.004230,...,0.008450,12.465140,1.825403,0.029578,0.042255,0.219724,0.840869,6.714274,0.000000,5.729739
1,0,0,0,0.009170,7.764379,0.091705,0.041267,32.055573,1.614013,0.082535,...,0.001530,13.019090,2.298745,0.055023,0.184939,1.740871,1.056139,7.417427,0.004590,4.889419
2,0,0,0,0.000000,4.677102,0.014871,0.018589,26.549429,0.777038,0.003720,...,0.014871,3.680708,2.409191,0.044615,0.115254,0.453582,17.113433,2.795851,0.000000,10.242778
3,0,0,0,0.236451,0.788170,0.000000,0.030025,58.118150,0.127608,0.003750,...,0.000000,4.665215,4.042186,0.026272,0.018766,0.304008,0.848221,12.160336,0.123855,2.814893
4,0,0,0,0.298811,1.051814,0.035857,0.029881,40.022710,0.298811,0.005980,...,0.000000,4.261041,1.243053,0.011952,0.023905,0.149405,2.486105,16.745354,0.233072,2.778940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,1,1,0,0.012752,1.349214,0.028055,0.017854,53.557947,0.000000,1.351765,...,0.010202,0.066313,0.390226,0.040808,0.895225,0.869720,0.002550,3.065701,0.369822,1.379820
327,1,1,0,4.438057,0.881766,0.038973,0.311785,25.844010,0.004870,0.107176,...,0.000000,0.107176,0.316656,0.657670,0.993813,6.411068,0.024358,4.667024,0.248453,7.711794
328,1,1,0,0.095145,2.828414,0.021624,0.038923,49.981620,0.004320,0.019462,...,0.004320,0.030273,1.076873,0.069197,0.118932,1.909396,0.103795,4.393988,0.270300,2.538653
329,1,1,0,1.329974,2.369932,0.049801,0.161120,74.276424,0.000000,0.002930,...,0.000000,0.131826,3.890321,0.328099,0.020506,1.801617,0.002930,0.978439,0.172838,2.164870


In [9]:
filter_rare = pd.read_csv('../data/t2d/filter_rare.csv', index_col=0)
filter_rare

Unnamed: 0,IRIS,Gender,Ethnicity,genus_Akkermansia,genus_Alistipes,genus_Bacteroides,genus_Barnesiella,genus_Bilophila,genus_Blautia,genus_Butyricicoccus,...,genus_Streptococcus,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Coriobacteriaceae,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
0,0,0,0,0.452125,1.094397,16.170878,0.346489,0.004230,1.229612,0.063382,...,0.004230,0.008450,12.465140,1.825403,0.042255,0.219724,0.840869,6.714274,0.000000,5.729739
1,0,0,0,0.009170,7.764379,32.055573,1.614013,0.082535,2.876488,0.357650,...,0.009170,0.001530,13.019090,2.298745,0.184939,1.740871,1.056139,7.417427,0.004590,4.889419
2,0,0,0,0.000000,4.677102,26.549429,0.777038,0.003720,2.241886,0.078076,...,0.066922,0.014871,3.680708,2.409191,0.115254,0.453582,17.113433,2.795851,0.000000,10.242778
3,0,0,0,0.236451,0.788170,58.118150,0.127608,0.003750,1.257319,0.048792,...,0.000000,0.000000,4.665215,4.042186,0.018766,0.304008,0.848221,12.160336,0.123855,2.814893
4,0,0,0,0.298811,1.051814,40.022710,0.298811,0.005980,0.776908,0.119524,...,0.000000,0.000000,4.261041,1.243053,0.023905,0.149405,2.486105,16.745354,0.233072,2.778940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,1,1,0,0.012752,1.349214,53.557947,0.000000,1.351765,1.968986,0.170883,...,0.038257,0.010202,0.066313,0.390226,0.895225,0.869720,0.002550,3.065701,0.369822,1.379820
327,1,1,0,4.438057,0.881766,25.844010,0.004870,0.107176,16.100745,0.311785,...,0.082818,0.000000,0.107176,0.316656,0.993813,6.411068,0.024358,4.667024,0.248453,7.711794
328,1,1,0,0.095145,2.828414,49.981620,0.004320,0.019462,3.682560,0.080009,...,0.084333,0.004320,0.030273,1.076873,0.118932,1.909396,0.103795,4.393988,0.270300,2.538653
329,1,1,0,1.329974,2.369932,74.276424,0.000000,0.002930,1.089759,0.043942,...,0.020506,0.000000,0.131826,3.890321,0.020506,1.801617,0.002930,0.978439,0.172838,2.164870


In [10]:
prune_lasso(filter_rare, '../data/t2d/lasso_covariates.txt')

The pruned dataset has the following dimensions:  (331, 27)


Unnamed: 0,IRIS,Gender,Ethnicity,genus_Akkermansia,genus_Alistipes,genus_Barnesiella,genus_Blautia,genus_Butyricicoccus,genus_Butyricimonas,genus_Clostridium.IV,...,genus_Flavonifractor,genus_Lachnospiracea_incertae_sedis,genus_Odoribacter,genus_Oscillibacter,genus_Parasutterella,genus_Roseburia,genus_Ruminococcus,genus_unclassified_Bacteria,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae
0,0,0,0,0.452125,1.094397,0.346489,1.229612,0.063382,0.308459,1.711316,...,0.042255,7.335418,0.232401,5.307192,0.021127,0.367616,0.000000,12.465140,0.840869,6.714274
1,0,0,0,0.009170,7.764379,1.614013,2.876488,0.357650,0.224678,0.076421,...,0.114632,3.840922,1.048497,2.379751,0.209394,0.623596,0.000000,13.019090,1.056139,7.417427
2,0,0,0,0.000000,4.677102,0.777038,2.241886,0.078076,0.111537,0.014871,...,0.037179,1.676767,0.185894,1.364464,0.018589,0.338328,0.000000,3.680708,17.113433,2.795851
3,0,0,0,0.236451,0.788170,0.127608,1.257319,0.048792,0.075064,0.112596,...,0.015013,3.790722,0.439123,0.270230,0.060051,0.604264,0.000000,4.665215,0.848221,12.160336
4,0,0,0,0.298811,1.051814,0.298811,0.776908,0.119524,0.262953,2.976155,...,0.017929,8.725273,0.442240,3.131537,0.125500,0.998028,0.000000,4.261041,2.486105,16.745354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,1,1,0,0.012752,1.349214,0.000000,1.968986,0.170883,0.002550,0.020404,...,0.494797,4.453173,0.285656,0.841665,4.414915,0.002550,0.015303,0.066313,0.002550,3.065701
327,1,1,0,4.438057,0.881766,0.004870,16.100745,0.311785,0.000000,0.014615,...,1.554051,12.690603,0.019487,1.076631,1.690457,0.004870,0.029230,0.107176,0.024358,4.667024
328,1,1,0,0.095145,2.828414,0.004320,3.682560,0.080009,0.002160,0.041085,...,0.594659,13.212239,0.088658,0.553573,1.889934,0.012974,0.748189,0.030273,0.103795,4.393988
329,1,1,0,1.329974,2.369932,0.000000,1.089759,0.043942,0.000000,0.020506,...,0.345676,2.074057,0.052730,2.618936,0.155261,0.000000,1.868995,0.131826,0.002930,0.978439


In [19]:
disease = 't2d'
group0 = 'IS'
group1 = 'IR'

healthy = pd.read_csv(f'../data/{disease}/{group0}.csv', index_col=0)
diseased = pd.read_csv(f'../data/{disease}/{group1}.csv', index_col=0)

In [26]:
data_sparse_healthy = get_sig_cor_pairs_sparcc(
    f'../data/{disease}/sparcc_{group0}.csv', 
    f'../data/{disease}/sparcc_{group0}_pvals_one_sided.csv', 
    f'../data/{disease}/{group0}.csv')
data_sparse_diseased = get_sig_cor_pairs_sparcc(
    f'../data/{disease}/sparcc_{group1}.csv', 
    f'../data/{disease}/sparcc_{group1}_pvals_one_sided.csv', 
    f'../data/{disease}/{group1}.csv')

In [33]:
len(np.unique(list(data_sparse_healthy['genus_A']) + list(data_sparse_healthy['genus_B'])))

39

In [20]:
data_sparse_healthy, data_sparse_diseased, keep_nodes_healthy, keep_nodes_diseased = get_sig_cor_pairs_glasso(
    f'../data/{disease}/glasso_{group0}.csv', 
    f'../data/{disease}/glasso_{group1}.csv', 
    list(healthy.columns))

In [25]:
diseased[keep_nodes_diseased]

Unnamed: 0,Ethnicity,genus_Akkermansia,genus_Alistipes,genus_Bacteroides,genus_Barnesiella,genus_Blautia,genus_Clostridium.IV,genus_Collinsella,genus_Coprococcus,genus_Dorea,...,genus_Roseburia,genus_Ruminococcus,genus_Veillonella,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,genus_unclassified_Erysipelotrichaceae,genus_unclassified_Firmicutes,genus_unclassified_Lachnospiraceae,genus_unclassified_Porphyromonadaceae,genus_unclassified_Ruminococcaceae
23,1,0.000000,1.613503,53.256197,0.00531,0.711215,0.448490,0.137997,0.281301,0.164535,...,1.889496,0.000000,0.015923,0.013269,0.007960,0.002650,0.153920,1.847036,0.002650,0.337031
24,1,0.000000,0.792812,63.197674,0.00000,2.737844,0.000000,0.882664,0.280127,0.898520,...,0.491543,0.000000,0.021142,0.000000,0.005290,0.015856,0.073996,4.201903,0.005290,0.301268
25,1,0.000000,0.231924,86.796271,0.00000,0.356980,0.000000,0.095498,0.143247,0.197817,...,0.063665,0.002270,0.002270,0.002270,0.000000,0.000000,0.004550,0.793542,0.000000,0.068213
26,1,0.000000,0.779727,89.151038,0.00000,0.060496,0.047052,0.067218,0.114270,0.120992,...,0.134436,0.000000,0.013444,0.000000,0.000000,0.000000,0.026887,0.477247,0.000000,0.282315
27,1,0.000000,0.003740,68.622392,0.00000,4.491063,0.000000,0.000000,0.000000,0.000000,...,5.470795,0.000000,0.650662,0.007480,0.003740,0.014958,0.781542,5.433401,0.000000,0.078528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0,0.012752,1.349214,53.557947,0.00000,1.968986,0.020404,0.007650,0.275454,1.017650,...,0.002550,0.015303,0.010202,0.066313,0.390226,0.869720,0.002550,3.065701,0.369822,1.379820
327,0,4.438057,0.881766,25.844010,0.00487,16.100745,0.014615,0.004870,0.862279,5.353924,...,0.004870,0.029230,0.000000,0.107176,0.316656,6.411068,0.024358,4.667024,0.248453,7.711794
328,0,0.095145,2.828414,49.981620,0.00432,3.682560,0.041085,0.281112,0.423830,1.202292,...,0.012974,0.748189,0.004320,0.030273,1.076873,1.909396,0.103795,4.393988,0.270300,2.538653
329,0,1.329974,2.369932,74.276424,0.00000,1.089759,0.020506,0.216780,0.090813,0.424772,...,0.000000,1.868995,0.000000,0.131826,3.890321,1.801617,0.002930,0.978439,0.172838,2.164870


In [24]:
data_sparse_healthy

Unnamed: 0,genus_A,genus_B,precision
0,Ethnicity,genus_Bacteroides,0.001110
1,Ethnicity,genus_Prevotella,-0.005160
2,genus_Akkermansia,genus_Bacteroides,0.001723
3,genus_Akkermansia,genus_Faecalibacterium,0.000582
4,genus_Akkermansia,genus_unclassified_Ruminococcaceae,-0.005417
...,...,...,...
60,genus_Prevotella,genus_unclassified_Ruminococcaceae,0.007348
61,genus_unclassified_Bacteria,genus_unclassified_Clostridiales,-0.005215
62,genus_unclassified_Bacteria,genus_unclassified_Ruminococcaceae,-0.000520
63,genus_unclassified_Clostridiales,genus_unclassified_Porphyromonadaceae,0.001599


In [22]:
data_sparse_diseased

Unnamed: 0,genus_A,genus_B,precision
0,Ethnicity,genus_Bacteroides,-0.001406
1,genus_Akkermansia,genus_Bacteroides,0.001044
2,genus_Akkermansia,genus_Prevotella,0.000418
3,genus_Alistipes,genus_Bacteroides,0.002433
4,genus_Alistipes,genus_Blautia,0.003847
5,genus_Alistipes,genus_Faecalibacterium,0.001406
6,genus_Alistipes,genus_Lachnospiracea_incertae_sedis,0.00039
7,genus_Alistipes,genus_Prevotella,0.004445
8,genus_Alistipes,genus_unclassified_Ruminococcaceae,-0.004823
9,genus_Bacteroides,genus_Barnesiella,0.003297
