In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
def clean_data_sam_t2d(feature_table_fp, metadata_fp, exclude_countries=('China', 'India')):
    '''
    Returns the otu table and metadata for the T2D meta-analyses studies.

    :param: feature_table_fp: filepath to the tab-separated feature table from
        qiime2; the first line of the file is skipped, the second line is the
        header and the first column holds the feature ids
    :param: metadata_fp: filepath to the tab-separated metadata; it must contain
        the columns 'sample-id', 'bioproject', 'country' and 't2d'
    :param: exclude_countries: iterable of country labels whose samples are
        dropped from the metadata (default: China and India); pass an empty
        tuple or None to keep every country

    :return: pandas DataFrame of the otu table (rows are samples, columns are
        features, values multiplied by 100), pandas DataFrame of the metadata
        indexed by 'sample-id' with the columns 't2d', 'study' and 'region_num'

    Note: only the metadata is filtered by country; the otu table is returned
    for every sample in the feature table.
    '''
    # Make the feature-id column the index *before* transposing. Transposing
    # with the string id column still in the frame would turn every value into
    # dtype ``object``; this way the table stays float64.
    otu_table = pd.read_csv(feature_table_fp, sep="\t", header=1, index_col=0).T * 100

    metadata = pd.read_csv(metadata_fp, sep="\t").set_index('sample-id')

    # Drop the excluded countries. The explicit copy makes the column
    # assignments below operate on an independent frame instead of a slice
    # (avoids SettingWithCopyWarning).
    excluded = list(exclude_countries or [])
    metadata = metadata[~metadata['country'].isin(excluded)].copy()

    # Number the remaining bioprojects 1..n in order of first appearance.
    bioproj_map = {
        bioproj: study_id
        for study_id, bioproj in enumerate(metadata['bioproject'].unique(), start=1)
    }
    metadata['study'] = metadata['bioproject'].map(bioproj_map)

    # africa: sudan
    # east asia: japan, china
    # south asia: pakistan, india
    # south east asia: indonesia, vietnam
    # europe: finland
    region_map = {'Sudan': 'Africa',
                  'Japan': 'East Asia',
                  'China': 'East Asia',
                  'Pakistan1': 'South Asia',
                  'Pakistan2': 'South Asia',
                  'India': 'South Asia',
                  'Indonesia': 'Southeast Asia',
                  'Vietnam': 'Southeast Asia',
                  'Finland': 'Europe'
                 }
    region_num_map = {'Africa': 0,
                      'East Asia': 1,
                      'South Asia': 2,
                      'Southeast Asia': 3,
                      'Europe': 4
                     }
    # Values that are missing from a map become NaN (pandas ``Series.map``).
    metadata['region'] = metadata['country'].map(region_map)
    metadata['region_num'] = metadata['region'].map(region_num_map)
    metadata['t2d'] = metadata['t2d'].map({'No': 0, 'Yes': 1})
    metadata = metadata.loc[:, ['t2d', 'study', 'region_num']]

    return otu_table, metadata

In [62]:
# Build the OTU table and the cleaned metadata from the qiime2 exports.
otu_table, metadata = clean_data_sam_t2d(
    feature_table_fp="../qiime2/relative-frequency/feature-table-genus.tsv",
    metadata_fp="../qiime2/our-metadata.tsv",
)

In [63]:
# Inspect the OTU table returned by clean_data_sam_t2d (one row per sample).
otu_table

#OTU ID,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;__,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Coprobacillaceae;g__Faecalibacillus,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae_A_595427;g__Sutterella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;g__Enterobacter_B_683926,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Holdemanella,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Lachnospira,d__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium_388775,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;__,...,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Lactobacillales;f__Lactobacillaceae;g__Fructilactobacillus,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae_A_595427;g__,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Anaerotignaceae;g__Fimicola,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__SFTJ01,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_729055;g__Serratia_L_726994,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Shuttleworthia,d__Archaea;p__Thermoplasmatota;c__Thermoplasmata_1773;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__Methanomethylophilus,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Acutalibacteraceae;g__Limousia,d__Bacteria;p__Acti
nomycetota;c__Actinomycetes;o__Actinomycetales;f__Cellulomonadaceae;g__Populibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__UBA932;g__Bact-08
SRR16113010,13.057867,4.774962,26.068457,1.011725,0.463313,1.569592,1.456127,0.0,16.064675,0.208018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112701,6.788602,0.0,0.0,0.0,6.736543,0.006941,0.0,0.0,27.5813,0.524069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR9679959,2.311604,0.033023,20.447791,0.574599,0.0,3.810845,0.0,0.184928,14.186646,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR9763111,8.936595,0.0,4.421146,0.0,0.011094,12.148444,0.343929,0.0,29.677706,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR9763104,6.218302,0.0,1.058434,0.0,0.0,9.558986,0.055127,0.022051,3.164278,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR16112703,31.017103,0.0,0.006981,0.0,1.01082,0.013962,0.0,0.0,5.898778,0.568237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112937,10.750951,2.595237,0.0,0.541977,38.631508,4.236802,0.0,2.438897,2.986086,0.062536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112717,8.312378,0.0,33.116667,0.232481,0.009489,0.056934,1.556199,0.075912,2.386488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112648,12.971435,1.455279,15.474947,0.371024,0.172904,15.168762,4.812507,1.494903,0.691618,0.295378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Inspect the cleaned metadata; clean_data_sam_t2d keeps only the
# 't2d', 'study' and 'region_num' columns.
metadata

Unnamed: 0_level_0,t2d,study,region_num
sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRR9679957,1,1,2
SRR9679958,0,1,2
SRR9679959,0,1,2
SRR9679960,1,1,2
SRR9679961,0,1,2
...,...,...,...
SRR12929838,1,7,3
SRR12929837,1,7,3
SRR12929836,1,7,3
SRR12929835,1,7,3


In [53]:
# Combine features and metadata on the sample id (both frames are indexed by
# it); the inner join keeps only samples present in both.
pd.merge(
    otu_table,
    metadata,
    how='inner',
    left_index=True,
    right_index=True,
)

Unnamed: 0,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;__,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Coprobacillaceae;g__Faecalibacillus,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae_A_595427;g__Sutterella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;g__Enterobacter_B_683926,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Holdemanella,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Lachnospira,d__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium_388775,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;__,...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__SFTJ01,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_729055;g__Serratia_L_726994,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Shuttleworthia,d__Archaea;p__Thermoplasmatota;c__Thermoplasmata_1773;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__Methanomethylophilus,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Acutalibacteraceae;g__Limousia,d__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Actinomycetales;f__Cellulomonadaceae;g__Populibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__UBA932;g__Bact-08,t2d,study,region_num
SRR16113010,13.057867,4.774962,26.068457,1.011725,0.463313,1.569592,1.456127,0.0,16.064675,0.208018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,4,1
SRR16112701,6.788602,0.0,0.0,0.0,6.736543,0.006941,0.0,0.0,27.5813,0.524069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,4,1
SRR9679959,2.311604,0.033023,20.447791,0.574599,0.0,3.810845,0.0,0.184928,14.186646,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,2,2
ERR9763111,8.936595,0.0,4.421146,0.0,0.011094,12.148444,0.343929,0.0,29.677706,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,3,2
ERR9763104,6.218302,0.0,1.058434,0.0,0.0,9.558986,0.055127,0.022051,3.164278,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR16112703,31.017103,0.0,0.006981,0.0,1.01082,0.013962,0.0,0.0,5.898778,0.568237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,4,1
SRR16112937,10.750951,2.595237,0.0,0.541977,38.631508,4.236802,0.0,2.438897,2.986086,0.062536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,4,1
SRR16112717,8.312378,0.0,33.116667,0.232481,0.009489,0.056934,1.556199,0.075912,2.386488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,4,1
SRR16112648,12.971435,1.455279,15.474947,0.371024,0.172904,15.168762,4.812507,1.494903,0.691618,0.295378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,4,1


In [47]:
# Stand-alone rebuild of the genus-level OTU table:
#   * header=1 skips the first line of the file and uses the second as header,
#   * index_col=0 makes the feature-id column the index so that the transposed
#     table keeps a float dtype (transposing with the string id column still in
#     the frame would make every value dtype ``object``),
#   * .T puts samples on the rows, and the values are multiplied by 100.
# The former ``rename(columns={'index': 'sample-id'})`` was dropped: the index
# is never reset, so no 'index' column exists and the rename did nothing.
otu_table = pd.read_csv(
    "../qiime2/relative-frequency/feature-table-genus.tsv",
    sep="\t",
    header=1,
    index_col=0,
).T * 100
otu_table

#OTU ID,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;__,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Coprobacillaceae;g__Faecalibacillus,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae_A_595427;g__Sutterella,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;g__Enterobacter_B_683926,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Holdemanella,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Lachnospira,d__Bacteria;p__Actinomycetota;c__Actinomycetes;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium_388775,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_725029;__,...,d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__Lactobacillales;f__Lactobacillaceae;g__Fructilactobacillus,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae_A_595427;g__,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Anaerotignaceae;g__Fimicola,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__SFTJ01,d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Enterobacterales_737866;f__Enterobacteriaceae_A_729055;g__Serratia_L_726994,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Lachnospirales;f__Lachnospiraceae;g__Shuttleworthia,d__Archaea;p__Thermoplasmatota;c__Thermoplasmata_1773;o__Methanomassiliicoccales;f__Methanomethylophilaceae;g__Methanomethylophilus,d__Bacteria;p__Bacillota_A_368345;c__Clostridia_258483;o__Oscillospirales;f__Acutalibacteraceae;g__Limousia,d__Bacteria;p__Acti
nomycetota;c__Actinomycetes;o__Actinomycetales;f__Cellulomonadaceae;g__Populibacterium,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__UBA932;g__Bact-08
SRR16113010,13.057867,4.774962,26.068457,1.011725,0.463313,1.569592,1.456127,0.0,16.064675,0.208018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112701,6.788602,0.0,0.0,0.0,6.736543,0.006941,0.0,0.0,27.5813,0.524069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR9679959,2.311604,0.033023,20.447791,0.574599,0.0,3.810845,0.0,0.184928,14.186646,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR9763111,8.936595,0.0,4.421146,0.0,0.011094,12.148444,0.343929,0.0,29.677706,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR9763104,6.218302,0.0,1.058434,0.0,0.0,9.558986,0.055127,0.022051,3.164278,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR16112703,31.017103,0.0,0.006981,0.0,1.01082,0.013962,0.0,0.0,5.898778,0.568237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112937,10.750951,2.595237,0.0,0.541977,38.631508,4.236802,0.0,2.438897,2.986086,0.062536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112717,8.312378,0.0,33.116667,0.232481,0.009489,0.056934,1.556199,0.075912,2.386488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR16112648,12.971435,1.455279,15.474947,0.371024,0.172904,15.168762,4.812507,1.494903,0.691618,0.295378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Write the OTU table to disk as CSV.
otu_table.to_csv(path_or_buf="../data/sam/otu_table.csv")

In [32]:
# Load the raw, unfiltered metadata table.
# The covariates of interest are bioproject and country.
metadata = pd.read_csv(
    filepath_or_buffer="../qiime2/our-metadata.tsv",
    delimiter="\t",
)
metadata

Unnamed: 0,sample-id,bioproject,sample-alias,t2d,country,age,bmi,sex,metformin,diet-type
0,SRR12599850,PRJNA661673,,Yes,China,,,,,
1,SRR12599861,PRJNA661673,,Yes,China,,,,,
2,SRR12599862,PRJNA661673,,Yes,China,,,,,
3,SRR12599863,PRJNA661673,,Yes,China,,,,,
4,SRR12599864,PRJNA661673,,Yes,China,,,,,
...,...,...,...,...,...,...,...,...,...,...
953,SRR12929838,PRJNA668251,D3,Yes,Vietnam,66.0,,Female,,
954,SRR12929837,PRJNA668251,D4,Yes,Vietnam,52.0,,Female,,
955,SRR12929836,PRJNA668251,D5,Yes,Vietnam,55.0,,Female,,
956,SRR12929835,PRJNA668251,D6,Yes,Vietnam,59.0,,Female,,


In [45]:
# Assign each bioproject a 1-based integer id, in order of first appearance.
bioproj_map = {
    project: study_id
    for study_id, project in enumerate(metadata['bioproject'].unique(), start=1)
}
bioproj_map

{'PRJNA661673': 1,
 'PRJNA554535': 2,
 'PRJEB53017': 3,
 'PRJNA766337': 4,
 'PRJDB9293': 5,
 'PRJNA588353': 6,
 'PRJNA399246': 7,
 'PRJEB53018': 8,
 'PRJNA668251': 9}

In [46]:
# Preview the integer id that each sample's bioproject maps to.
metadata.loc[:, 'bioproject'].map(bioproj_map)

0      1
1      1
2      1
3      1
4      1
      ..
953    9
954    9
955    9
956    9
957    9
Name: bioproject, Length: 958, dtype: int64

In [41]:
# List every bioproject next to its 1-based number.
for study_id, project in enumerate(metadata['bioproject'].unique(), start=1):
    print(study_id, project)

1 PRJNA661673
2 PRJNA554535
3 PRJEB53017
4 PRJNA766337
5 PRJDB9293
6 PRJNA588353
7 PRJNA399246
8 PRJEB53018
9 PRJNA668251


In [37]:
# Number of samples per bioproject, largest first.
metadata.loc[:, 'bioproject'].value_counts()

bioproject
PRJNA766337    497
PRJNA661673    120
PRJEB53017      94
PRJNA554535     60
PRJEB53018      50
PRJDB9293       48
PRJNA588353     48
PRJNA399246     27
PRJNA668251     14
Name: count, dtype: int64

In [38]:
# Number of samples per country label, largest first.
metadata.loc[:, 'country'].value_counts()

country
Japan        497
China        120
Pakistan1     94
Pakistan2     60
Finland       50
Indonesia     48
Sudan         48
India         27
Vietnam       14
Name: count, dtype: int64

In [None]:
# Region grouping of the country labels found in the metadata:
# Africa: Sudan
# East Asia: Japan, China
# South Asia: Pakistan (Pakistan1, Pakistan2), India
# Southeast Asia: Indonesia, Vietnam
# Europe: Finland