In [2]:
from biom import load_table
import pandas as pd

# Load the BIOM table once and derive the two views used below.
biom_table = load_table('./data/AG_100nt_even10k.biom')

# Count matrix as a DataFrame; it is grouped row-wise by taxonomy further down,
# so rows are observations (OTUs) and columns are samples.
otu_table = biom_table.to_dataframe()

# Per-observation metadata; the code below reads its "taxonomy_1" and
# "taxonomy_5" columns.
taxonomy = biom_table.metadata_to_dataframe(axis='observation')

# Genera that are always reported under their own name, regardless of abundance.
genera_of_interest = ["Lactobacillus", "Bifidobacterium", "Clostridium", "Bacteroides", "Prevotella"]


def _rank_name(value, prefix):
    """Return a taxonomy rank string with its rank prefix removed.

    Missing ranks (None, NaN, or any other non-string value) are treated like
    an unassigned rank and yield an empty string instead of raising.
    """
    if not isinstance(value, str):
        return ""
    return value.replace(prefix, "")


def extract_genus(row):
    """Return the reporting label for one taxonomy row.

    Parameters
    ----------
    row : mapping
        Must provide "taxonomy_5" (genus, "g__"-prefixed) and "taxonomy_1"
        (phylum, "p__"-prefixed). Values may be missing (None/NaN).

    Returns
    -------
    str
        The genus name when it is one of ``genera_of_interest``; otherwise the
        phylum name with any square brackets removed. An empty string is
        returned when the phylum is unassigned or missing.
    """
    genus = _rank_name(row["taxonomy_5"], "g__")
    # The genus is compared as-is: a bracketed genus such as "[Prevotella]"
    # does not match and falls through to its phylum.
    if genus in genera_of_interest:
        return genus

    phylum = _rank_name(row["taxonomy_1"], "p__")
    # Brackets are stripped so e.g. "[Thermi]" and "Thermi" share one label.
    return phylum.replace("[", "").replace("]", "")

# Label every observation with its reporting group: a genus of interest,
# otherwise its phylum (see extract_genus).
taxonomy["Genus"] = taxonomy.apply(extract_genus, axis=1)

# Collapse the count table to one row per label; columns remain samples.
genus_abundance = otu_table.groupby(taxonomy["Genus"]).sum()

# A label is "rare" when its counts summed over all samples fall below
# 0.1% of the grand total. Genera of interest are never treated as rare.
total_abundance = genus_abundance.sum(axis=1)
threshold = 0.001 * total_abundance.sum()  # 0.1% of total counts
below_threshold = total_abundance.loc[total_abundance < threshold].index
rare_taxa = [tax for tax in below_threshold if tax not in genera_of_interest]

# Relabel rare taxa as 'Other', then merge rows that now share a label.
genus_abundance = genus_abundance.rename(index={tax: "Other" for tax in rare_taxa})
genus_abundance = genus_abundance.groupby(level=0).sum()

# Normalize to relative abundance: divide each sample (column) by its own total.
genus_abundance = genus_abundance / genus_abundance.sum(axis=0)

# Check results: one row per sample, one column per label.
samples = genus_abundance.T.rename_axis(index='SampleID')
samples.head()

Genus,Actinobacteria,Bacteroides,Bacteroidetes,Bifidobacterium,Clostridium,Cyanobacteria,Firmicutes,Fusobacteria,Lactobacillus,Other,Prevotella,Proteobacteria,Tenericutes,Verrucomicrobia
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7117.1075649,0.3244,0.0013,0.0315,0.0,0.0013,0.0352,0.2629,0.0082,0.0027,0.0461,0.0028,0.2821,0.001,0.0005
5636.1053788,0.2811,0.0,0.0055,0.0003,0.0,0.0134,0.5261,0.0059,0.0008,0.0019,0.0012,0.1633,0.0003,0.0002
5637.1053909,0.2378,0.0004,0.0088,0.0003,0.0001,0.012,0.4414,0.0052,0.0004,0.0029,0.0025,0.2878,0.0,0.0004
5634.1053886,0.4447,0.0005,0.0084,0.0002,0.0,0.0088,0.3892,0.0039,0.0,0.0037,0.0007,0.1399,0.0,0.0
7115.1075661,0.0347,0.0087,0.1735,0.0018,0.0149,0.0289,0.1783,0.0032,0.0166,0.0153,0.0025,0.5196,0.0,0.002


In [3]:
# Merge demographic data onto samples
#
# Sample IDs look numeric (e.g. "000007117.1075649"), so they are read as text:
# parsed as floats they would lose their leading zeros and never match the
# IDs in `samples`. SEX and PREGNANT are read as text so the `.str` accessor
# below always applies. `low_memory=False` makes pandas infer each remaining
# column's dtype from the whole file rather than chunk by chunk; the recorded
# run shows a warning attributed to this read_csv call (presumably the
# mixed-types DtypeWarning -- confirm it is gone on re-run).
ag = pd.read_csv(
    './data/AG.txt',
    sep="\t",
    dtype={'#SampleID': str, 'SEX': str, 'PREGNANT': str},
    low_memory=False,
)
ag = ag.rename(columns={'#SampleID': 'SampleID'})
columns_of_interest = ['SampleID', 'AGE', 'SEX', 'PREGNANT']

# `samples` is indexed by SampleID; the rename only matters if the index is unnamed.
samples_index_df = samples.reset_index().rename(columns={'index': 'SampleID'})

ag_merged = samples_index_df.merge(
    ag[columns_of_interest],  # metadata
    on='SampleID',
    how='left'  # keeps all rows from samples
)

ag_merged = ag_merged.rename(columns={"AGE": "Age", "SEX": "Sex"})

# Remove pregnant participants. Rows with a missing PREGNANT value are kept,
# because NaN != 'yes' evaluates to True.
not_pregnant = ag_merged['PREGNANT'].str.lower() != 'yes'
ag_filtered = ag_merged.loc[not_pregnant].drop(columns=['PREGNANT'])

# Encode sex as 0 (female) / 1 (male) from the lower-cased value, so the row
# filter and the encoding agree regardless of capitalisation. Anything else
# (including missing) becomes NaN.
sex_code = ag_filtered['Sex'].str.lower().map({'female': 0, 'male': 1})

# Non-numeric ages (e.g. free-text placeholders) become NaN.
age_years = pd.to_numeric(ag_filtered['Age'], errors='coerce')

# Remove participants with missing age or sex, then store both as integers.
# Building the frame with .loc/.assign avoids assigning into a filtered slice.
has_age_and_sex = sex_code.notna() & age_years.notna()
ag_filtered = ag_filtered.loc[has_age_and_sex].assign(
    Age=age_years[has_age_and_sex].astype(int),
    Sex=sex_code[has_age_and_sex].astype(int),
)

# Remove children (keep participants aged 18 or older)
ag_filtered = ag_filtered[ag_filtered['Age'] >= 18]

ag_filtered

  ag = pd.read_csv('./data/AG.txt', sep="\t")


Unnamed: 0,SampleID,Actinobacteria,Bacteroides,Bacteroidetes,Bifidobacterium,Clostridium,Cyanobacteria,Firmicutes,Fusobacteria,Lactobacillus,Other,Prevotella,Proteobacteria,Tenericutes,Verrucomicrobia,Age,Sex
0,000007117.1075649,0.3244,0.0013,0.0315,0.0,0.0013,0.0352,0.2629,0.0082,0.0027,0.0461,0.0028,0.2821,0.001,0.0005,59,1
4,000007115.1075661,0.0347,0.0087,0.1735,0.0018,0.0149,0.0289,0.1783,0.0032,0.0166,0.0153,0.0025,0.5196,0.0,0.002,59,1
5,000007123.1075697,0.0493,0.0,0.0153,0.0,0.0002,0.1265,0.0451,0.0025,0.1222,0.0024,0.0007,0.6347,0.0,0.0011,59,1
6,000009713.1130401,0.4052,0.001,0.0091,0.0,0.0002,0.0265,0.4219,0.0026,0.0015,0.0055,0.0062,0.1191,0.0,0.0012,70,0
7,000005598.1130569,0.2394,0.0072,0.0401,0.0001,0.001,0.0255,0.2582,0.0013,0.017,0.0209,0.0024,0.3828,0.0003,0.0038,72,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3100,000003719.1257129,0.0018,0.0096,0.0028,0.3933,0.0,0.0001,0.1794,0.0,0.0016,0.0,0.0004,0.4108,0.0001,0.0001,57,0
3103,000015353.fixed1024,0.083,0.0006,0.3497,0.0,0.0001,0.0,0.0971,0.0029,0.0,0.0019,0.0279,0.436,0.0008,0.0,35,1
3104,000011980.1210764,0.005,0.0008,0.0003,0.0,0.0,0.0001,0.0023,0.0,0.9781,0.0,0.0003,0.0117,0.0014,0.0,52,0
3105,000005567.1131812,0.043,0.0006,0.0,0.0,0.0,0.0,0.0187,0.0003,0.0,0.0001,0.0012,0.9361,0.0,0.0,51,1


In [4]:
# Write the filtered abundance + demographics table to CSV.
# index=False leaves out the row index, which still carries the pre-filter
# row numbers.
ag_filtered.to_csv(path_or_buf="data/combined_ag_data.csv", index=False)