In [25]:
import pandas as pd

# NHANES datasets
# Search Variables: https://wwwn.cdc.gov/nchs/nhanes/search/default.aspx
# Questionnaire Data: https://wwwn.cdc.gov/Nchs/Nhanes/Search/DataPage.aspx?Component=Questionnaire&Cycle=2013-2014

# Demographic Data Documentation: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DEMO_H.htm
# RIAGENDR - Gender (1: M, 2: F, .: Missing)
# RIDAGEYR - Age in years at screening (0-79: Range, 80: 80 years of age and over, .: missing)
# RIDEXPRG - Pregnancy status at exam (1: Pregnant, 2: Not pregnant, 3: Not sure, .: missing)

# Dietary Interview - Total Nutrient Intakes, First Day (DR1TOT_H) - Data Documentation:
# https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DR1TOT_H.htm

# DR1TIRON
# Variable Description - Iron (mg)
# Data File Name - Dietary Interview - Total Nutrient Intakes, First Day (DR1TOT_H)
# Values: 0.13 to 130.76 - Range of Values, . - Missing

# DR1TTFAT
# Variable Description - Total fat (gm)
# Data File Name - Dietary Interview - Total Nutrient Intakes, First Day (DR1TOT_H)
# Values: 0 to 548.38	- Range of Values, . - Missing

# DR1TFIBE
# Variable Description - Dietary fiber (gm)
# Data File Name - Dietary Interview - Total Nutrient Intakes, First Day (DR1TOT_H)
# Values: 0 to 136.3 - Range of Values, . - Missing


# Standard Biochemistry Profile (BIOPRO_H) - Data Documentation:
# https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/BIOPRO_H.htm

# LBDSIRSI
# Variable Description - Iron, refrigerated serum (umol/L) (micromoles per litre)
# Data File Name - Standard Biochemistry Profile (BIOPRO_H)
# Values: 0.9 to 99.8 - Range of Values, . - Missing

# LBXSIR
# Variable Description - Iron, refrigerated serum (ug/dL) (micrograms per decilitre)
# Data File Name - Standard Biochemistry Profile (BIOPRO_H)
# Values: 5 to 557 - Range of Values, . - Missing


# Medical Conditions (MCQ_H) - Data Documentation:
# https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/MCQ_H.htm#MCQ053

# MCQ053
# SAS Label - Taking treatment for anemia/past 3 mos
# Variable Description - During the past 3 months, {have you/has SP} been on treatment for anemia (a-nee-me-a), sometimes called "tired blood" or "low blood"? [Include diet, iron pills, iron shots, transfusions as treatment.]
# Data File Name - Medical Conditions (MCQ_H)
# Values: 1: Yes, 2: No, 7: Refused, 9: Don't know, .: Missing

demographic_variables = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDEXPRG"]
diet_variables = ["SEQN", "DR1TIRON", "DR1TTFAT", "DR1TFIBE"]
labs_variables = ["SEQN", "LBDSIRSI", "LBXSIR"]
questionnaire_variables = ["SEQN", "MCQ053"]

demographic_cols = {
    "RIAGENDR": "Gender",
    "RIDAGEYR": "Age",
    "RIDEXPRG": "Pregnant"
}

diet_cols = {
    "DR1TIRON": "Iron_intake_mg",
    "DR1TTFAT": "Total_Fat_intake_g",
    "DR1TFIBE": "Fiber_intake_g"
}

labs_cols = {
    "LBDSIRSI": "Iron_serum_umol/L",
    "LBXSIR": "Iron_serum_ug/dL"
}

questionnaire_cols = {
    "MCQ053": "Treatment_for_anemia"
}

demographic = pd.read_csv("NHANES/demographic.csv", usecols=lambda c: c in demographic_variables).rename(columns=demographic_cols)
diet = pd.read_csv("NHANES/diet.csv", usecols=lambda c: c in diet_variables).rename(columns=diet_cols)
labs = pd.read_csv("NHANES/labs.csv", usecols=lambda c: c in labs_variables).rename(columns=labs_cols)
questionnaire = pd.read_csv("NHANES/questionnaire.csv", usecols=lambda c: c in questionnaire_variables).rename(columns=questionnaire_cols)

df = (
    demographic
    .merge(diet, on="SEQN", how="left")
    .merge(labs, on="SEQN", how="left")
    .merge(questionnaire, on="SEQN", how="left")
)

df

Unnamed: 0,SEQN,Gender,Age,Pregnant,Fiber_intake_g,Total_Fat_intake_g,Iron_intake_mg,Iron_serum_ug/dL,Iron_serum_umol/L,Treatment_for_anemia
0,73557,1,69,,10.8,52.81,8.41,58.0,10.4,2.0
1,73558,1,54,,16.7,124.29,26.88,79.0,14.1,2.0
2,73559,1,72,,9.9,65.97,17.57,98.0,17.6,2.0
3,73560,1,9,,10.6,58.27,14.19,,,2.0
4,73561,2,73,,12.3,55.36,17.72,91.0,16.3,2.0
...,...,...,...,...,...,...,...,...,...,...
10170,83727,1,26,,30.4,193.51,47.01,73.0,13.1,2.0
10171,83728,2,2,,9.3,52.39,6.62,,,1.0
10172,83729,2,42,2.0,25.7,110.30,15.06,49.0,8.8,2.0
10173,83730,1,7,,,,,,,2.0


In [26]:
import pandas as pd

# Calculate range
col_range = df['Fiber_intake_g'].max() - df['Fiber_intake_g'].min()
print(f"Range: {col_range}")

# Calculate standard deviation
std_dev = df['Fiber_intake_g'].std()
print(f"Standard Deviation: {std_dev}")

# Calculate distribution (using describe)
distribution = df['Fiber_intake_g'].describe()
print("\nDistribution:")
print(distribution)

Range: 136.3
Standard Deviation: 10.132655979964545

Distribution:
count    8531.000000
mean       15.278045
std        10.132656
min         0.000000
25%         8.500000
50%        13.100000
75%        19.800000
max       136.300000
Name: Fiber_intake_g, dtype: float64


In [27]:
import pandas as pd

# Calculate range
col_range = df['Total_Fat_intake_g'].max() - df['Total_Fat_intake_g'].min()
print(f"Range: {col_range}")

# Calculate standard deviation
std_dev = df['Total_Fat_intake_g'].std()
print(f"Standard Deviation: {std_dev}")

# Calculate distribution (using describe)
distribution = df['Total_Fat_intake_g'].describe()
print("\nDistribution:")
print(distribution)

Range: 548.38
Standard Deviation: 45.50421029810836

Distribution:
count    8531.00000
mean       75.09880
std        45.50421
min         0.00000
25%        43.71500
50%        66.23000
75%        94.99000
max       548.38000
Name: Total_Fat_intake_g, dtype: float64


In [28]:
import pandas as pd

# Calculate range
col_range = df['Iron_intake_mg'].max() - df['Iron_intake_mg'].min()
print(f"Range: {col_range}")

# Calculate standard deviation
std_dev = df['Iron_intake_mg'].std()
print(f"Standard Deviation: {std_dev}")

# Calculate distribution (using describe)
distribution = df['Iron_intake_mg'].describe()
print("\nDistribution:")
print(distribution)

Range: 130.63
Standard Deviation: 8.544092329884053

Distribution:
count    8531.000000
mean       14.056128
std         8.544092
min         0.130000
25%         8.450000
50%        12.200000
75%        17.600000
max       130.760000
Name: Iron_intake_mg, dtype: float64


In [40]:
import biom

# Load table
table = biom.load_table('HMPv35_100nt_even10k.biom')
otu_table = table.to_dataframe()
taxonomy = table.metadata_to_dataframe('observation')

print("Taxonomy columns:", taxonomy.columns)
# Show first few
# print(taxonomy.head(50))


def extract_genus(row):
    g = row["taxonomy_5"].replace("g__", "")
    if g in ["Lactobacillus", "Bifidobacterium"]:
       return g
    else:
        p = row["taxonomy_1"].replace("p__", "")
        p = p.replace("[", "").replace("]", "")
        return p

taxonomy["Genus"] = taxonomy.apply(extract_genus, axis=1)

# print(taxonomy.head(70))


genus_abundance = otu_table.groupby(taxonomy["Genus"]).sum()

total_abundance = genus_abundance.sum(axis=1)
threshold = 0.001 * total_abundance.sum()  # 0.1% of total counts
rare_taxa = [tax for tax in total_abundance[total_abundance < threshold].index 
             if tax not in ["Lactobacillus", "Bifidobacterium"]]

# Replace rare taxa with 'Other'
genus_abundance = genus_abundance.rename(index=lambda x: "Other" if x in rare_taxa else x)
genus_abundance = genus_abundance.groupby(genus_abundance.index).sum()

# Normalize to relative abundance
genus_abundance = genus_abundance.div(genus_abundance.sum(axis=0), axis=1)

# Check results
print(genus_abundance.head())
print(genus_abundance.index.unique())


Taxonomy columns: Index(['taxonomy_0', 'taxonomy_1', 'taxonomy_2', 'taxonomy_3', 'taxonomy_4',
       'taxonomy_5', 'taxonomy_6'],
      dtype='object')
                 SRS048275.SRX022242  SRS044902.SRX020573  \
Genus                                                       
Actinobacteria                0.0028               0.2723   
Bacteroidetes                 0.0003               0.0001   
Bifidobacterium               0.0001               0.3397   
Firmicutes                    0.9905               0.3761   
Fusobacteria                     0.0                  0.0   

                 SRS056240.SRX020524  SRS054553.SRX020514  \
Genus                                                       
Actinobacteria                0.2689               0.0029   
Bacteroidetes                    0.0               0.0023   
Bifidobacterium                  0.0                  0.0   
Firmicutes                    0.5066               0.9797   
Fusobacteria                     0.0                 

In [41]:
genus_abundance = genus_abundance.T
genus_abundance

Genus,Actinobacteria,Bacteroidetes,Bifidobacterium,Firmicutes,Fusobacteria,Lactobacillus,Other,Proteobacteria,Spirochaetes,Tenericutes
SRS048275.SRX022242,0.0028,0.0003,0.0001,0.9905,0.0,0.0062,0.0,0.0,0.0,0.0001
SRS044902.SRX020573,0.2723,0.0001,0.3397,0.3761,0.0,0.0115,0.0,0.0003,0.0,0.0
SRS056240.SRX020524,0.2689,0.0,0.0,0.5066,0.0,0.0001,0.0,0.2244,0.0,0.0
SRS054553.SRX020514,0.0029,0.0023,0.0,0.9797,0.0,0.0096,0.0003,0.0021,0.0,0.0031
SRS065189.SRX020528,0.0068,0.0028,0.0,0.9802,0.0,0.0101,0.0,0.0001,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
SRS019367.SRX020552,0.9675,0.0,0.0,0.0307,0.0,0.0,0.0,0.0018,0.0,0.0
SRS042989.SRX020552,0.9543,0.0005,0.0001,0.0437,0.0,0.0,0.0,0.0013,0.0001,0.0
SRS066188.SRX022238,0.0001,0.0008,0.0,0.0018,0.0014,0.0,0.0001,0.9958,0.0,0.0
SRS066188.SRX022234,0.0001,0.0001,0.0,0.0013,0.0004,0.0,0.0,0.9981,0.0,0.0


In [42]:
# Print the column names to check availability
print(genus_abundance.columns)

# Adjust accordingly based on available columns

Index(['Actinobacteria', 'Bacteroidetes', 'Bifidobacterium', 'Firmicutes',
       'Fusobacteria', 'Lactobacillus', 'Other', 'Proteobacteria',
       'Spirochaetes', 'Tenericutes'],
      dtype='object', name='Genus')


In [53]:
genus_abundance["Lactobacillus"]

SRS048275.SRX022242    0.0062
SRS044902.SRX020573    0.0115
SRS056240.SRX020524    0.0001
SRS054553.SRX020514    0.0096
SRS065189.SRX020528    0.0101
                        ...  
SRS019367.SRX020552       0.0
SRS042989.SRX020552       0.0
SRS066188.SRX022238       0.0
SRS066188.SRX022234       0.0
SRS066188.SRX022236       0.0
Name: Lactobacillus, Length: 368, dtype: Sparse[float64, np.float64(0.0)]

In [61]:
import numpy as np
import pandas as pd

# Simulate dietary factors based on bacterial abundance
def simulate_dietary_intake(microbiome_sample):
    # Use specific bacteria for each intake
    lacto_abundance = microbiome_sample['Lactobacillus']
    bacteroides_abundance = microbiome_sample['Bacteroidetes']
    firmicutes_abundance = microbiome_sample['Firmicutes']
    
    # Calculate dietary factors using these abundances
    fiber_intake = max(0, (15.28 + lacto_abundance * 20) + np.random.normal(0, 10.13))
    fat_intake = max(0, (75.10 + bacteroides_abundance * 10) + np.random.normal(0, 15))
    iron_intake = (14.06 + firmicutes_abundance * 5) + np.random.normal(0, 2)
    
    return pd.Series({
        'Fiber_intake_g': fiber_intake,
        'Total_Fat_intake_g': fat_intake,
        'Iron_intake_mg': iron_intake
    })

# Apply simulation to each microbiome sample
simulated_diet = genus_abundance.apply(simulate_dietary_intake, axis=1)
simulated_diet


# Combine simulated diet with microbiome data
combined_data = pd.concat([genus_abundance, simulated_diet], axis=1)
combined_data

# # Assuming demographic data is already loaded as `demographic`
# df = df.reset_index(drop=True)

# # # Ensure the same number of samples
# n_samples = min(len(combined_data), len(df))
# combined_data = combined_data.iloc[:n_samples]
# df = df.iloc[:n_samples]

# # # Merge with real demographic data
# final_combined_df = pd.concat([df, combined_data], axis=1)

# # # Display the combined DataFrame
# # print(final_combined_df.head())

Unnamed: 0,Actinobacteria,Bacteroidetes,Bifidobacterium,Firmicutes,Fusobacteria,Lactobacillus,Other,Proteobacteria,Spirochaetes,Tenericutes,Fiber_intake_g,Total_Fat_intake_g,Iron_intake_mg
SRS048275.SRX022242,0.0028,0.0003,0.0001,0.9905,0.0,0.0062,0.0,0.0,0.0,0.0001,13.784223,44.874467,20.309417
SRS044902.SRX020573,0.2723,0.0001,0.3397,0.3761,0.0,0.0115,0.0,0.0003,0.0,0.0,20.473735,69.666804,19.586464
SRS056240.SRX020524,0.2689,0.0,0.0,0.5066,0.0,0.0001,0.0,0.2244,0.0,0.0,0.000000,74.585293,16.737036
SRS054553.SRX020514,0.0029,0.0023,0.0,0.9797,0.0,0.0096,0.0003,0.0021,0.0,0.0031,28.121452,84.864196,15.548578
SRS065189.SRX020528,0.0068,0.0028,0.0,0.9802,0.0,0.0101,0.0,0.0001,0.0,0.0,5.870741,75.551632,19.867257
...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRS019367.SRX020552,0.9675,0.0,0.0,0.0307,0.0,0.0,0.0,0.0018,0.0,0.0,14.855657,79.665816,17.603393
SRS042989.SRX020552,0.9543,0.0005,0.0001,0.0437,0.0,0.0,0.0,0.0013,0.0001,0.0,8.070785,56.265110,16.053421
SRS066188.SRX022238,0.0001,0.0008,0.0,0.0018,0.0014,0.0,0.0001,0.9958,0.0,0.0,17.559707,78.310272,17.444609
SRS066188.SRX022234,0.0001,0.0001,0.0,0.0013,0.0004,0.0,0.0,0.9981,0.0,0.0,4.895642,58.920182,14.018982


In [None]:
### NOT REALLY WORKING

# Ensure combined_data has dietary columns from simulation
simulated_dietary = combined_data[['Fiber_intake_g', 'Total_Fat_intake_g', 'Iron_intake_mg']]

# Extract initial NHANES columns (prevent overwriting during merge)
nhanes_demo_diet = df[['SEQN', 'Gender', 'Age', 'Pregnant', 'Fiber_intake_g', 'Total_Fat_intake_g', 'Iron_intake_mg', 
                       'Iron_serum_ug/dL', 'Iron_serum_umol/L', 'Treatment_for_anemia']].copy()

# Rename simulated columns to avoid conflict
simulated_dietary = simulated_dietary.rename(columns={
    'Fiber_intake_g': 'Sim_Fiber_intake_g',
    'Total_Fat_intake_g': 'Sim_Total_Fat_intake_g',
    'Iron_intake_mg': 'Sim_Iron_intake_mg'
})

# Randomly sample NHANES data
n_samples = min(len(simulated_dietary), len(nhanes_demo_diet))
nhanes_sample = nhanes_demo_diet.sample(n=n_samples, random_state=1).reset_index(drop=True)

# Concatenate NHANES with simulated data
final_combined_df = pd.concat([nhanes_sample, simulated_dietary], axis=1)

# Display the combined DataFrame
print(final_combined_df.head())

      SEQN  Gender   Age  Pregnant  Fiber_intake_g  Total_Fat_intake_g  \
0  73858.0     1.0  52.0       NaN            16.5               73.83   
1  73679.0     2.0   1.0       NaN             7.3               38.43   
2  73785.0     1.0  46.0       NaN            28.1               43.96   
3  73814.0     1.0   0.0       NaN             0.0               32.07   
4  73659.0     2.0  70.0       NaN             6.4               28.74   

   Iron_intake_mg  Iron_serum_ug/dL  Iron_serum_umol/L  Treatment_for_anemia  \
0           19.83             135.0               24.2                   2.0   
1            6.24               NaN                NaN                   2.0   
2           16.04               NaN                NaN                   2.0   
3           10.84               NaN                NaN                   NaN   
4            4.08              55.0                9.9                   2.0   

   Sim_Fiber_intake_g  Sim_Total_Fat_intake_g  Sim_Iron_intake_mg  
0     