In [1]:
import pandas as pd
diet_cols = {
    "SEQN": "SEQN",
    "DR1TIRON": "Iron_intake_mg",
    "DR1TTFAT": "Fat_intake_g",
    "DR1TFIBE": "Fiber_intake_g"
}

labs_cols = {
    "SEQN": "SEQN",
    "LBXSIR": "Iron_serum_ug/dL"
}

diet = pd.read_csv("data/NHANES/diet.csv", usecols=lambda c: c in diet_cols.keys()).rename(columns=diet_cols)
labs = pd.read_csv("data/NHANES/labs.csv", usecols=lambda c: c in labs_cols.keys()).rename(columns=labs_cols)

nhanes = diet.merge(labs, on="SEQN", how="left")
nhanes

Unnamed: 0,SEQN,Fiber_intake_g,Fat_intake_g,Iron_intake_mg,Iron_serum_ug/dL
0,73557,10.8,52.81,8.41,58.0
1,73558,16.7,124.29,26.88,79.0
2,73559,9.9,65.97,17.57,98.0
3,73560,10.6,58.27,14.19,
4,73561,12.3,55.36,17.72,91.0
...,...,...,...,...,...
9808,83727,30.4,193.51,47.01,73.0
9809,83728,9.3,52.39,6.62,
9810,83729,25.7,110.30,15.06,49.0
9811,83730,,,,


In [2]:
fiber_std = nhanes['Fiber_intake_g'].std()
fat_std = nhanes['Fat_intake_g'].std()
iron_intake_std = nhanes['Iron_intake_mg'].std()

# For comparison
print(f'Fiber intake std: {fiber_std}')
print(f'Fat intake std: {fat_std}')
print(f'Iron intake std: {iron_intake_std}')

Fiber intake std: 10.132655979964545
Fat intake std: 45.50421029810836
Iron intake std: 8.544092329884053


In [4]:
import numpy as np
import pandas as pd

fiber_mean = nhanes['Fiber_intake_g'].mean()
fat_mean = nhanes['Fat_intake_g'].mean()
iron_mean = nhanes['Iron_intake_mg'].mean()

# Version 2: Links from demigraphic info -> diet

# ----------------------------------------------------------------------
# Step 1: Simulate dietary intake
# ----------------------------------------------------------------------
def simulate_dietary_intake(sample):
    age = sample["Age"]
    sex = sample["Sex"]  # 0 = female, 1 = male

    # independent fiber & fat (sampled from realistic distributions)
    fiber_intake = np.random.normal(fiber_mean, fiber_std)
    fat_intake   = np.random.normal(fat_mean, fat_std)

    # iron intake still depends on demographics
    iron_intake = max(0, (
        iron_mean
        - 0.03 * age
        + (2 if sex == 1 else -1)   # males a little higher
        + np.random.normal(0, iron_intake_std * 0.2)  # moderate variation
    ))

    return pd.Series({
        "Fiber_intake_g": round(fiber_intake, 2),
        "Fat_intake_g":   round(fat_intake, 2),
        "Iron_intake_mg": round(iron_intake, 2)
    })



cleaned_ag_dataset = pd.read_csv('data/combined_ag_data.csv')

# Apply simulation to each microbiome sample
simulated_diet = cleaned_ag_dataset.apply(simulate_dietary_intake, axis=1)
print(simulated_diet.describe())

# Combine simulated diet with microbiome data
combined_data = pd.concat([cleaned_ag_dataset, simulated_diet], axis=1)
combined_data

       Fiber_intake_g  Fat_intake_g  Iron_intake_mg
count     2512.000000   2512.000000     2512.000000
mean        15.129323     74.740107       13.056290
std         10.349779     45.478516        2.321495
min        -19.070000    -63.270000        6.170000
25%          7.782500     43.615000       11.400000
50%         15.335000     75.905000       12.965000
75%         22.112500    105.550000       14.810000
max         48.080000    220.330000       20.440000


Unnamed: 0,SampleID,Actinobacteria,Bacteroides,Bacteroidetes,Bifidobacterium,Clostridium,Cyanobacteria,Firmicutes,Fusobacteria,Lactobacillus,Other,Prevotella,Proteobacteria,Tenericutes,Verrucomicrobia,Age,Sex,Fiber_intake_g,Fat_intake_g,Iron_intake_mg
0,000007117.1075649,0.3244,0.0013,0.0315,0.0000,0.0013,0.0352,0.2629,0.0082,0.0027,0.0461,0.0028,0.2821,0.0010,0.0005,59,1,15.77,106.89,14.93
1,000007115.1075661,0.0347,0.0087,0.1735,0.0018,0.0149,0.0289,0.1783,0.0032,0.0166,0.0153,0.0025,0.5196,0.0000,0.0020,59,1,17.89,18.97,13.87
2,000007123.1075697,0.0493,0.0000,0.0153,0.0000,0.0002,0.1265,0.0451,0.0025,0.1222,0.0024,0.0007,0.6347,0.0000,0.0011,59,1,18.28,67.33,12.27
3,000009713.1130401,0.4052,0.0010,0.0091,0.0000,0.0002,0.0265,0.4219,0.0026,0.0015,0.0055,0.0062,0.1191,0.0000,0.0012,70,0,3.62,110.53,14.22
4,000005598.1130569,0.2394,0.0072,0.0401,0.0001,0.0010,0.0255,0.2582,0.0013,0.0170,0.0209,0.0024,0.3828,0.0003,0.0038,72,1,19.49,24.62,15.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2507,000003719.1257129,0.0018,0.0096,0.0028,0.3933,0.0000,0.0001,0.1794,0.0000,0.0016,0.0000,0.0004,0.4108,0.0001,0.0001,57,0,-2.80,91.88,11.36
2508,000015353.fixed1024,0.0830,0.0006,0.3497,0.0000,0.0001,0.0000,0.0971,0.0029,0.0000,0.0019,0.0279,0.4360,0.0008,0.0000,35,1,15.52,109.90,15.17
2509,000011980.1210764,0.0050,0.0008,0.0003,0.0000,0.0000,0.0001,0.0023,0.0000,0.9781,0.0000,0.0003,0.0117,0.0014,0.0000,52,0,21.68,98.73,11.66
2510,000005567.1131812,0.0430,0.0006,0.0000,0.0000,0.0000,0.0000,0.0187,0.0003,0.0000,0.0001,0.0012,0.9361,0.0000,0.0000,51,1,18.26,60.07,15.20


In [5]:
# Step 2: Serum iron depends ONLY on iron intake
# ----------------------------------------------------------------------
# Target physiological range 5–550 µg/dL
base_mean = 120
base_std = 35

base = np.random.normal(base_mean, base_std, len(combined_data))

# Only iron intake as driver + small noise
simulated_serum_iron = (
    base
    + 4.5 * combined_data["Iron_intake_mg"]  # strong linear relationship
    + np.random.normal(0, 20, len(combined_data))  # biological variability
)

# Clip to realistic bounds
simulated_serum_iron = np.clip(simulated_serum_iron, 5, 550)

# Add to dataset
combined_data["Serum_iron_ug"] = np.round(simulated_serum_iron, 2)

# Keep or drop columns as needed
final_data = combined_data


In [6]:

print("\nSerum Iron Statistics:")
print(final_data["Serum_iron_ug"].describe())

print("\nFinal Dataset:")
print(final_data.head())


Serum Iron Statistics:
count    2512.000000
mean      179.151959
std        41.121599
min        30.840000
25%       152.487500
50%       179.430000
75%       207.095000
max       318.520000
Name: Serum_iron_ug, dtype: float64

Final Dataset:
            SampleID  Actinobacteria  Bacteroides  Bacteroidetes  \
0  000007117.1075649          0.3244       0.0013         0.0315   
1  000007115.1075661          0.0347       0.0087         0.1735   
2  000007123.1075697          0.0493       0.0000         0.0153   
3  000009713.1130401          0.4052       0.0010         0.0091   
4  000005598.1130569          0.2394       0.0072         0.0401   

   Bifidobacterium  Clostridium  Cyanobacteria  Firmicutes  Fusobacteria  \
0           0.0000       0.0013         0.0352      0.2629        0.0082   
1           0.0018       0.0149         0.0289      0.1783        0.0032   
2           0.0000       0.0002         0.1265      0.0451        0.0025   
3           0.0000       0.0002         0.0

In [7]:
# Save the data to a CSV
final_data.to_csv("data/iron_cleaned_data.csv", index=False)