In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")

# Data directory
data_dir = Path('../data')
print(f"Data directory: {data_dir.absolute()}")
print(f"Available files: {list(data_dir.glob('*.csv'))}")

Data directory: /Users/kushaldsouza/Documents/Projects/foodprediction/analysis/../data
Available files: [PosixPath('../data/Untargeted_biomarkers_level5.csv'), PosixPath('../data/Metadata_500food.csv'), PosixPath('../data/featuretable_reformated - Kushal.csv')]


In [3]:
# Load metadata
metadata_file = data_dir / 'Metadata_500food.csv'
metadata_df = pd.read_csv(metadata_file)

print(f"Metadata shape: {metadata_df.shape}")
print("\nFirst 5 rows:")
print(metadata_df.head())

print("\nColumns:")
print(metadata_df.columns.tolist())

print("\nData types:")
print(metadata_df.dtypes)

Metadata shape: (500, 159)

First 5 rows:
            filename   sample_name      Platemap      description  \
0  P3_E8_G72464.mzML  11442.G72464  P3_E8_G72464    Pacifico beer   
1  P5_G5_G72471.mzML  11442.G72471  P5_G5_G72471  raw english pea   
2  P3_C12_72475.mzML  11442.G72475  P3_C12_72475           garlic   
3  P3_B9_G72492.mzML  11442.G72492  P3_B9_G72492        raspberry   
4  P3_B8_G72493.mzML  11442.G72493  P3_B8_G72493   brussel sprout   

  simple_complex sample_type sample_type_aquatic_land sample_type_group1  \
0         simple    beverage                     land              plant   
1         simple        food                     land              plant   
2         simple        food                     land              plant   
3         simple        food                     land              plant   
4         simple        food                     land              plant   

  sample_type_group2 sample_type_group3  ...             upc vegan  \
0              f

In [4]:
# Check for ndb_number (key for USDA linking)
print("ndb_number info:")
print(f"Non-null count: {metadata_df['ndb_number'].notna().sum()}")
print(f"Unique values: {metadata_df['ndb_number'].nunique()}")
print(f"Sample values: {metadata_df['ndb_number'].dropna().head(10).tolist()}")

# Check food ontology columns
ontology_cols = [col for col in metadata_df.columns if 'group' in col.lower() or 'family' in col.lower()]
print(f"\nOntology columns: {ontology_cols}")

for col in ontology_cols:
    print(f"\n{col} unique values: {metadata_df[col].nunique()}")
    print(f"Sample values: {metadata_df[col].dropna().unique()[:5].tolist()}")

ndb_number info:
Non-null count: 500
Unique values: 159
Sample values: ['14003', '45182628', '11215', '45096876', '11098', '9181', '11979', '9020', '45120594', '45057949']

Ontology columns: ['sample_type_group1', 'sample_type_group2', 'sample_type_group3', 'sample_type_group4', 'sample_type_group5', 'sample_type_group6', 'sample_type_groupB1', 'sample_type_groupB2', 'sample_type_groupB3', 'botanical_family', 'Group_num']

sample_type_group1 unique values: 6
Sample values: ['plant', 'algae', 'animal', 'water', 'mineral']

sample_type_group2 unique values: 11
Sample values: ['fruit', 'vegetable/herb', 'algae', 'animal', 'water']

sample_type_group3 unique values: 20
Sample values: ['grain/grass', 'legume', 'vegetable/herb', 'fleshy fruit', 'seaweed']

sample_type_group4 unique values: 39
Sample values: ['grain/grass', 'legume', 'vegetable', 'drupe_aggregate', 'flower']

sample_type_group5 unique values: 141
Sample values: ['grain_fermented', 'pea', 'garlic', 'raspberry', 'brussel sprout

In [5]:
# Load biomarkers data
biomarkers_file = data_dir / 'Untargeted_biomarkers_level5.csv'
biomarkers_df = pd.read_csv(biomarkers_file)

print(f"Biomarkers shape: {biomarkers_df.shape}")
print("\nFirst 5 rows:")
print(biomarkers_df.head())

print("\nColumns:")
print(biomarkers_df.columns.tolist())

print("\nData types:")
print(biomarkers_df.dtypes)

Biomarkers shape: (6127, 2)

First 5 rows:
   feature                                           category
0    10002                               beef, cheddar cheese
1   100035  betel nut leaf, onion, walnut, pine nut, grape...
2   100044                           beef, langostino, potato
3   100080                              salmon, chicken, kale
4   100120                       milk_cow, salmon, yogurt_cow

Columns:
['feature', 'category']

Data types:
feature      int64
category    object
dtype: object


In [6]:
# Analyze feature and category relationships
print(f"Unique features: {biomarkers_df['feature'].nunique()}")
print(f"Unique categories: {biomarkers_df['category'].nunique()}")

# Check category format (comma-separated)
print("\nSample categories:")
print(biomarkers_df['category'].head(10).tolist())

# Count foods per feature
foods_per_feature = biomarkers_df['category'].str.count(',').fillna(0) + 1
print(f"\nFoods per feature statistics:")
print(foods_per_feature.describe())

# Find most promiscuous features (connected to many foods)
most_promiscuous = biomarkers_df.loc[foods_per_feature.idxmax()]
print(f"\nMost promiscuous feature: {most_promiscuous['feature']}")
print(f"Connected to foods: {most_promiscuous['category']}")

Unique features: 6127
Unique categories: 2904

Sample categories:
['beef, cheddar cheese', 'betel nut leaf, onion, walnut, pine nut, grape_fermented, olive, bay leaf, spice, legume, avocado, mint', 'beef, langostino, potato', 'salmon, chicken, kale', 'milk_cow, salmon, yogurt_cow', 'carrot, pine nut, anise, oregano, lavender, mint, dill flower, parsley, lemon balm', 'carrot, turmeric, cardamom, ginger, oregano', 'cheddar cheese', 'cheddar cheese', 'walnut, cauliflower, bean, peanut, cardamom, legume, Spenach']

Foods per feature statistics:
count    6127.000000
mean        3.335401
std         2.986687
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max        15.000000
Name: category, dtype: float64

Most promiscuous feature: 115870
Connected to foods: pine nut, turmeric, grape_fermented, lentil, anise, olive, cardamom, bay leaf, spice, legume, butter, Spenach, ginger, Tilapia, pig


In [7]:
# Load feature intensity matrix
intensity_file = data_dir / 'featuretable_reformated - Kushal.csv'
intensity_df = pd.read_csv(intensity_file)

print(f"Intensity matrix shape: {intensity_df.shape}")
print("\nFirst 5 rows and 5 columns:")
print(intensity_df.iloc[:5, :5])

print("\nColumn names (first 10):")
print(intensity_df.columns[:10].tolist())

print("\nData types:")
print(intensity_df.dtypes.value_counts())

Intensity matrix shape: (54546, 526)

First 5 rows and 5 columns:
   Feature  P4_C10_G75213.mzML Peak area  P4_F3_G75184.mzML Peak area  \
0      271                     1151.5609                     8250.291   
1     4396                        0.0000                        0.000   
2    18754                     5994.3506                     4457.402   
3    15057                    43141.8300                    24845.160   
4   125441                    19674.2270                    20035.475   

   P3_F1_G74067.mzML Peak area  P4_D9_G75202.mzML Peak area  
0                    4166.9760                    615.75934  
1                     346.3053                      0.00000  
2                    1163.4347                   1732.78860  
3                    9865.4770                   1633.24930  
4                   17228.3900                  23373.15200  

Column names (first 10):
['Feature', 'P4_C10_G75213.mzML Peak area', 'P4_F3_G75184.mzML Peak area', 'P3_F1_G74067.mzML Pea

In [8]:
# Analyze intensity values
print("Intensity value statistics:")
print(intensity_df.describe())

# Check for missing values
print(f"\nMissing values: {intensity_df.isnull().sum().sum()}")

# Check if column names match metadata filenames
metadata_filenames = set(metadata_df['filename'].dropna())
intensity_columns = set(intensity_df.columns)

print(f"\nMetadata filenames: {len(metadata_filenames)}")
print(f"Intensity columns: {len(intensity_columns)}")
print(f"Overlap: {len(metadata_filenames.intersection(intensity_columns))}")

# Show some overlapping samples
overlap = metadata_filenames.intersection(intensity_columns)
print(f"\nSample overlapping files: {list(overlap)[:5]}")

Intensity value statistics:
             Feature  P4_C10_G75213.mzML Peak area  \
count   54546.000000                  5.454600e+04   
mean   115950.145914                  1.295999e+04   
std     70444.652194                  4.824996e+05   
min         4.000000                  0.000000e+00   
25%     54508.250000                  0.000000e+00   
50%    113976.000000                  0.000000e+00   
75%    177982.500000                  0.000000e+00   
max    248535.000000                  8.274985e+07   

       P4_F3_G75184.mzML Peak area  P3_F1_G74067.mzML Peak area  \
count                 5.454600e+04                 5.454600e+04   
mean                  5.300827e+03                 6.602530e+03   
std                   4.206903e+05                 2.353944e+05   
min                   0.000000e+00                 0.000000e+00   
25%                   0.000000e+00                 0.000000e+00   
50%                   0.000000e+00                 0.000000e+00   
75%             

In [9]:
# Analyze how the datasets connect
print("=== DATASET RELATIONSHIPS ===\n")

# 1. Check if biomarkers features exist in intensity matrix
biomarker_features = set(biomarkers_df['feature'].astype(str))
intensity_features = set(intensity_df.index.astype(str)) if intensity_df.index.name else set()

print(f"Biomarker features: {len(biomarker_features)}")
print(f"Intensity matrix features: {len(intensity_features)}")
print(f"Overlap: {len(biomarker_features.intersection(intensity_features))}")

# 2. Check if metadata foods appear in biomarker categories
metadata_foods = set(metadata_df['description'].dropna().str.lower())
biomarker_foods = set()
for categories in biomarkers_df['category'].dropna():
    foods = [food.strip().lower() for food in categories.split(',')]
    biomarker_foods.update(foods)

print(f"\nMetadata foods: {len(metadata_foods)}")
print(f"Biomarker foods: {len(biomarker_foods)}")
print(f"Overlap: {len(metadata_foods.intersection(biomarker_foods))}")

# Show some overlapping foods
overlap_foods = metadata_foods.intersection(biomarker_foods)
print(f"\nSample overlapping foods: {list(overlap_foods)[:10]}")

=== DATASET RELATIONSHIPS ===

Biomarker features: 6127
Intensity matrix features: 0
Overlap: 0

Metadata foods: 408
Biomarker foods: 120
Overlap: 47

Sample overlapping foods: ['langostino', 'blueberry', 'raspberry', 'sesame seed', 'cheddar cheese', 'watercress', 'basil', 'brussel sprout', 'nasturtium', 'cardamom']


In [10]:
print("=== SUMMARY ===\n")

print(f"1. Metadata: {metadata_df.shape[0]} food samples with {metadata_df.shape[1]} attributes")
print(f"   - Key linking field: ndb_number (for USDA data)")
print(f"   - Food ontology: {len(ontology_cols)} hierarchical categories")

print(f"\n2. Biomarkers: {biomarkers_df.shape[0]} molecule-food relationships")
print(f"   - {biomarkers_df['feature'].nunique()} unique molecular features")
print(f"   - {biomarkers_df['category'].nunique()} unique food categories")

print(f"\n3. Intensity Matrix: {intensity_df.shape[0]} features × {intensity_df.shape[1]} samples")
print(f"   - Provides quantitative abundance data for edge weighting")

print("\n=== NEXT STEPS ===")
print("1. Download USDA FoodData Central data")
print("2. Link metadata ndb_numbers to USDA nutritional data")
print("3. Generate Spec2Vec embeddings for molecular features")
print("4. Construct heterogeneous graph with Molecule and Food nodes")
print("5. Implement HAN or GIN architecture for training")

=== SUMMARY ===

1. Metadata: 500 food samples with 159 attributes
   - Key linking field: ndb_number (for USDA data)
   - Food ontology: 11 hierarchical categories

2. Biomarkers: 6127 molecule-food relationships
   - 6127 unique molecular features
   - 2904 unique food categories

3. Intensity Matrix: 54546 features × 526 samples
   - Provides quantitative abundance data for edge weighting

=== NEXT STEPS ===
1. Download USDA FoodData Central data
2. Link metadata ndb_numbers to USDA nutritional data
3. Generate Spec2Vec embeddings for molecular features
4. Construct heterogeneous graph with Molecule and Food nodes
5. Implement HAN or GIN architecture for training
