# Gadir Dataset Preprocessing

Generates age-grouped metadata CSVs from the raw Gadir metadata file:
- **gadir_metadata.csv** (input): Raw SRA metadata with Run IDs, ages, and groups

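Output: one CSV file per age group, plus a combined `gadir_all_months.csv`, each with columns `[sid, label]`
- `sid`: SRA Run ID (the `Run` column of the raw metadata)
- `label`: 0=Control (healthy), 1=FoodAllergy
- Age groups: 0-6, 6-12, 12-18, 18-24, 24-30, 30+ months (lower bound inclusive, upper bound exclusive)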

> **Note:** Excludes "Unclear" and "ControlHiRisk" samples. Each sample is unique (no deduplication needed).

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's global random generator.
np.random.seed(42)

# Locations used by the notebook: the raw metadata CSV is read from the
# current directory, and the generated CSVs are written to OUTPUT_DIR.
SCRIPT_DIR = Path('.')
RAW_METADATA = SCRIPT_DIR.joinpath('gadir_metadata.csv')
OUTPUT_DIR = Path('../../../huggingface_datasets/Gadir/metadata')

## 1. Load Raw Metadata

In [None]:
# Read the raw metadata with every column loaded as a string (dtype=str),
# then report the row count and the available column names.
df = pd.read_csv(RAW_METADATA, dtype=str)
print(
    f"Loaded {len(df)} total samples",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
# Last expression of the cell: displays the first rows in the notebook.
df.head()

## 2. Filter and Create Labels

In [None]:
# Filter: Keep only human samples. Each check is applied only when the
# corresponding column exists in the metadata.
if 'HOST' in df.columns:
    df = df[df['HOST'].str.contains('Homo sapiens', case=False, na=False)]
if 'Organism' in df.columns:
    df = df[df['Organism'].str.contains('human', case=False, na=False)]

print(f"After human filter: {len(df)} samples")

# Filter: Exclude Unclear and ControlHiRisk (case-insensitive match)
print("\nGroup distribution before filtering:")
print(df['Group'].value_counts())

# Take an explicit copy so that adding the 'label' column below writes to an
# independent frame instead of a slice of the original one.
df = df[~df['Group'].str.lower().isin(['unclear', 'controlhirisk'])].copy()
print(f"\nAfter excluding Unclear/ControlHiRisk: {len(df)} samples")

# Create binary labels. The lookup is case-insensitive so that it agrees with
# the case-insensitive exclusion filter above.
label_mapping = {'FoodAllergy': 1, 'Control': 0}
label_lookup = {name.lower(): value for name, value in label_mapping.items()}
df['label'] = df['Group'].str.lower().map(label_lookup)

# Rows whose Group is missing or not in label_mapping cannot be labelled; an
# integer cast of their NaN label would fail, so report and drop them here.
unlabelled = df['label'].isna()
if unlabelled.any():
    dropped_groups = df.loc[unlabelled, 'Group'].value_counts(dropna=False).to_dict()
    print(f"\nDropping {int(unlabelled.sum())} samples with unrecognised Group: {dropped_groups}")
    df = df[~unlabelled].copy()
df['label'] = df['label'].astype(int)

print(f"\nLabel distribution: {df['label'].value_counts().to_dict()}")

## 3. Create Age Groups

In [None]:
def assign_age_group(age):
    """Map an age in months onto the name of its age group.

    Missing values (as judged by ``pd.isna``) yield ``None``. Any other
    value is converted with ``float`` and placed in the first bucket whose
    upper bound it is strictly below; everything else falls into
    ``'30+_months'``.
    """
    if pd.isna(age):
        return None

    months = float(age)
    # (exclusive upper bound in months, group name), in ascending order.
    buckets = (
        (6, '0-6_months'),
        (12, '6-12_months'),
        (18, '12-18_months'),
        (24, '18-24_months'),
        (30, '24-30_months'),
    )
    for upper_bound, group_name in buckets:
        if months < upper_bound:
            return group_name
    return '30+_months'

# Bucket each sample by its collection age, then fix the dtypes of the two
# columns that are exported later: Run as string, label as integer.
df['age_group'] = df['Age_at_Collection'].apply(assign_age_group)
df = df.astype({'Run': str, 'label': int})

print("Age group distribution:")
print(df['age_group'].value_counts())

## 4. Verify Sample Uniqueness

In [None]:
# Verify each BioSample is unique (no duplicates). The counts are compared
# explicitly so the "unique" message is only printed when it actually holds.
n_unique_biosamples = df['BioSample'].nunique()
n_samples = len(df)
print(f"Unique BioSamples: {n_unique_biosamples}")
print(f"Total samples: {n_samples}")
if n_unique_biosamples == n_samples:
    print("\nEach sample is from a unique collection event (no deduplication needed)")
else:
    # nunique() ignores missing values, so a mismatch means either repeated
    # or missing BioSample IDs.
    print(
        f"\nWARNING: {n_samples - n_unique_biosamples} samples have a repeated or "
        f"missing BioSample ID; review before assuming no deduplication is needed"
    )

## 5. Generate Age Group Files

In [None]:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def _age_group_start(group_name):
    """Return the lower bound in months at the front of an age-group name.

    For example '6-12_months' gives 6 and '30+_months' gives 30.
    """
    return int(group_name.split('_')[0].rstrip('+').split('-')[0])


# Order groups by their numeric lower bound. A plain sorted() is
# lexicographic and would place '6-12_months' after '30+_months'.
age_groups = sorted(df['age_group'].dropna().unique(), key=_age_group_start)
summary = []

for age_group in age_groups:
    # One file per age group, holding only the sample ID and its label.
    group_df = df[df['age_group'] == age_group][['Run', 'label']].copy()
    group_df = group_df.rename(columns={'Run': 'sid'})  # Use 'sid' for consistency
    group_df = group_df.sort_values('sid').reset_index(drop=True)

    output_file = OUTPUT_DIR / f'gadir_preprocessed_{age_group}.csv'
    group_df.to_csv(output_file, index=False)

    # Per-group counts are collected for the summary table printed later.
    label_dist = group_df['label'].value_counts().to_dict()
    summary.append({
        'age_group': age_group,
        'samples': len(group_df),
        'control': label_dist.get(0, 0),
        'food_allergy': label_dist.get(1, 0)
    })
    print(f"{age_group}: {len(group_df)} samples | Labels: {label_dist}")

# Also create combined file (all samples that have an age group)
all_df = df[df['age_group'].notna()][['Run', 'label']].copy()
all_df = all_df.rename(columns={'Run': 'sid'})
all_df.to_csv(OUTPUT_DIR / 'gadir_all_months.csv', index=False)
print(f"\nSaved gadir_all_months.csv with {len(all_df)} samples")

## 6. Summary

In [None]:
# Build the per-age-group summary table. Columns are declared explicitly so
# the totals below still work (as 0) when no age group produced any rows.
summary_df = pd.DataFrame(
    summary,
    columns=['age_group', 'samples', 'control', 'food_allergy'],
)
print("\n" + "="*60)
print("PREPROCESSING COMPLETE")
print("="*60)
print("\nAge Group Summary:")
print(summary_df.to_string(index=False))
print(f"\nTotal samples: {summary_df['samples'].sum()}")
print(f"Total control: {summary_df['control'].sum()}")
print(f"Total food allergy: {summary_df['food_allergy'].sum()}")