# Lab 2: Experimental Design & Metadata

## Objectives
- Build a metadata table you can actually analyze later
- Learn which fields prevent downstream confusion (batch, donor, chemistry, lane)
- Validate metadata consistency (unique IDs, controlled vocabulary)

## Outputs (save to `data/` and `results/`)
- `data/metadata.csv` (your edited version of the template)
- `results/metadata_validation.md` (short validation notes)

---

## Checklist
- Create unique sample IDs
- Define batch variables explicitly
- Record chemistry/protocol and reference versions
- Ensure each sample has condition + replicate unit defined


In [None]:
import pandas as pd
from pathlib import Path

# Location of the provided metadata template (relative path).
template_path = Path('../templates/metadata_template.csv')

# Parse the template CSV into a DataFrame.
md = pd.read_csv(filepath_or_buffer=template_path)

# Preview the first five rows as the cell's output.
md.head(n=5)


In [None]:
# Make a working copy in data/ -- but only if one does not exist yet.
# This cell tells you to edit ../data/metadata.csv by hand, so re-running it
# (for example via "Run All") must not overwrite those edits with the template.
# To start over from the template, delete ../data/metadata.csv and re-run.
Path('../data').mkdir(exist_ok=True)
out_path = Path('../data/metadata.csv')
if out_path.exists():
    print(f"Found existing {out_path.resolve()}; leaving it untouched.")
else:
    # index=False keeps the DataFrame's row index out of the CSV.
    md.to_csv(out_path, index=False)
    print(f"Wrote {out_path.resolve()}")

print("Edit ../data/metadata.csv in your editor, then re-run the next cell.")


In [None]:
# Validate metadata
# Re-read the working copy from disk, fail fast on structural problems
# (missing columns, missing or duplicate sample IDs), print a summary of the
# values in use, and save short validation notes to ../results/.
md2 = pd.read_csv('../data/metadata.csv')

# Basic checks
required_cols = ['sample_id', 'condition', 'batch']
missing = [c for c in required_cols if c not in md2.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# sample_id must be present on every row. This is checked separately because
# a single missing ID is not a duplicate and would pass the uniqueness check.
blank_ids = md2['sample_id'].isna()
if blank_ids.any():
    blank_rows = md2.index[blank_ids].tolist()
    raise ValueError(
        f"Missing sample_id in data row(s) {blank_rows} "
        "(0-based, header line not counted)"
    )

# sample_id must be unique
dup_mask = md2['sample_id'].duplicated()
if dup_mask.any():
    # Report each offending ID once, however many times it repeats.
    dupes = md2.loc[dup_mask, 'sample_id'].unique().tolist()
    raise ValueError(f"Duplicate sample_id values found: {dupes}")

# Controlled vocab hints: computed once, reused for the printout and the notes.
# Scan these lists by eye for near-duplicates (case, spacing, spelling).
conditions = sorted(md2['condition'].dropna().unique().tolist())
batches = sorted(md2['batch'].dropna().unique().tolist())
print("Unique conditions:", conditions)
print("Unique batches:", batches)

# Null checks
print("\nMissing values per column:")
print(md2.isna().sum().sort_values(ascending=False).head(15))

# Save validation notes
Path('../results').mkdir(exist_ok=True)
# Explicit UTF-8 so non-ASCII labels do not depend on the platform default encoding.
with open('../results/metadata_validation.md', 'w', encoding='utf-8') as f:
    f.write('# Metadata Validation\n\n')
    f.write(f"Rows: {len(md2)}\n\n")
    f.write('## Unique conditions\n')
    f.write('\n'.join([f"- {x}" for x in conditions]) + '\n\n')
    f.write('## Unique batches\n')
    f.write('\n'.join([f"- {x}" for x in batches]) + '\n')

print("Wrote ../results/metadata_validation.md")
