# Data Preparation for Competition Bundle

This notebook prepares the data for Codabench by:
1. Loading all grain data files
2. Splitting into train and test sets (stratified by variety)
3. Organizing data into the required structure:
   - `input_data/train/`: Training data with labels
   - `input_data/test/`: Test data WITHOUT labels (each file keeps the image `x` plus the `original_filename` and `bands` metadata, but no `y`)
   - `reference_data/`: Test labels (ground truth for scoring)

In [1]:
import os
import re
import shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

# --- Configuration: everything a reader might want to tune ---
DATA_DIR = "../Data/Grain-Data-RGB"  # source folder (RGB dataset)
OUTPUT_DIR = "."                     # root of the Competition_Bundle directory
TEST_SIZE = 0.2                      # fraction of samples held out for the test set
RANDOM_STATE = 42                    # seed passed to the split

# --- Output layout expected by the competition bundle ---
TRAIN_DIR = os.path.join(OUTPUT_DIR, "input_data", "train")
TEST_DIR = os.path.join(OUTPUT_DIR, "input_data", "test")
REFERENCE_DIR = os.path.join(OUTPUT_DIR, "reference_data")

# Make sure every output folder exists (existing folders are left untouched)
for _out_dir in (TRAIN_DIR, TEST_DIR, REFERENCE_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print("✓ Directories created")
for _title, _out_dir in (("Train", TRAIN_DIR), ("Test", TEST_DIR), ("Reference", REFERENCE_DIR)):
    print(f"  {_title}: {_out_dir}")

✓ Directories created
  Train: ./input_data/train
  Test: ./input_data/test
  Reference: ./reference_data


In [2]:
# Load and clean file list
def load_and_clean_files(files_dir):
    """Return the names of the ``.npz`` files in ``files_dir``, sorted alphabetically.

    Entries that do not end in ``.npz`` (system files such as ``.DS_Store``
    included) are dropped.

    The result is sorted because ``os.listdir`` returns entries in arbitrary,
    filesystem-dependent order. Anything built from this list with a fixed
    random seed (e.g. a train/test split) is only reproducible across machines
    if the input order is itself deterministic.

    Args:
        files_dir: Path of the directory to scan.

    Returns:
        A sorted list of file names (not full paths).
    """
    return sorted(name for name in os.listdir(files_dir) if name.endswith('.npz'))

def extract_metadata(filename):
    """Parse grain, variety, microplot and acquisition-time fields from a file name.

    Returns a dict of string fields (plus the file name itself), or ``None``
    as soon as one of the four expected patterns cannot be found.
    """
    patterns = (
        r"grain(?P<grainID>\d+)",
        r"var(?P<varietyNumber>\d+)",
        r"x(?P<x>\d+)y(?P<y>\d+)",
        r"2x_(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<timestamp>\d+)_corr",
    )

    fields = {}
    for pattern in patterns:
        found = re.search(pattern, filename)
        if found is None:
            # Every pattern is mandatory; a single miss rejects the file name.
            return None
        fields.update(found.groupdict())

    metadata = {
        "grainID": fields["grainID"],
        "varietyNumber": fields["varietyNumber"],
        "microplotID": f"x{fields['x']}y{fields['y']}",
    }
    for key in ("year", "month", "day", "timestamp"):
        metadata[key] = fields[key]
    metadata["filename"] = filename
    return metadata

# Discover the candidate data files
print("Loading files...")
all_files = load_and_clean_files(DATA_DIR)
print(f"✓ Found {len(all_files)} files")

# Parse every file name; names that do not follow the expected scheme are skipped
print("Extracting metadata...")
all_metadata = []
for file in tqdm(all_files):
    parsed = extract_metadata(file)
    if parsed is not None:
        all_metadata.append(parsed)
print(f"✓ Extracted metadata for {len(all_metadata)} files")

# One row per file, one column per parsed field
df = pd.DataFrame(all_metadata)
variety_counts = df['varietyNumber'].value_counts().sort_index()

print("\nDataset statistics:")
print(f"  Total files: {len(df)}")
print(f"  Unique varieties: {df['varietyNumber'].nunique()}")
print(f"  Unique microplots: {df['microplotID'].nunique()}")
print("\nVariety distribution:")
print(variety_counts)

Loading files...
✓ Found 26882 files
Extracting metadata...


100%|██████████| 26882/26882 [00:00<00:00, 275693.70it/s]

✓ Extracted metadata for 26882 files

Dataset statistics:
  Total files: 26882
  Unique varieties: 8
  Unique microplots: 32

Variety distribution:
varietyNumber
1    3462
2    3933
3    3434
4    3205
5    3031
6    3201
7    3115
8    3501
Name: count, dtype: int64





In [3]:
# Stratified train/test split: the variety label drives the stratification
print("Splitting data into train and test sets...")
print(f"  Test size: {TEST_SIZE * 100}%")
print(f"  Random state: {RANDOM_STATE}")

# Variety number of every row, used as the stratification target
varieties = df['varietyNumber'].values

split_options = dict(
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=varieties,
)
train_df, test_df = train_test_split(df, **split_options)

splits = (("Train", train_df), ("Test", test_df))

print("\n✓ Split completed:")
for split_name, split_df in splits:
    print(f"  {split_name}: {len(split_df)} samples")

# Show the per-variety counts of both subsets so the stratification can be checked by eye
for split_name, split_df in splits:
    print(f"\n{split_name} variety distribution:")
    print(split_df['varietyNumber'].value_counts().sort_index())

Splitting data into train and test sets...
  Test size: 20.0%
  Random state: 42

✓ Split completed:
  Train: 21505 samples
  Test: 5377 samples

Train variety distribution:
varietyNumber
1    2769
2    3146
3    2747
4    2564
5    2425
6    2561
7    2492
8    2801
Name: count, dtype: int64

Test variety distribution:
varietyNumber
1    693
2    787
3    687
4    641
5    606
6    640
7    623
8    700
Name: count, dtype: int64


In [4]:
# Copy training files (with labels)
print("\nCopying training files...")
train_labels = {}

# Only the file name is needed, so iterate the column rather than whole rows.
for filename in tqdm(train_df['filename'], total=len(train_df)):
    source_path = os.path.join(DATA_DIR, filename)
    dest_path = os.path.join(TRAIN_DIR, filename)

    # Training archives are copied unchanged, so they keep their 'y' entry.
    shutil.copy2(source_path, dest_path)

    # Read the label stored inside the archive. The context manager closes the
    # underlying file handle; without it, one open handle per training file
    # lingers until garbage collection.
    with np.load(source_path) as data:
        train_labels[filename] = int(data['y'])

print(f"✓ Copied {len(train_df)} training files")

# Save training labels metadata (optional, for reference).
# Note: this file is written next to the test ground truth in REFERENCE_DIR.
train_labels_file = os.path.join(REFERENCE_DIR, "train_labels.json")
with open(train_labels_file, 'w') as f:
    json.dump(train_labels, f, indent=2)
print(f"✓ Saved training labels metadata to {train_labels_file}")


Copying training files...


100%|██████████| 21505/21505 [00:15<00:00, 1350.50it/s]

✓ Copied 21505 training files
✓ Saved training labels metadata to ./reference_data/train_labels.json





In [5]:
# Copy test files (WITHOUT labels - only images)
print("\nCopying test files (without labels)...")
test_labels = {}
test_filenames = []

# NOTE(review): the test file name (and the stored 'original_filename') is reused
# as-is, and file names appear to carry a "var<N>" token that extract_metadata
# parses as the variety number. If that token equals the label, participants can
# read the answer from the name - consider anonymising test file names. Confirm
# before publishing the bundle.
for filename in tqdm(test_df['filename'], total=len(test_df)):
    source_path = os.path.join(DATA_DIR, filename)
    dest_path = os.path.join(TEST_DIR, filename)

    # Read everything needed inside the context manager so the archive's file
    # handle is closed before moving on to the next file.
    with np.load(source_path) as original_data:
        # Extract label before removing it
        label = int(original_data['y'])

        # Payload of the label-free archive: the image plus optional metadata.
        payload = {'x': original_data['x']}
        if 'original_filename' in original_data.files:
            payload['original_filename'] = original_data['original_filename']
        else:
            payload['original_filename'] = filename
        # Only forward 'bands' when the source has it: saving None would store a
        # pickled object array that np.load refuses to read by default.
        if 'bands' in original_data.files:
            payload['bands'] = original_data['bands']

    test_labels[filename] = label
    test_filenames.append(filename)

    # Create new file with ONLY the label-free payload (no 'y')
    np.savez_compressed(dest_path, **payload)

print(f"✓ Copied {len(test_df)} test files (labels removed)")

# Verify test files don't have labels
with np.load(os.path.join(TEST_DIR, test_filenames[0])) as sample_test:
    sample_test_keys = list(sample_test.keys())
print(f"\n✓ Verification - Test file keys: {sample_test_keys}")
print(f"  (Should NOT contain 'y' key)")
# Fail loudly instead of relying on someone reading the printed key list.
assert 'y' not in sample_test_keys, "Test file still contains the label key 'y'"


Copying test files (without labels)...


100%|██████████| 5377/5377 [00:22<00:00, 233.83it/s]

✓ Copied 5377 test files (labels removed)

✓ Verification - Test file keys: ['x', 'original_filename', 'bands']
  (Should NOT contain 'y' key)





In [6]:
# Persist the test ground truth in three interchangeable formats for the scoring program
print("\nSaving reference data (test labels)...")

# 1) JSON mapping: file name -> label
reference_file_json = os.path.join(REFERENCE_DIR, "test_labels.json")
with open(reference_file_json, 'w') as f:
    json.dump(test_labels, f, indent=2)
print(f"✓ Saved test labels to {reference_file_json}")

# 2) Plain text, one "filename,label" line per file, sorted, with a header line
reference_file_txt = os.path.join(REFERENCE_DIR, "test_labels.txt")
sorted_rows = sorted(test_labels.items())
with open(reference_file_txt, 'w') as f:
    f.write("filename,label\n")
    f.writelines(f"{name},{value}\n" for name, value in sorted_rows)
print(f"✓ Saved test labels to {reference_file_txt}")

# 3) NumPy array of labels only, ordered by sorted test file name
reference_file_npy = os.path.join(REFERENCE_DIR, "test_labels.npy")
ordered_names = sorted(test_filenames)
test_labels_array = np.array([test_labels[name] for name in ordered_names])
np.save(reference_file_npy, test_labels_array)
print(f"✓ Saved test labels array to {reference_file_npy}")

print("\n✓ Reference data saved in multiple formats for flexibility")


Saving reference data (test labels)...
✓ Saved test labels to ./reference_data/test_labels.json
✓ Saved test labels to ./reference_data/test_labels.txt
✓ Saved test labels array to ./reference_data/test_labels.npy

✓ Reference data saved in multiple formats for flexibility


In [7]:
# Create summary statistics
summary = {
    "total_samples": len(df),
    "train_samples": len(train_df),
    "test_samples": len(test_df),
    "test_size_ratio": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "num_varieties": df['varietyNumber'].nunique(),
    # Variety numbers are digit strings; sort numerically so that e.g. "10"
    # does not come before "2" once there are ten or more varieties.
    "varieties": sorted(df['varietyNumber'].unique().tolist(), key=int),
    "train_variety_distribution": train_df['varietyNumber'].value_counts().to_dict(),
    "test_variety_distribution": test_df['varietyNumber'].value_counts().to_dict(),
    "data_source": DATA_DIR
}

summary_file = os.path.join(OUTPUT_DIR, "data_split_summary.json")
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2, default=str)
print(f"✓ Saved data split summary to {summary_file}")

# Count only .npz entries: the message reports ".npz files", and a stray system
# file (e.g. .DS_Store) in the folder would otherwise inflate the number.
n_train_files = sum(name.endswith('.npz') for name in os.listdir(TRAIN_DIR))
n_test_files = sum(name.endswith('.npz') for name in os.listdir(TEST_DIR))

# Print final summary
print("\n" + "="*60)
print("DATA PREPARATION SUMMARY")
print("="*60)
print(f"Total samples: {len(df)}")
print(f"Train samples: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")
print(f"Number of varieties: {df['varietyNumber'].nunique()}")
print(f"\nTrain directory: {TRAIN_DIR}")
print(f"  Files: {n_train_files} .npz files (with labels)")
print(f"\nTest directory: {TEST_DIR}")
print(f"  Files: {n_test_files} .npz files (without labels)")
print(f"\nReference directory: {REFERENCE_DIR}")
print(f"  Files: test_labels.json, test_labels.txt, test_labels.npy")
print("="*60)
print("\n✓ Data preparation completed successfully!")
print("\nNext steps:")
print("  1. Verify the data structure")
print("  2. Test ingestion_program with this data")
print("  3. Test scoring_program with reference data")

✓ Saved data split summary to ./data_split_summary.json

DATA PREPARATION SUMMARY
Total samples: 26882
Train samples: 21505 (80.0%)
Test samples: 5377 (20.0%)
Number of varieties: 8

Train directory: ./input_data/train
  Files: 21505 .npz files (with labels)

Test directory: ./input_data/test
  Files: 5377 .npz files (without labels)

Reference directory: ./reference_data
  Files: test_labels.json, test_labels.txt, test_labels.npy

✓ Data preparation completed successfully!

Next steps:
  1. Verify the data structure
  2. Test ingestion_program with this data
  3. Test scoring_program with reference data


In [None]:
# Verification: Check a few files to ensure everything is correct.
# Each check prints what it found AND asserts it, so the final "passed" message
# is only reached when the checks really hold.
print("Verification checks...\n")

# Check train file (should have 'x' and 'y')
with np.load(os.path.join(TRAIN_DIR, train_df.iloc[0]['filename'])) as train_sample:
    train_keys = list(train_sample.keys())
    # Read the label while the archive is still open.
    train_label = train_sample['y'] if 'y' in train_keys else 'MISSING'
print(f"✓ Train file keys: {train_keys}")
print(f"  Has 'x': {'x' in train_keys}")
print(f"  Has 'y': {'y' in train_keys}")
print(f"  Label: {train_label}")
assert 'x' in train_keys and 'y' in train_keys, "Train file must contain both 'x' and 'y'"

# Check test file (should have 'x' but NOT 'y')
with np.load(os.path.join(TEST_DIR, test_filenames[0])) as test_sample:
    test_keys = list(test_sample.keys())
print(f"\n✓ Test file keys: {test_keys}")
print(f"  Has 'x': {'x' in test_keys}")
print(f"  Has 'y': {'y' in test_keys} (should be False)")
if 'y' in test_keys:
    print("  ⚠️  WARNING: Test file contains 'y'! This should not happen.")
assert 'x' in test_keys, "Test file must contain 'x'"
assert 'y' not in test_keys, "Test file must not contain the label key 'y'"

# Check reference data
with open(reference_file_json, 'r') as f:
    ref_data = json.load(f)
print(f"\n✓ Reference data loaded: {len(ref_data)} labels")
print(f"  Sample label: {list(ref_data.items())[0]}")
# Every test file needs exactly one ground-truth entry.
assert set(ref_data) == set(test_filenames), "Reference labels do not match the test files"

print("\n✓ All verification checks passed!")

Verification checks...

✓ Train file keys: ['x', 'y', 'original_filename', 'bands']
  Has 'x': True
  Has 'y': True
  Label: 5

✓ Test file keys: ['x', 'original_filename', 'bands']
  Has 'x': True
  Has 'y': False (should be False)

✓ Reference data loaded: 5377 labels
  Sample label: ('grain11640_x44y21-var3_11000_us_2x_2020-12-02T104851_corr.npz', 3)

✓ All verification checks passed!
