# Dataset Preparation: CodeRM-UnitTest Loading and Splitting

This notebook handles Step 2 of our master plan:
- Load CodeRM-UnitTest dataset from Hugging Face
- Explore dataset structure and understand the format
- Sample 20k records from the full 77.2k dataset
- Create 80/10/10 split (16k train / 2k val / 2k test)
- Save preprocessed splits locally

## 1. Import Required Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import logging
from datetime import datetime
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import json
import pickle
from pathlib import Path

# Set up logging to both a file and the notebook output.
# logging.FileHandler opens its target file as soon as it is constructed, so
# the log directory has to exist first; on a fresh checkout '../logs' may be
# missing and the handler would raise FileNotFoundError.
os.makedirs('../logs', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../logs/dataset_preparation.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by every later cell in this notebook.
logger = logging.getLogger(__name__)

print("Libraries imported successfully!")
logger.info("Starting dataset preparation process")

## 2. Set Up Environment and Check GPU

In [None]:
import torch

# Pick the compute device and report what hardware is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

if torch.cuda.is_available():
    bytes_per_gib = 1024**3
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gib
    # mem_get_info returns (free, total) bytes for the device. The previous
    # code printed torch.cuda.memory_reserved, which is the memory currently
    # held by this process's caching allocator, not the free memory.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)

    print(f"GPU: {gpu_name}")
    print(f"Total GPU Memory: {gpu_memory:.2f} GB")
    print(f"Available GPU Memory: {free_bytes / bytes_per_gib:.2f} GB")

    logger.info(f"GPU detected: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
    print("No GPU available, using CPU")
    logger.warning("No GPU available, training will use CPU")

# Create the output directories used by this notebook (no-op when they exist).
os.makedirs('../data', exist_ok=True)
os.makedirs('../logs', exist_ok=True)
print("Directory structure verified")

## 3. Load CodeRM-UnitTest Dataset

In [None]:
# Fetch CodeRM-UnitTest from the Hugging Face Hub
logger.info("Loading CodeRM-UnitTest dataset from Hugging Face")

try:
    dataset = load_dataset("KAKA22/CodeRM-UnitTest")
    print("Dataset loaded successfully!")
    print(f"Dataset structure: {dataset}")

    # The 'train' split is treated as the main split (an assumption)
    train_data = dataset['train']
    loaded_count = len(train_data)
    print(f"Total samples in dataset: {loaded_count}")

    logger.info(f"Dataset loaded with {loaded_count} samples")

except Exception as exc:
    # Report the failure in the log and on screen, then let it propagate
    failure_text = str(exc)
    logger.error(f"Error loading dataset: {failure_text}")
    print(f"Error: {failure_text}")
    raise

## 4. Explore Dataset Structure

In [None]:
# Show the dataset schema, then the first record with long strings shortened
print("=== Dataset Features ===")
print(train_data.features)

print("\n=== First Sample ===")
first_sample = train_data[0]
for field_name, field_value in first_sample.items():
    # Only string values longer than 200 characters are truncated
    is_long_text = isinstance(field_value, str) and len(field_value) > 200
    shown = f"{field_value[:200]}..." if is_long_text else field_value
    print(f"{field_name}: {shown}")

logger.info("Dataset structure exploration completed")

In [None]:
# Inspect the 'unit_tests' field of the first sample
print("=== Unit Tests Structure ===")
if 'unit_tests' in first_sample:
    unit_tests = first_sample['unit_tests']
    print(f"Number of unit tests for first sample: {len(unit_tests)}")

    if len(unit_tests) > 0:
        print("\n=== First Unit Test ===")
        first_test = unit_tests[0]
        for test_field, test_value in first_test.items():
            # Long string values are cut to their first 200 characters
            needs_cut = isinstance(test_value, str) and len(test_value) > 200
            rendered = f"{test_value[:200]}..." if needs_cut else test_value
            print(f"{test_field}: {rendered}")

# Report the spread of the FAR / FRR values attached to the first sample's tests
print("\n=== Quality Metrics Analysis ===")
if 'unit_tests' in first_sample and len(first_sample['unit_tests']) > 0:
    # A missing 'FAR' / 'FRR' key counts as 0
    far_values, frr_values = (
        [entry.get(metric, 0) for entry in first_sample['unit_tests']]
        for metric in ('FAR', 'FRR')
    )
    far_low, far_high = min(far_values), max(far_values)
    frr_low, frr_high = min(frr_values), max(frr_values)
    print(f"FAR range in first sample: {far_low:.3f} - {far_high:.3f}")
    print(f"FRR range in first sample: {frr_low:.3f} - {frr_high:.3f}")

## 5. Sample 20k Records from Dataset

In [None]:
# Draw a fixed-size random subset of the full dataset
SAMPLE_SIZE = 20000
total_samples = len(train_data)

logger.info(f"Sampling {SAMPLE_SIZE} records from {total_samples} total samples")

if total_samples < SAMPLE_SIZE:
    # Not enough rows to sample from: keep everything
    print(f"Dataset has only {total_samples} samples, using all available data")
    sampled_data = train_data

    logger.warning(f"Dataset smaller than requested sample size, using all {total_samples} samples")
else:
    # Seeded draw without replacement so reruns pick the same rows;
    # indices are sorted for efficient access
    np.random.seed(42)
    sample_indices = sorted(
        np.random.choice(total_samples, SAMPLE_SIZE, replace=False)
    )

    sampled_data = train_data.select(sample_indices)
    sampled_count = len(sampled_data)
    print(f"Successfully sampled {sampled_count} records")

    logger.info(f"Sampled {sampled_count} records using random sampling")

## 6. Create 80/10/10 Data Splits

In [None]:
# Materialise the sampled rows as a plain list of dicts so they can be split
# by position. (No pandas DataFrame is involved; the earlier comment and log
# message said otherwise.)
logger.info("Converting dataset to a list of records for splitting")

# Iterating the dataset yields one record per row; this replaces the manual
# index loop that fetched rows one at a time with sampled_data[i].
data_list = list(sampled_data)
num_records = len(data_list)

print(f"Converted {num_records} samples to list format")

# Split positions rather than the records themselves
indices = list(range(num_records))

# First split: 80% train, 20% temp (which will become 10% val + 10% test)
train_indices, temp_indices = train_test_split(
    indices, test_size=0.2, random_state=42, shuffle=True
)

# Second split: halve the 20% hold-out into 10% val and 10% test
val_indices, test_indices = train_test_split(
    temp_indices, test_size=0.5, random_state=42, shuffle=True
)

print("Data split sizes:")
print(f"Train: {len(train_indices)} samples ({len(train_indices)/num_records*100:.1f}%)")
print(f"Validation: {len(val_indices)} samples ({len(val_indices)/num_records*100:.1f}%)")
print(f"Test: {len(test_indices)} samples ({len(test_indices)/num_records*100:.1f}%)")

logger.info(f"Created splits - Train: {len(train_indices)}, Val: {len(val_indices)}, Test: {len(test_indices)}")

In [None]:
# Turn each list of positions into the list of records it refers to
train_split, val_split, test_split = (
    [data_list[position] for position in subset]
    for subset in (train_indices, val_indices, test_indices)
)

print("Data splits created successfully!")
split_report = (
    f"Train split: {len(train_split)} samples",
    f"Validation split: {len(val_split)} samples",
    f"Test split: {len(test_split)} samples",
)
for report_line in split_report:
    print(report_line)

## 7. Save Preprocessed Splits

In [None]:
# Persist the three splits as pickle files plus a JSON metadata summary
data_dir = Path('../data')
data_dir.mkdir(exist_ok=True)

logger.info("Saving data splits to disk")

# One pickle file per split, written in train / val / test order
split_files = (
    ('train_split.pkl', train_split),
    ('val_split.pkl', val_split),
    ('test_split.pkl', test_split),
)
for split_filename, split_records in split_files:
    with (data_dir / split_filename).open('wb') as f:
        pickle.dump(split_records, f)

print("Data splits saved successfully!")

# Describe how the splits were produced
metadata = dict(
    total_samples=len(data_list),
    train_size=len(train_split),
    val_size=len(val_split),
    test_size=len(test_split),
    sample_size=SAMPLE_SIZE,
    original_dataset_size=total_samples,
    split_ratio='80/10/10',
    random_seed=42,
    created_at=datetime.now().isoformat(),
    dataset_name='KAKA22/CodeRM-UnitTest',
)

metadata_path = data_dir / 'metadata.json'
with metadata_path.open('w') as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to {metadata_path}")
logger.info(f"All data splits and metadata saved to {data_dir}")

## 8. Verify Saved Data

In [None]:
# Read every saved file back and compare split sizes with the in-memory splits
logger.info("Verifying saved data integrity")

try:
    # Reload the pickled splits
    with (data_dir / 'train_split.pkl').open('rb') as f:
        loaded_train = pickle.load(f)

    with (data_dir / 'val_split.pkl').open('rb') as f:
        loaded_val = pickle.load(f)

    with (data_dir / 'test_split.pkl').open('rb') as f:
        loaded_test = pickle.load(f)

    # Reload the JSON metadata
    with (data_dir / 'metadata.json').open('r') as f:
        loaded_metadata = json.load(f)

    print("=== Verification Results ===")
    print(f"Train split loaded: {len(loaded_train)} samples")
    print(f"Val split loaded: {len(loaded_val)} samples")
    print(f"Test split loaded: {len(loaded_test)} samples")
    print("\nMetadata:")
    for meta_key, meta_value in loaded_metadata.items():
        print(f"  {meta_key}: {meta_value}")

    # Size-only integrity check; record contents are not compared
    size_checks = (
        (loaded_train, train_split, "Train split size mismatch"),
        (loaded_val, val_split, "Val split size mismatch"),
        (loaded_test, test_split, "Test split size mismatch"),
    )
    for reloaded, expected, mismatch_message in size_checks:
        assert len(reloaded) == len(expected), mismatch_message

    print("\n✅ Data integrity verification passed!")
    logger.info("Data integrity verification completed successfully")

except Exception as exc:
    # Surface the failure on screen and in the log, then propagate it
    failure_text = str(exc)
    print(f"❌ Verification failed: {failure_text}")
    logger.error(f"Data verification failed: {failure_text}")
    raise

## 9. Summary and Next Steps

In [None]:
# Recap what this notebook produced
print("=== Dataset Preparation Summary ===")
summary_lines = (
    f"✅ Loaded CodeRM-UnitTest dataset ({total_samples} total samples)",
    f"✅ Sampled {len(data_list)} records for training",
    "✅ Created 80/10/10 splits:",
    f"   - Train: {len(train_split)} samples",
    f"   - Validation: {len(val_split)} samples",
    f"   - Test: {len(test_split)} samples",
    f"✅ Saved all splits to {data_dir}",
    "✅ Data integrity verified",
)
for summary_line in summary_lines:
    print(summary_line)

# List everything in the data directory with its size in megabytes
print("\n=== Files Created ===")
for file_path in data_dir.glob('*'):
    size_bytes = file_path.stat().st_size
    file_size = size_bytes / 1024 / 1024  # MB
    print(f"  {file_path.name}: {file_size:.2f} MB")

# Roadmap for the remaining steps
print("\n=== Next Steps ===")
next_steps = (
    "1. ✅ Step 2 Complete: Dataset loading and splitting",
    "2. 🔄 Step 3: Data preprocessing and tokenization",
    "3. ⏳ Step 4: Model loading and configuration",
    "4. ⏳ Step 5: QLoRA/PEFT setup",
)
for step_line in next_steps:
    print(step_line)

logger.info("Dataset preparation completed successfully!")
logger.info("Ready to proceed with Step 3: Data preprocessing")