# CSV Processing Experiment

This notebook demonstrates how to use the CSV processor to add `manipulation_id` and `probe` fields to your prompts CSV file.

## Overview
- **Input**: CSV file with prompts
- **Output**: Enhanced CSV with `manipulation_id` and `probe` fields

In [None]:
from itertools import islice

import pandas as pd
from csv_processor import CSVProcessor

# Relative locations of the raw prompts CSV and of the enriched CSV produced from it
input_file, output_file = (
    "data/prompts.csv",
    "data/processed_prompts.csv",
)

## 1. Explore the Input Data

In [None]:
# Read the input CSV as raw text: dtype=object skips type inference and
# na_filter=False keeps blank cells as empty strings instead of NaN
df = pd.read_csv(input_file, na_filter=False, dtype=object)

# One print call, one line per argument, followed by the rich display of the head
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Check for missing values and data types
print("Data Info:")
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
df.info()

print("\nMissing values:")
# df was loaded with na_filter=False, so blank cells are empty strings rather
# than NaN and isnull() alone would always report zero. Count both forms.
print((df.isnull() | (df == "")).sum())

## 2. Configure Processing Parameters

Set your manipulation ID and prefix string here:

In [None]:
# Configuration - the two knobs of this experiment; edit them before running the cells below
MANIPULATION_ID = "M001"  # identifier passed to the processor for this manipulation
PREFIX = "DAN: "  # prefix string passed to the processor

# Echo the active settings so the run is self-describing
print(
    f"Manipulation ID: {MANIPULATION_ID}",
    f"Prefix: '{PREFIX}'",
    sep="\n",
)

## 3. Preview the Output

In [None]:
# Build the processor once (later cells reuse it) and ask it for a 10-row preview
processor = CSVProcessor(input_file, output_file)
preview = processor.preview_output(
    MANIPULATION_ID,
    PREFIX,
    num_rows=10,
)

print("Preview of processed data:")
# Show only the columns of interest from the top of the preview
preview.head()[['id', 'prompt', 'manipulation_id', 'probe']]

In [None]:
# Print the first three prompt/probe pairs one under the other for a visual check
print("Example probe transformations:")
for row_label, sample in preview.head(3).iterrows():
    print(
        f"\n[Prompt] {sample['prompt']}",
        f"[Probe]  {sample['probe']}",
        sep="\n",
    )

## 4. Process the Full Dataset

In [None]:
# Process the entire dataset
# Uses the same processor instance, manipulation ID and prefix as the preview above.
# NOTE(review): presumably this writes the enriched rows to output_file, since the
# validation cells read that file back afterwards - confirm against CSVProcessor.
processor.process_csv(MANIPULATION_ID, PREFIX)

## 5. Validate Output Format

In [None]:
# Check the raw file format
# newline='' disables newline translation, so the repr() output below shows the
# line terminators exactly as they are stored on disk (e.g. '\r\n' stays '\r\n'
# instead of being silently rewritten to '\n').
# islice stops after five lines rather than loading the whole file into memory.
with open(output_file, 'r', encoding='utf-8', newline='') as f:
    lines = list(islice(f, 5))  # First 5 lines

print("Raw output format (first 5 lines):")
for i, line in enumerate(lines, 1):
    print(f"{i}: {repr(line)}")

In [None]:
# Read the written file back as raw text (no type inference, blanks kept as
# empty strings) so the summary reflects what is actually on disk
output_df = pd.read_csv(output_file, na_filter=False, dtype=object)

# Final verification
print(
    "Final Results:",
    f"Total rows processed: {len(output_df)}",
    f"Output file: {output_file}",
    f"Output columns: {list(output_df.columns)}",
    f"Manipulation ID: {MANIPULATION_ID}",
    f"Prefix used: '{PREFIX}'",
    "\nProcessing complete!",
    sep="\n",
)