In [28]:
import pandas as pd
from pathlib import Path
# Location of the prepared fine-tuning files. pathlib does not expand '~' by
# itself, so resolve the home directory explicitly rather than relying on the
# file reader to do it.
data_dir = Path('~/drive/Projects/backups/ocs/ocsai-py/data/ocsai1').expanduser()

# Maps split name ('train' / 'val') to a DataFrame with 'prompt', 'response'
# and 'score' columns (plus whatever other columns the JSONL files carry).
splits = dict()
for (split, fname) in [('train', 'finetune-gt_main2_prepared_train.jsonl'), ('val', 'finetune-gt_main2_prepared_val.jsonl')]:
    df = pd.read_json(data_dir / fname, lines=True)

    # Pull the AUT prompt and the response out of the raw prompt text.
    # Rows that do not match the pattern end up as NaN in both columns.
    df[['prompt', 'response']] = df['prompt'].str.extract(r'AUT Prompt:(.+)\nResponse:(.+)\nScore:', expand=True)

    df['prompt'] = df['prompt'].str.strip()
    df['response'] = df['response'].str.strip()

    # Cast the completion to int and divide by 10 to obtain the score
    # (presumably the files store the score multiplied by 10 -- TODO confirm).
    # Bracket indexing is used instead of attribute access for column assignment.
    df['completion'] = df['completion'].astype(int).div(10)
    df = df.rename(columns={'completion': 'score'})
    splits[split] = df

In [26]:
import json
import os

# Prompts for which a validation JSON file was written; the train-analysis
# cell iterates over this list.
saved_prompts = []

# Number of validation examples exported per prompt. Defined at cell level
# (not inside the loop) so it exists even when no prompt qualifies.
n_examples = 100

# Create the output directory once, up front, if it doesn't exist
output_dir = '../src/lib/data/datasets'
os.makedirs(output_dir, exist_ok=True)

val_df = splits['val']

# Group by prompt and create separate JSON files
for prompt in val_df['prompt'].unique():
    # Filter data for this prompt
    prompt_data = val_df[val_df['prompt'] == prompt]

    # Only prompts with more than n_examples rows are exported; the rest are skipped.
    # NOTE(review): a prompt with exactly n_examples rows is skipped too -- confirm
    # whether `>=` was intended.
    if prompt_data.shape[0] <= n_examples:
        continue

    # Limit to n_examples examples, randomized with a fixed seed for reproducibility
    prompt_data = prompt_data.sample(n_examples, random_state=1234)
    saved_prompts.append(prompt)

    # File-name-safe form of the prompt: lowercase with spaces replaced by
    # underscores. Used for BOTH the JSON filename and the analysis URL so the
    # link matches the name under which the analysis markdown file is saved.
    slug = prompt.lower().replace(' ', '_')

    # Create the examples list ('score' is the ground-truth column)
    examples = [
        {"text": text, "truth": truth}
        for text, truth in zip(prompt_data['response'], prompt_data['score'])
    ]

    # Create the JSON structure
    dataset = {
        "description": f"This dataset contains responses to the question, 'What is a surprising use for a {prompt}?'. It is sourced from multiple studies, with different participant groups and different experimental conditions. The responses are scores by human judges on a scale of 1-5, with 1 being the least original and 5 being the most original. The ground truth is an average of multiple judges, rounded to one decimal point.",
        "train_data_url": f"https://github.com/organisciak/ai-class/tree/main/static/datasets/aut_{slug}_analysis.md",
        "examples": examples
    }

    filename = f"aut_{slug}.json"

    # Write the JSON file
    with open(f'{output_dir}/{filename}', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=4)

Save the train data for each exported prompt to an easy-to-reference Markdown file.

In [27]:
# Create one markdown file per saved prompt for train data analysis
# (the full train data is embedded in the file as a CSV code block).
import os  # repeated here so this cell does not rely on another cell's import

# Number of rows shown in each of the top / bottom / median sections
n_section_rows = 5

# Make sure the output directory exists before writing into it
analysis_dir = '../static/datasets'
os.makedirs(analysis_dir, exist_ok=True)

train_df = splits['train']

for prompt in saved_prompts:
    # Filter train data for this prompt
    prompt_data = train_df[train_df['prompt'] == prompt]

    # Sort by score, highest first
    sorted_data = prompt_data.sort_values('score', ascending=False)

    # Highest- and lowest-scored responses
    top_responses = sorted_data.head(n_section_rows)
    bottom_responses = sorted_data.tail(n_section_rows)

    # Get median responses: the rows around the middle of THIS prompt's sorted
    # train data (the midpoint comes from the actual row count, not from the
    # validation sample size). The start is clamped at 0 so a very small group
    # cannot produce a negative slice start.
    median_idx = len(sorted_data) // 2
    median_start = max(median_idx - n_section_rows // 2, 0)
    median_responses = sorted_data.iloc[median_start:median_start + n_section_rows]

    # Create markdown content
    markdown_content = f"""# AUT Analysis: {prompt}

## Examples: Top Responses
{top_responses[['response', 'score']].to_markdown(index=False)}

## Examples: Bottom Responses
{bottom_responses[['response', 'score']].to_markdown(index=False)}

## Median Responses
{median_responses[['response', 'score']].to_markdown(index=False)}

## All Train Examples

```csv
{prompt_data[['response', 'score']].to_csv(index=False).strip()}
```
"""

    # Save markdown file (UTF-8 so non-ASCII responses are written on any platform)
    markdown_filename = f"aut_{prompt.lower().replace(' ', '_')}_analysis.md"
    with open(f'{analysis_dir}/{markdown_filename}', 'w', encoding='utf-8') as f:
        f.write(markdown_content)