# Shuffling and Saving
This notebook does the following:
* Loads the CSV with prompts, targets, and original indices
* Shuffles the examples to remove personal bias
* Takes the first 10,000 examples
* Saves them as a dataset for experimentation

In [13]:
import pandas as pd
import numpy as np
import argparse

def shuffle_and_sample_csv(input_csv, output_csv, seed=42, sample_size=10000):
    """
    Shuffle a CSV file deterministically and save its first rows to a new CSV.

    The whole input file is read into memory, every row is permuted using
    `seed`, and the leading `sample_size` rows of the permutation are written
    to `output_csv` without the DataFrame index. If the input has fewer than
    `sample_size` rows, all of its rows are written (in shuffled order).

    Parameters:
    input_csv (str): Path to the input CSV file.
    output_csv (str): Path to save the output sampled CSV file.
    seed (int): Random seed for reproducibility.
    sample_size (int): Number of rows to keep; must be non-negative.

    Raises:
    ValueError: If `sample_size` is negative.
    """
    # DataFrame.head(-n) means "all rows except the last n", so a negative
    # size would silently produce a dataset of the wrong length.
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Load CSV
    print("Loading CSV...")
    df = pd.read_csv(input_csv)

    # Shuffle data. The seed is passed straight to `sample`, so NumPy's global
    # random state is left untouched for the rest of the notebook.
    print("Shuffling data...")
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Select top samples
    print(f"Selecting top {sample_size} samples...")
    df_sampled = shuffled.head(sample_size)

    # Save to new CSV
    print(f"Saving to {output_csv}...")
    df_sampled.to_csv(output_csv, index=False)
    print("Process completed successfully.")

# Build the experiment dataset: shuffle the full formatted-prompt CSV with a
# fixed seed and keep the first 10,000 rows.
shuffle_and_sample_csv(
    input_csv="../../../data/prompts/first_formatted_prompts.csv",
    output_csv="../../../data/prompts/shuffled_prompts.csv",
    seed=42,
    sample_size=10000,
)

Loading CSV...
Shuffling data...
Selecting top 10000 samples...
Saving to ../../../data/prompts/shuffled_prompts.csv...
Process completed successfully.


In [16]:
# Load the full formatted-prompt CSV (the file used as the shuffle input above).
df = pd.read_csv("../../../data/prompts/first_formatted_prompts.csv")

In [7]:
# Number of rows in whatever frame `df` currently holds.
# NOTE(review): this cell's execution count (In [7]) is lower than the cells
# around it, so the displayed value came from an earlier kernel state —
# presumably the full CSV; re-run top to bottom to confirm.
len(df)

130326

In [18]:
# Read back the saved shuffled sample. This rebinds `df`, so the full frame
# loaded earlier is no longer reachable under that name.
df = pd.read_csv("../../../data/prompts/shuffled_prompts.csv")

In [19]:
# Preview the first five rows of the shuffled sample.
# NOTE(review): the displayed columns include "Unnamed: 0.1" and "Unnamed: 0",
# which looks like index columns written by earlier to_csv calls upstream —
# confirm and drop them if they are not needed.
df.head()

Unnamed: 0.1,Unnamed: 0,original_index,prompt,original_file
0,686019,755993,\n@Override\n\tpublic void removeAll(Iterator<...,test/9714608/9714608_391.json
1,35689,38981,\n@Override\n\tpublic void removeAll(Iterator<...,train/134236467/134236467_316.json
2,580541,641039,\n@Override\n\tpublic void removeAll(Iterator<...,eval/178848687/178848687_8.json
3,284689,316187,\n@Override\n\tpublic void removeAll(Iterator<...,train/2357859/2357859_474.json
4,336666,375526,\n@Override\n\tpublic void removeAll(Iterator<...,train/78267831/78267831_18.json
