In [None]:
import sys
import os

# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the kernel was started inside the notebooks
# folder — confirm, since os.getcwd() depends on how Jupyter was launched.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root (not a sub-folder) to the Python path so packages
# located under it can be imported. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import os
import msgpack
from tqdm import tqdm

def extract_and_save_first_50_records(shard_fnames, output_dir, limit=50):
    """Copy the first ``limit`` records of each msgpack shard into a new file.

    Each path in ``shard_fnames`` is read as a stream of msgpack objects.
    At most ``limit`` records are taken from the start of the stream and
    written to ``<output_dir>/extracted_<shard basename>``, each record packed
    individually (the output is a stream of objects, not one packed list).

    Args:
        shard_fnames: Iterable of paths to msgpack shard files.
        output_dir: Directory that receives the output files; created if it
            does not exist.
        limit: Maximum number of records kept per shard (default 50). Zero or
            a negative value yields an empty output file.

    Returns:
        None. One summary line is printed per shard.
    """
    # Imported locally so the function does not rely on an import cell that
    # may not have been executed.
    from itertools import islice

    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    # Clamp once: islice rejects negative stop values, while a non-positive
    # limit is simply expected to select nothing.
    max_records = max(limit, 0)

    for shard_fname in tqdm(shard_fnames):
        # Read the shard file lazily; islice stops after `max_records` items
        # without decoding the record that follows them.
        with open(shard_fname, "rb") as infile:
            unpacker = msgpack.Unpacker(infile, raw=False)
            extracted_records = list(islice(unpacker, max_records))

        # Define the new filename for the output
        shard_basename = os.path.basename(shard_fname)
        output_fname = os.path.join(output_dir, f"extracted_{shard_basename}")

        # Save the extracted records to a new .msg file, writing them one by one (not as a list)
        with open(output_fname, "wb") as outfile:
            packer = msgpack.Packer()
            for record in extracted_records:
                outfile.write(packer.pack(record))  # Write each record individually

        print(f"Saved {len(extracted_records)} records to {output_fname}")

# Example usage
dataset_dir = "../data/trial_data/shards/"
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the printed log) reproducible across runs and machines.
shard_fnames = [
    os.path.join(dataset_dir, fname)
    for fname in sorted(os.listdir(dataset_dir))
    if fname.endswith(".msg")
]
output_dir = "../data/trial_data/extracted_shards/"

# Extract and save the first 50 records from each shard
extract_and_save_first_50_records(shard_fnames, output_dir, limit=50)


100%|██████████| 1/1 [00:00<00:00, 210.65it/s]

Saved 50 records to ../data/trial_data/extracted_shards/extracted_shard_1.msg



