In [1]:
import json
import random

In [8]:
def sample_and_reformat_jsonl(input_file, output_file, sample_size, seed=None):
    """Sample records from a JSONL file and save each one's 'user' message as a verse.

    Every non-blank line of ``input_file`` is parsed as a JSON object that is
    expected to hold a ``"messages"`` list whose items carry ``"role"`` and
    ``"content"`` keys. ``sample_size`` lines are drawn at random without
    replacement; for each one, the content of the first message whose role is
    ``"user"`` is written to ``output_file`` as ``{"verse": <content>}``, one
    JSON object per line.

    Sampled records that contain no ``"user"`` message are skipped, so the
    output can hold fewer than ``sample_size`` lines.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8); an existing file
            is overwritten.
        sample_size: Number of records to draw.
        seed: Optional seed for a private random generator, which makes the
            sample reproducible. With the default ``None`` the module-level
            ``random`` state is used.

    Raises:
        ValueError: If the file holds fewer than ``sample_size`` non-blank lines.
        json.JSONDecodeError: If a sampled line is not valid JSON.
        KeyError: If a sampled record has no ``"messages"`` key, or an
            inspected message lacks ``"role"`` / ``"content"``.
    """
    with open(input_file, "r", encoding="utf-8") as infile:
        # Blank lines (e.g. a trailing empty line) are not records: keeping
        # them would inflate the count and crash json.loads if one is sampled.
        lines = [line for line in infile if line.strip()]

    # Check if the file has enough records
    if len(lines) < sample_size:
        raise ValueError(f"The file contains only {len(lines)} lines, but {sample_size} lines were requested.")

    # A dedicated generator keeps a seeded run from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    sampled_lines = rng.sample(lines, sample_size)

    # Parse everything before opening the output file, so a malformed record
    # cannot leave a truncated or half-written output behind.
    reformatted_lines = []
    for line in sampled_lines:
        data = json.loads(line)

        # First message authored by the user, if any
        user_message = next(
            (msg for msg in data["messages"] if msg["role"] == "user"),
            None
        )

        if user_message:
            reformatted_lines.append({"verse": user_message["content"]})

    # ensure_ascii=False keeps non-ASCII verse text human-readable in the file.
    with open(output_file, "w", encoding="utf-8") as outfile:
        for reformatted_line in reformatted_lines:
            outfile.write(json.dumps(reformatted_line, ensure_ascii=False) + "\n")

In [9]:
# NOTE(review): this reports the character length of the path string held in
# `input_jsonl_file`, not the number of records in that file. The name is only
# assigned in a later cell, so on a fresh kernel (Restart & Run All) this cell
# raises NameError. Looks like leftover debugging — TODO confirm and remove.
len(input_jsonl_file)

39

In [10]:
# Run configuration: source dataset, destination file, and how many records to draw.
input_jsonl_file = "data/fine_tuning_complete_dataset.jsonl"
output_jsonl_file = "data/all_verses_dataset.jsonl"
# NOTE(review): presumably the total record count of the input file, given the
# "all_verses" output name — TODO confirm; the sampler raises ValueError if the
# file holds fewer lines than this.
sample_size = 13_084

In [11]:
# Draw the sample and write the reformatted verses to disk.
# The draw uses the `random` module's global state, which no visible cell seeds,
# so the selection and order can differ between runs.
sample_and_reformat_jsonl(
    input_file=input_jsonl_file,
    output_file=output_jsonl_file,
    sample_size=sample_size,
)