In [1]:
import os
import json

import pandas as pd

In [2]:
# Notebook configuration: how many rows to keep, where to read from, where to write to
num_samples = 20_000
input_path = './magpie_data/magpie_llama70b_00-of-13.parquet'
output_dir = './processed_magpie'
# The file name encodes the requested sample size in thousands (20000 -> "20k")
output_filename = f'magpieclean_{num_samples // 1000}k.jsonl'

# Read the input Parquet file into a DataFrame
df = pd.read_parquet(
    input_path,
)

In [3]:
# Draw a reproducible random subset of at most num_samples rows.
# - A fixed seed (arbitrary value) makes a fresh Restart & Run All write the same file.
# - The subset is bound to a new name instead of overwriting `df`, so re-running
#   this cell always samples from the full loaded frame rather than from a
#   previously sampled one.
sample_seed = 42
if num_samples < len(df):
    sampled_df = df.sample(n=num_samples, random_state=sample_seed)
else:
    # Only warn when the request really exceeds what is available; an exact
    # match is not a shortfall.
    if num_samples > len(df):
        print(f"Requested {num_samples} samples, but only {len(df)} available. Using all rows.")
    sampled_df = df

# Build one chat record per row: a fixed system entry followed by the row's
# instruction (user turn) and response (assistant turn).
# Zipping the two columns avoids constructing a Series per row as iterrows() does.
outputs = [
    {
        "chat": [
            ["system", "llama4_default_sysprompt.txt"],
            ["user", instruction],
            ["assistant", response],
        ]
    }
    for instruction, response in zip(sampled_df['instruction'], sampled_df['response'])
]

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save as JSONL: one JSON object per line
output_path = os.path.join(output_dir, output_filename)
with open(output_path, 'w', encoding='utf-8') as f:
    for obj in outputs:
        f.write(json.dumps(obj) + '\n')

print(f"Saved {len(outputs)} samples to {output_path}")

Saved 20000 samples to ./processed_magpie\magpieclean_20k.jsonl
