In [None]:
# !pip install datasets huggingface_hub

In [4]:
# # Download OpenCodeReasoning Dataset
# This script downloads the OpenCodeReasoning dataset from Hugging Face
# using STREAMING MODE to avoid caching the full dataset to disk.

# ## Cell 1: Install dependencies
# !pip install datasets huggingface_hub pandas pyarrow

# ## Cell 2: Imports
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset

# ## Cell 3: Configuration
DATASET_NAME    = "nvidia/OpenCodeReasoning"
CONFIGS         = ["split_0", "split_1"]
COLUMNS_TO_KEEP = ["id", "input", "source", "dataset", "license", "split", "difficulty", "solution"]
OUTPUT_PARQUET  = "opencodereasoning_filtered.parquet"
BATCH_SIZE      = 5_000   # rows buffered in memory before flushing to parquet

print(f"Dataset : {DATASET_NAME}")
print(f"Configs : {CONFIGS}")
print(f"Columns : {COLUMNS_TO_KEEP}")
print(f"Output  : {OUTPUT_PARQUET}")


Dataset : nvidia/OpenCodeReasoning
Configs : ['split_0', 'split_1']
Columns : ['id', 'input', 'source', 'dataset', 'license', 'split', 'difficulty', 'solution']
Output  : opencodereasoning_filtered.parquet


In [5]:
# ## Cell 4: Stream both configs and write directly to Parquet
# streaming=True means rows are fetched on-the-fly — no full dataset cache written to disk.

parquet_writer = None
total_rows     = 0

for config in CONFIGS:
    print(f"\n--- Streaming config: {config} ---")
    ds_stream = load_dataset(DATASET_NAME, config, split=config, streaming=True)

    batch = []
    for row in ds_stream:
        filtered_row = {col: row.get(col) for col in COLUMNS_TO_KEEP if col in row}
        batch.append(filtered_row)

        if len(batch) >= BATCH_SIZE:
            table = pa.Table.from_pandas(pd.DataFrame(batch), preserve_index=False)
            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(OUTPUT_PARQUET, table.schema)
            parquet_writer.write_table(table)
            total_rows += len(batch)
            print(f"  Rows written so far: {total_rows:,}", end="\r")
            batch = []

    # Flush any remaining rows for this config
    if batch:
        table = pa.Table.from_pandas(pd.DataFrame(batch), preserve_index=False)
        if parquet_writer is None:
            parquet_writer = pq.ParquetWriter(OUTPUT_PARQUET, table.schema)
        parquet_writer.write_table(table)
        total_rows += len(batch)

    print(f"\n  Finished {config}. Total rows written: {total_rows:,}")


--- Streaming config: split_0 ---


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

  Rows written so far: 565,000
  Finished split_0. Total rows written: 567,850

--- Streaming config: split_1 ---


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

  Rows written so far: 732,850
  Finished split_1. Total rows written: 735,255


In [7]:
if parquet_writer:
    parquet_writer.close()

print(f"\nStreaming complete!")
print(f"Total rows : {total_rows:,}")
print(f"Saved to   : {OUTPUT_PARQUET}")

# ## Cell 5: Reload from Parquet and preview
import pyarrow.parquet as pq
import pyarrow as pa

print("\n--- Reloading from Parquet for inspection ---")

# Read with pyarrow directly, then convert — avoids the pandas/pyarrow compat bug
table = pq.read_table(OUTPUT_PARQUET)
df = table.to_pandas(safe=False)

print(f"Shape   : {df.shape}")
print(f"Columns : {list(df.columns)}")

preview = df.head(5).copy()
for col in ["input", "solution"]:
    if col in preview.columns:
        preview[col] = preview[col].str[:100] + "..."
print("\nFirst 5 rows:")
print(preview.to_string(index=False))

# ## Cell 6: Dataset statistics
print("\n--- Dataset Statistics ---")
for col in ["source", "difficulty", "dataset", "license"]:
    if col in df.columns:
        print(f"\nValue counts — {col}:")
        print(df[col].value_counts().to_string())

print("\nDone!")


Streaming complete!
Total rows : 735,255
Saved to   : opencodereasoning_filtered.parquet

--- Reloading from Parquet for inspection ---
Shape   : (735255, 8)
Columns : ['id', 'input', 'source', 'dataset', 'license', 'split', 'difficulty', 'solution']

First 5 rows:
                              id                                                                                                    input   source       dataset   license split         difficulty                                                                                                     solution
c0d4209f929db2b5bf3526a47a2520b0 Problem description.\nVipul is a hardworking super-hero who maintains the bracket ratio of all the st... codechef code_contests cc-by-4.0 train UNKNOWN_DIFFICULTY T = int(input())\nfor _ in range(T):\n    s = input().strip()\n    stack = []\n    valid = True\n    for ...
5378dbbc2f9928bfe9b3a196e3a45a0b  The Chef likes to stay in touch with his staff. So, the Chef, the head server, and the so

In [10]:
# ## Split large Parquet into ~40MB chunks (JSON) for GitHub
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import os

INPUT_PARQUET = "opencodereasoning_filtered.parquet"
OUTPUT_DIR    = "opencodereasoning_parts"
TARGET_MB     = 10
TARGET_BYTES  = TARGET_MB * 1024 * 1024

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load with pyarrow directly
print("Reading parquet...")
table = pq.read_table(INPUT_PARQUET)
total_rows = len(table)
file_size  = os.path.getsize(INPUT_PARQUET)

bytes_per_row  = file_size / total_rows
rows_per_chunk = int(TARGET_BYTES / bytes_per_row)

print(f"Total rows : {total_rows:,}")
print(f"File size  : {file_size / 1024**2:.1f} MB")
print(f"Target     : {TARGET_MB} MB per part")
print(f"~Rows/chunk: {rows_per_chunk:,}")

# Split and write as JSON
part = 0
for start in range(0, total_rows, rows_per_chunk):
    chunk_df = table.slice(start, rows_per_chunk).to_pandas(safe=False)
    out_path = os.path.join(OUTPUT_DIR, f"part_{part:04d}.json")
    chunk_df.to_json(out_path, orient="records", lines=True, force_ascii=False)
    size_mb = os.path.getsize(out_path) / 1024**2
    print(f"  part_{part:04d}.json — {len(chunk_df):,} rows — {size_mb:.1f} MB")
    part += 1

print(f"\nDone! {part} files written to: {OUTPUT_DIR}/")

Reading parquet...
Total rows : 735,255
File size  : 772.6 MB
Target     : 10 MB per part
~Rows/chunk: 9,516
  part_0000.json — 9,516 rows — 26.6 MB
  part_0001.json — 9,516 rows — 26.6 MB
  part_0002.json — 9,516 rows — 26.6 MB
  part_0003.json — 9,516 rows — 26.6 MB
  part_0004.json — 9,516 rows — 26.6 MB
  part_0005.json — 9,516 rows — 26.5 MB
  part_0006.json — 9,516 rows — 26.7 MB
  part_0007.json — 9,516 rows — 26.6 MB
  part_0008.json — 9,516 rows — 26.6 MB
  part_0009.json — 9,516 rows — 26.5 MB
  part_0010.json — 9,516 rows — 26.5 MB
  part_0011.json — 9,516 rows — 26.6 MB
  part_0012.json — 9,516 rows — 26.5 MB
  part_0013.json — 9,516 rows — 26.5 MB
  part_0014.json — 9,516 rows — 26.6 MB
  part_0015.json — 9,516 rows — 26.7 MB
  part_0016.json — 9,516 rows — 26.6 MB
  part_0017.json — 9,516 rows — 26.6 MB
  part_0018.json — 9,516 rows — 26.6 MB
  part_0019.json — 9,516 rows — 26.5 MB
  part_0020.json — 9,516 rows — 26.7 MB
  part_0021.json — 9,516 rows — 26.5 MB
  part_0022

In [1]:
# ## Validate Parquet vs JSON parts before deleting the original

import pyarrow.parquet as pq
import pandas as pd
import os

INPUT_PARQUET = "opencodereasoning_filtered.parquet"
OUTPUT_DIR    = "opencodereasoning_parts"

# --- Step 1: Load original parquet ---
print("Reading original parquet...")
table      = pq.read_table(INPUT_PARQUET)
df_orig    = table.to_pandas(safe=False)
total_rows = len(df_orig)
print(f"Original : {total_rows:,} rows, {list(df_orig.columns)}")

# --- Step 2: Load all JSON parts ---
json_files = sorted([f for f in os.listdir(OUTPUT_DIR) if f.endswith(".json")])
print(f"\nFound {len(json_files)} JSON part files in '{OUTPUT_DIR}/'")

df_parts = pd.concat(
    [pd.read_json(os.path.join(OUTPUT_DIR, f), lines=True) for f in json_files],
    ignore_index=True
)
print(f"Combined : {len(df_parts):,} rows, {list(df_parts.columns)}")

# --- Step 3: Checks ---
print("\n--- Validation Checks ---")
checks_passed = True

# Row count
if len(df_parts) == total_rows:
    print(f"  [PASS] Row count matches: {total_rows:,}")
else:
    print(f"  [FAIL] Row count mismatch — original: {total_rows:,}, parts: {len(df_parts):,}")
    checks_passed = False

# Column match
if set(df_parts.columns) == set(df_orig.columns):
    print(f"  [PASS] Columns match: {list(df_orig.columns)}")
else:
    print(f"  [FAIL] Column mismatch — original: {set(df_orig.columns)}, parts: {set(df_parts.columns)}")
    checks_passed = False

# Null counts per column
print("\n  Null counts (original vs combined):")
null_match = True
for col in df_orig.columns:
    n_orig  = df_orig[col].isna().sum()
    n_parts = df_parts[col].isna().sum()
    status  = "PASS" if n_orig == n_parts else "FAIL"
    if status == "FAIL":
        null_match = False
        checks_passed = False
    print(f"    [{status}] {col}: original={n_orig:,}  parts={n_parts:,}")

# Sample content check — compare first and last 5 rows by 'id'
if "id" in df_orig.columns:
    orig_ids  = set(df_orig["id"].astype(str))
    parts_ids = set(df_parts["id"].astype(str))
    if orig_ids == parts_ids:
        print(f"\n  [PASS] All IDs match ({len(orig_ids):,} unique)")
    else:
        missing  = orig_ids - parts_ids
        extra    = parts_ids - orig_ids
        print(f"\n  [FAIL] ID mismatch — missing: {len(missing):,}, extra: {len(extra):,}")
        checks_passed = False

# --- Step 4: Verdict ---
print("\n--- Verdict ---")
if checks_passed:
    print("  All checks passed! Safe to delete the original parquet.")
    print(f"\n  To delete, run:  os.remove('{INPUT_PARQUET}')")
    # Uncomment the line below to auto-delete:
    # os.remove(INPUT_PARQUET)
else:
    print("  One or more checks FAILED. Do NOT delete the original parquet yet.")

Reading original parquet...
Original : 735,255 rows, ['id', 'input', 'source', 'dataset', 'license', 'split', 'difficulty', 'solution']

Found 78 JSON part files in 'opencodereasoning_parts/'
Combined : 735,255 rows, ['id', 'input', 'source', 'dataset', 'license', 'split', 'difficulty', 'solution']

--- Validation Checks ---
  [PASS] Row count matches: 735,255
  [PASS] Columns match: ['id', 'input', 'source', 'dataset', 'license', 'split', 'difficulty', 'solution']

  Null counts (original vs combined):
    [PASS] id: original=0  parts=0
    [PASS] input: original=0  parts=0
    [PASS] source: original=0  parts=0
    [PASS] dataset: original=0  parts=0
    [PASS] license: original=0  parts=0
    [PASS] split: original=0  parts=0
    [PASS] difficulty: original=0  parts=0
    [PASS] solution: original=0  parts=0

  [PASS] All IDs match (28,319 unique)

--- Verdict ---
  All checks passed! Safe to delete the original parquet.

  To delete, run:  os.remove('opencodereasoning_filtered.parq

In [2]:
os.remove("opencodereasoning_filtered.parquet")
print("Deleted opencodereasoning_filtered.parquet")

Deleted opencodereasoning_filtered.parquet


In [5]:
print("Unique values check...")
print(f"  input    : {df_orig['id'].nunique():,} unique out of {len(df_orig):,} rows")
print(f"  solution : {df_orig['solution'].nunique():,} unique out of {len(df_orig):,} rows")

Unique values check...
  input    : 28,319 unique out of 735,255 rows
  solution : 626,241 unique out of 735,255 rows


In [6]:
df_orig.shape

(735255, 8)