# Split by Size

python split_csv.py --input "C:\Data\big file.csv" --mb-per-file 100 --out-dir "C:\Data\chunks"

# Split by Rows
python split_csv.py --input "C:\Data\big file.csv" --rows-per-file 200000 --out-dir "C:\Data\chunks"



In [1]:
# split_csv.py
import csv, io, os, argparse

def split_by_rows(inp, out_dir, rows_per_file, encoding="utf-8"):
    """Split a CSV file into parts of at most ``rows_per_file`` data rows.

    The first row of ``inp`` is treated as the header and is repeated at the
    top of every part. Parts are written to ``out_dir`` (created if missing)
    as ``<input basename>.part_NNN.csv``, numbered from 001.

    Args:
        inp: Path of the CSV file to split.
        out_dir: Directory that receives the part files.
        rows_per_file: Maximum number of data rows (header excluded) per part.
        encoding: Text encoding used to read the input and write the parts.

    Raises:
        ValueError: If ``rows_per_file`` is less than 1.
    """
    if rows_per_file < 1:
        raise ValueError(
            f"rows_per_file must be a positive integer, got {rows_per_file!r}"
        )

    os.makedirs(out_dir, exist_ok=True)
    base_name = os.path.basename(inp)

    with open(inp, "r", newline="", encoding=encoding) as fin:
        reader = csv.reader(fin)
        header = next(reader, None)
        if header is None:
            # A completely empty input has no header to repeat, so there is
            # nothing meaningful to write.
            print(f"Input {inp} is empty; nothing to split.")
            return

        def open_part(idx):
            path = os.path.join(out_dir, f"{base_name}.part_{idx:03d}.csv")
            fout = open(path, "w", newline="", encoding=encoding)
            try:
                w = csv.writer(fout)
                w.writerow(header)
            except BaseException:
                # Do not leak the handle if the header cannot be written.
                fout.close()
                raise
            return fout, w

        part = 1
        fout, writer = open_part(part)
        written = 0
        try:
            for row in reader:
                # Roll over lazily, only when another row actually arrives,
                # so no trailing header-only part is created.
                if written >= rows_per_file:
                    fout.close()
                    part += 1
                    fout, writer = open_part(part)
                    written = 0
                writer.writerow(row)
                written += 1
        finally:
            # Closing an already-closed file is a no-op, so this is safe even
            # when opening the next part failed.
            fout.close()
    print(f"Done. Files saved in {out_dir}")

def split_by_mb(inp, out_dir, mb_per_file, encoding="utf-8"):
    """Split a CSV file into parts of roughly ``mb_per_file`` megabytes each.

    The first row of ``inp`` is treated as the header and is repeated at the
    top of every part. Parts are written to ``out_dir`` (created if missing)
    as ``<input basename>.part_NNN.csv``, numbered from 001. Rows are never
    split: a new part is started when adding the next row would push the
    current part over the limit. A single row larger than the limit is still
    written, alone, in its own part.

    Args:
        inp: Path of the CSV file to split.
        out_dir: Directory that receives the part files.
        mb_per_file: Target size of each part in MiB (1024 * 1024 bytes).
        encoding: Text encoding used to read the input and write the parts.

    Raises:
        ValueError: If ``mb_per_file`` does not amount to at least one byte.
    """
    import codecs  # local import: the module-level import line is left as-is

    target_bytes = int(mb_per_file * 1024 * 1024)
    if target_bytes < 1:
        raise ValueError(
            f"mb_per_file must amount to at least one byte, got {mb_per_file!r}"
        )

    os.makedirs(out_dir, exist_ok=True)
    base_name = os.path.basename(inp)

    with open(inp, "r", newline="", encoding=encoding) as fin:
        reader = csv.reader(fin)
        header = next(reader, None)
        if header is None:
            # A completely empty input has no header to repeat, so there is
            # nothing meaningful to write.
            print(f"Input {inp} is empty; nothing to split.")
            return

        # One reusable scratch buffer formats each row exactly as csv.writer
        # will emit it. An incremental encoder is used for the byte count so
        # that a BOM (utf-16, utf-8-sig, ...) is counted once, with the
        # header, instead of once per row as str.encode() would do.
        encoder = codecs.getincrementalencoder(encoding)()
        scratch = io.StringIO()
        scratch_writer = csv.writer(scratch)

        def encoded_size(fields):
            scratch.seek(0)
            scratch.truncate(0)
            scratch_writer.writerow(fields)
            return len(encoder.encode(scratch.getvalue()))

        # Measured first, so this figure includes any BOM the file starts with.
        header_bytes = encoded_size(header)

        def open_part(idx):
            path = os.path.join(out_dir, f"{base_name}.part_{idx:03d}.csv")
            f = open(path, "w", newline="", encoding=encoding)
            try:
                w = csv.writer(f)
                w.writerow(header)
            except BaseException:
                # Do not leak the handle if the header cannot be written.
                f.close()
                raise
            return f, w

        part = 1
        fout, writer = open_part(part)
        current_bytes = header_bytes
        try:
            for row in reader:
                row_bytes = encoded_size(row)

                # Start a new part if this row would exceed the target, but
                # only when the current part already holds at least one data
                # row; otherwise an oversized row would loop forever creating
                # header-only files.
                if (current_bytes + row_bytes > target_bytes
                        and current_bytes > header_bytes):
                    fout.close()
                    part += 1
                    fout, writer = open_part(part)
                    current_bytes = header_bytes

                writer.writerow(row)
                current_bytes += row_bytes
        finally:
            # Closing an already-closed file is a no-op, so this is safe even
            # when opening the next part failed.
            fout.close()
    print(f"Done. Files saved in {out_dir}")

def main(argv=None):
    """Parse command-line arguments and run the requested split.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser(description="Split a large CSV into smaller files, preserving the header.")
    ap.add_argument("--input", required=True, help="Path to the big CSV")
    ap.add_argument("--out-dir", default="chunks", help="Output folder for parts")
    grp = ap.add_mutually_exclusive_group(required=True)
    grp.add_argument("--rows-per-file", type=int, help="Number of data rows per output file")
    grp.add_argument("--mb-per-file", type=float, help="Approx size per output file in MB")
    ap.add_argument("--encoding", default="utf-8", help="File encoding, for example utf-8 or latin-1")
    args = ap.parse_args(argv)

    # Compare against None rather than testing truthiness: "--rows-per-file 0"
    # is falsy and would otherwise fall through to the size-based branch with
    # mb_per_file unset.
    if args.rows_per_file is not None:
        if args.rows_per_file < 1:
            ap.error("--rows-per-file must be a positive integer")
        split_by_rows(args.input, args.out_dir, args.rows_per_file, args.encoding)
    else:
        if not args.mb_per_file > 0:
            ap.error("--mb-per-file must be greater than zero")
        split_by_mb(args.input, args.out_dir, args.mb_per_file, args.encoding)


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --input INPUT [--out-dir OUT_DIR]
                             (--rows-per-file ROWS_PER_FILE |
                             --mb-per-file MB_PER_FILE) [--encoding ENCODING]
ipykernel_launcher.py: error: the following arguments are required: --input


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
