In [1]:
import pandas as pd
import numpy as np
import orjson, os
from pathlib import Path
from typing import Any

In [2]:
def _encode(obj: Any):
    if isinstance(obj, pd.DataFrame):
        return {
            "__pd_dataframe__": True,
            "columns": obj.columns.tolist(),
            "data": obj.to_dict("records"),
        }
    if isinstance(obj, (np.generic,)):
        return obj.item()
    raise TypeError

def dump_jsonl(iterable, path):
    """Write each item of ``iterable`` as one JSON line to ``path``, atomically.

    Rows are serialized with ``orjson`` (using ``_encode`` as the fallback for
    unsupported types) into a sibling temporary file, which is flushed and
    fsync'ed and then moved over ``path`` with ``os.replace``. If anything
    fails before the replace, the temporary file is removed and ``path`` is
    left untouched.

    Args:
        iterable: Any iterable of rows to serialize (it is consumed once).
        path: Destination file; a ``pathlib.Path`` or anything ``Path()``
            accepts (e.g. a ``str``).

    Raises:
        TypeError: Propagated from serialization when a row contains an
            unsupported object.
        OSError: Propagated from file operations.
    """
    path = Path(path)
    # Append ".tmp" instead of swapping the suffix, so "a.json" and "a.jsonl"
    # do not share a temp file and a target ending in ".tmp" is not its own
    # temp file.
    tmp = path.with_name(path.name + ".tmp")
    try:
        with tmp.open("wb") as f:
            for row in iterable:
                f.write(orjson.dumps(row, default=_encode))
                f.write(b"\n")
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a partial temp file behind.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise

def resurrect(obj: Any):
    """Rebuild rich objects from the plain structure produced by JSON decoding.

    Walks dicts and lists recursively. A dict whose ``"__pd_dataframe__"``
    entry is truthy is converted into a ``pd.DataFrame`` built from its
    ``"data"`` and ``"columns"`` entries; every other dict and list is copied
    with its values restored in turn. Any other value is returned unchanged.
    """
    if isinstance(obj, list):
        return list(map(resurrect, obj))
    if not isinstance(obj, dict):
        # Scalars (and anything else) pass through untouched.
        return obj
    if obj.get("__pd_dataframe__"):
        return pd.DataFrame(obj["data"], columns=obj["columns"])
    restored = {}
    for key, value in obj.items():
        restored[key] = resurrect(value)
    return restored

def load_jsonl(path: "str | os.PathLike"):
    """Lazily read a JSON Lines file, yielding one restored record per line.

    The file is read line by line, so only the current record is held in
    memory. Each line is decoded with ``orjson`` and passed through
    ``resurrect``. Whitespace-only lines (e.g. a trailing blank line) are
    skipped instead of raising a decode error.

    Args:
        path: File to read; a ``pathlib.Path`` or anything ``Path()`` accepts
            (e.g. a ``str``).

    Yields:
        The decoded and restored object for each non-blank line.
    """
    with Path(path).open("rb") as f:
        for line in f:
            if not line.strip():
                continue
            yield resurrect(orjson.loads(line))

In [4]:
# Data: all filtered records for tracks 1-7, with an added percent-of-completion value,
# stored in the file 'filtered_records_1_to_7.jsonl'.
# Also split into 7 files, one for each track.

# https://wutwaw-my.sharepoint.com/:f:/g/personal/01161476_pw_edu_pl/EvWK0T-5Y61FlRiFfp0IEPEBAM63WUY5IVUj0h_CH2f1HA?e=aCLg3b

# How to read a JSONL file
gen = load_jsonl(Path("filtered_records_1_to_7.jsonl")) # load_jsonl returns a generator, so the whole file is not loaded into memory at once

# To load the whole file into memory (2.5 GB), materialize the generator:
# records_all = list(gen)

# Or iterate over the generator and process the records one at a time
for record in gen:
    # do something with record
    print(record)
    break  # stop after the first record; this loop is only a demonstration

{'Volume': 100, 'Global_Date': '20240102', 'ID': '20240102-0000', 'Date': '20240102', 'Consent': True, 'Team_Name': 'BJ2 00', 'Selected_Language': 0, 'Consent_Time_X_axis': '09:30:48:500', 'Age_X_axis': 25.0, 'Companionship_X_axis': 1, 'Question_Time_X_axis': '09:30:55:925', 'Consent_Time_Y_axis': '09:30:27:247', 'Age_Y_axis': 24.0, 'Companionship_Y_axis': 1, 'Question_Time_Y_axis': '09:30:40:859', 'StartTime': '09:30:27:247', 'ClosingTime': '09:38:18:265', 'Completed': True, 'TerminationType': 2, 'track_Track_ID': 2, 'track_Difficulty_Level': 0, 'track_StartTime': '09:31:04:583', 'track_ClosingTime': '09:31:49:178', 'track_Time': 44594, 'track_Mistake': False, 'track_Completed': True, 'track_Interface_Mode': [], 'Points':          X     Y          Time  Status
0        0     0  09:31:04:583       0
1        1     0  09:31:04:615       1
2        2     0  09:31:04:649       1
3        3     0  09:31:04:683       1
4        4     0  09:31:04:716       1
...    ...   ...           ...   

In [None]:
# How to save a JSONL file
# dump_jsonl takes an iterable of rows (here a list of dicts) and writes one JSON line per row
sample_records = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
dump_jsonl(sample_records, Path("sample_records.jsonl"))