In [10]:
import pandas as pd
import numpy as np
import orjson, os
from pathlib import Path
from typing import Any

In [12]:
def _encode(obj: Any):
    if isinstance(obj, pd.DataFrame):
        return {
            "__pd_dataframe__": True,
            "columns": obj.columns.tolist(),
            "data": obj.to_dict("records"),
        }
    if isinstance(obj, (np.generic,)):
        return obj.item()
    raise TypeError

def dump_jsonl(iterable, path):
    """Write each item of ``iterable`` as one JSON line to ``path``.

    The rows are first written to a sibling temporary file
    (``<name>.tmp`` next to the target), flushed and fsynced, and only then
    moved over ``path`` with ``os.replace``. If anything fails before the
    replace, the temporary file is removed and ``path`` is left untouched.

    Args:
        iterable: Iterable of rows; each row is serialized with
            ``orjson.dumps`` using ``_encode`` as the fallback encoder.
        path: Destination file, as a ``pathlib.Path`` or a string.

    Raises:
        Whatever the iterable, the serializer, or the file system raises; the
        exception is re-raised after the temporary file has been cleaned up.
    """
    path = Path(path)
    # Append ".tmp" to the full name instead of swapping the suffix, so that
    # "a.jsonl" and "a.csv" never share a temp file and a target that already
    # ends in ".tmp" is not written to directly.
    tmp = path.with_name(path.name + ".tmp")
    try:
        with tmp.open("wb") as f:
            for row in iterable:
                f.write(orjson.dumps(row, default=_encode))
                f.write(b"\n")
            # Push the data to disk before the rename makes it visible.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a half-written temp file behind on failure.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise

def resurrect(obj: Any):
    """Recursively rebuild Python objects from decoded JSON data.

    Lists are processed element by element. A dict whose
    ``"__pd_dataframe__"`` entry is truthy is turned back into a
    ``pd.DataFrame`` from its ``"data"`` and ``"columns"`` entries; any other
    dict has each of its values processed recursively. Every other value is
    returned unchanged.
    """
    if isinstance(obj, list):
        return [resurrect(item) for item in obj]
    if not isinstance(obj, dict):
        return obj
    if obj.get("__pd_dataframe__"):
        return pd.DataFrame(obj["data"], columns=obj["columns"])
    restored = {}
    for key, value in obj.items():
        restored[key] = resurrect(value)
    return restored

def load_jsonl(path: Path):
    """Lazily read a JSON Lines file, yielding one restored record per line.

    This is a generator: the file is opened when iteration starts and is read
    line by line, so the whole file is never held in memory at once. Each line
    is decoded with ``orjson.loads`` and passed through ``resurrect``.

    Args:
        path: File to read, as a ``pathlib.Path`` or a string.

    Yields:
        The decoded and restored object for each non-blank line.
    """
    with Path(path).open("rb") as f:
        for line in f:
            if not line.strip():
                # Whitespace-only lines (e.g. a trailing blank line) are not
                # valid JSON documents; skip them instead of failing the read.
                continue
            yield resurrect(orjson.loads(line))

In [13]:
# Data: all filtered records for tracks 1-7, with an added completion percentage,
# from the file 'filtered_records_1_to_7.jsonl', split into 7 files (one per track).

# https://wutwaw-my.sharepoint.com/:f:/g/personal/01161476_pw_edu_pl/EvWK0T-5Y61FlRiFfp0IEPEBAM63WUY5IVUj0h_CH2f1HA?e=aCLg3b

# How to read a JSONL file
gen = load_jsonl(Path("../filtered_data/filtered_records_1_metrics.jsonl")) # load_jsonl returns a generator, so the whole file is not loaded into memory

# To load the whole file into memory instead (2.5 GB), materialize the generator:
# records_all = list(gen)

# Alternatively, iterate over the generator and process the records one at a time
for record in gen:
    # do something with record
    print(record)
    break  # stop after the first record; this loop is only a preview

{'Volume': 100, 'Global_Date': '20240102', 'ID': '20240102-0003', 'Date': '20240102', 'Consent': True, 'Team_Name': 'BJ2 03', 'Selected_Language': 0, 'Consent_Time_X_axis': '09:58:37:166', 'Age_X_axis': 44.0, 'Companionship_X_axis': 0, 'Question_Time_X_axis': '09:58:43:705', 'Consent_Time_Y_axis': '09:58:39:368', 'Age_Y_axis': 8.0, 'Companionship_Y_axis': 0, 'Question_Time_Y_axis': '09:58:56:652', 'StartTime': '09:58:37:166', 'ClosingTime': '10:06:28:954', 'Completed': True, 'TerminationType': 0, 'track_Track_ID': 1, 'track_Difficulty_Level': 0, 'track_StartTime': '09:59:10:082', 'track_ClosingTime': '09:59:42:281', 'track_Time': 32198, 'track_Mistake': False, 'track_Completed': True, 'track_Interface_Mode': [], 'Points':         X    Y          Time  Status
0       0    0  09:59:10:082       0
1       2    0  09:59:10:098       1
2       3    0  09:59:10:115       1
3       4    0  09:59:10:148       1
4       5    0  09:59:10:215       1
...   ...  ...           ...     ...
1407  790

In [16]:
# load_jsonl reads the JSONL file lazily, one record per line
gen = load_jsonl(Path("../filtered_data/filtered_records_1_metrics.jsonl"))  # this is a generator, so the whole file is not loaded into memory

# Iterate over the generator and print only the selected fields
for record in gen:
    # Check that both metric keys are present in the record
    if 'smoothness' in record and 'stair_ratio' in record:
        record_id = record['ID']
        smoothness = record['smoothness']
        stair_ratio = record['stair_ratio']
        print(f"ID: {record_id}, Smoothness: {smoothness}, Stair Ratio: {stair_ratio}")
    else:
        # The message below is Polish for "No smoothness/stair_ratio data in the record."
        print("Brak danych smoothness/stair_ratio w rekordzie.")
    break  # Only the first record is inspected


ID: 20240102-0003, Smoothness: 0.03718318657511171, Stair Ratio: 0.7512402551381998


In [5]:
# How to save a JSONL file
# sample_records is a list of dicts; dump_jsonl writes each item as one JSON line
sample_records = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
dump_jsonl(sample_records, Path("sample_records.jsonl"))