In [8]:
import pickle
import json
import gzip
import bz2
# import joblib  <-- Removed joblib
import h5py
import os
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

# --- Create a larger dictionary (100x) ---

# Original dictionary (small, for demonstration):
original_dict = {
    'record1': {'data': [1, 2, 3], 'info': {'a': 'hello', 'b': 42}},
    'record2': {'data': [4, 5, 6], 'info': {'a': 'world', 'b': 99}}
}

# Create a larger dictionary by replicating the original
oalex_records_dict_filtered = {}
for i in range(1000):
    for key, value in original_dict.items():
        new_key = f"{key}_{i}"  # Create a unique key
        oalex_records_dict_filtered[new_key] = value  # Copy the value


# --- 1. Pickle ---

def save_with_pickle(data, filename, compress=False, compression_method='gzip'):
    """Saves data using pickle, optionally with compression."""
    if compress:
        if compression_method == 'gzip':
            filepath = filename + ".pkl.gz"
            with gzip.open(filepath, 'wb') as f:
                pickle.dump(data, f)
        elif compression_method == 'bz2':
            filepath = filename + ".pkl.bz2"
            with bz2.open(filepath, 'wb') as f:
                pickle.dump(data, f)
        else:
            raise ValueError("Invalid compression_method. Choose 'gzip' or 'bz2'.")
    else:
        filepath = filename + ".pkl"
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
    return filepath

def load_with_pickle(filename):
    """Loads data saved with pickle, handling compression."""
    if filename.endswith('.gz'):
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f)
    elif filename.endswith('.bz2'):
        with bz2.open(filename, 'rb') as f:
            return pickle.load(f)
    elif filename.endswith('.pkl'):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unsupported file extension.  Expected .pkl, .pkl.gz, or .pkl.bz2")



# --- 2. JSON ---

def save_with_json(data, filename, compress=False, compression_method='gzip'):
    """Saves data using JSON, optionally with compression."""
    if compress:
        if compression_method == 'gzip':
            filepath = filename + ".json.gz"
            with gzip.open(filepath, 'wt', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
        elif compression_method == 'bz2':
            filepath = filename + ".json.bz2"
            with bz2.open(filepath, 'wt', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
        else:
            raise ValueError("Invalid compression_method. Choose 'gzip' or 'bz2'.")
    else:
        filepath = filename + ".json"
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
    return filepath

def load_with_json(filename):
    """Loads data saved with JSON, handling compression."""
    if filename.endswith('.gz'):
        with gzip.open(filename, 'rt', encoding='utf-8') as f:
            return json.load(f)
    elif filename.endswith('.bz2'):
        with bz2.open(filename, 'rt', encoding='utf-8') as f:
            return json.load(f)
    elif filename.endswith('.json'):
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    else:
        raise ValueError("Unsupported file extension.  Expected .json, .json.gz, or .json.bz2")


# --- 3. HDF5 ---

def save_with_h5py(data, filename):
    """Saves a nested dictionary to an HDF5 file."""
    filepath = filename + ".h5"
    with h5py.File(filepath, 'w') as hf:
        _save_dict_to_h5py(hf, '/', data)
    return filepath

def _save_dict_to_h5py(h5file, path, dic):
    """Recursively saves a dictionary to HDF5 groups and datasets."""
    for key, value in dic.items():
        if isinstance(value, dict):
            group = h5file.create_group(path + key)
            _save_dict_to_h5py(group, path + key + '/', value)
        else:
            if isinstance(value, (list, tuple)):
                value = np.array(value)
            h5file[path + key] = value

def load_with_h5py(filename):
    """Loads a nested dictionary from an HDF5 file."""
    with h5py.File(filename, 'r') as hf:
        return _load_dict_from_h5py(hf, '/')

def _load_dict_from_h5py(h5file, path):
    """Recursively loads a dictionary from HDF5 groups and datasets."""
    output = {}
    for key in h5file[path].keys():
        item = h5file[path + key]
        if isinstance(item, h5py.Group):
            output[key] = _load_dict_from_h5py(h5file, path + key + '/')
        else:
            output[key] = item[()]
    return output



# --- 4. Parquet (with PyArrow) ---

def flatten_dict(d, parent_key='', sep='_'):
    """Flattens a nested dictionary into a single-level dictionary."""
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
             items.append((new_key, str(v))) # Convert lists to strings
        else:
            items.append((new_key, v))
    return dict(items)


def save_with_parquet(data, filename):
    """Saves a dictionary to a Parquet file using Pandas and PyArrow."""
    filepath = filename + ".parquet"

    # Flatten the dictionary
    flat_data = [flatten_dict(record) for record in data.values()]

    # Convert to Pandas DataFrame
    df = pd.DataFrame(flat_data)

    # Convert to PyArrow Table
    table = pa.Table.from_pandas(df)

    # Write to Parquet file
    pq.write_table(table, filepath)
    return filepath

def load_with_parquet(filename):
    """Loads data from a Parquet file into a dictionary."""

    # Read Parquet file into PyArrow Table
    table = pq.read_table(filename)

    # Convert to Pandas DataFrame
    df = table.to_pandas()

    # Convert DataFrame back to a list of dictionaries (undo flattening)
    loaded_data = df.to_dict('records')
    
    #reconstruct nested dictionary
    reconstructed_data = {}
    for i, flat_dict in enumerate(loaded_data):
      nested_dict = {}
      for key, value in flat_dict.items():
          parts = key.split('_')
          current_level = nested_dict
          for part in parts[:-1]:
              if part not in current_level:
                  current_level[part] = {}
              current_level = current_level[part]
          
          #try to convert back the lists
          if isinstance(value,str) and value.startswith("[") and value.endswith("]"):
            try:
                value = eval(value) #Use of eval for simplicity. In production, ast.literal_eval would be safer
            except (SyntaxError, ValueError):
                pass
          current_level[parts[-1]] = value
      reconstructed_data[f"record{i+1}"] = nested_dict #use ordered keys as in original dict


    return reconstructed_data

# --- Usage and Comparison ---
filenames = []
#pickle
filepath_pickle = save_with_pickle(oalex_records_dict_filtered, "mydata_pickle")
filenames.append(filepath_pickle)
filepath_pickle_gz = save_with_pickle(oalex_records_dict_filtered, "mydata_pickle_gz", compress=True)
filenames.append(filepath_pickle_gz)
filepath_pickle_bz2 = save_with_pickle(oalex_records_dict_filtered, "mydata_pickle_bz2", compress=True, compression_method='bz2')
filenames.append(filepath_pickle_bz2)

#json
filepath_json = save_with_json(oalex_records_dict_filtered, "mydata_json")
filenames.append(filepath_json)
filepath_json_gz = save_with_json(oalex_records_dict_filtered, "mydata_json_gz", compress=True)
filenames.append(filepath_json_gz)
filepath_json_bz2 = save_with_json(oalex_records_dict_filtered, "mydata_json_bz2", compress=True, compression_method='bz2')
filenames.append(filepath_json_bz2)

#h5py
filepath_h5py = save_with_h5py(oalex_records_dict_filtered, "mydata_h5py")
filenames.append(filepath_h5py)

#parquet
filepath_parquet = save_with_parquet(oalex_records_dict_filtered, "mydata_parquet")
filenames.append(filepath_parquet)


print("\nFile Sizes (MB):")
for filename in filenames:
    size = os.path.getsize(filename) / (1024 * 1024)  # Convert bytes to MB
    print(f"{filename}: {size:.4f} MB")


print("\nLoading Times (seconds):")
load_functions = {  # Use a dictionary to map extensions to loading functions
    ".pkl": load_with_pickle,
    ".pkl.gz": load_with_pickle,
    ".pkl.bz2": load_with_pickle,
    ".json": load_with_json,
    ".json.gz": load_with_json,
    ".json.bz2": load_with_json,
    ".h5": load_with_h5py,
    ".parquet": load_with_parquet
}

for filename in filenames:
    start_time = time.time()
    extension = os.path.splitext(filename)[-1] #get the last extension
    if extension in load_functions:
        loaded_data = load_functions[extension](filename) # Call correct load function
        # Basic check to make sure the loading process went ok
        if extension == ".parquet": #Parquet dict has slightly different keys
            assert len(loaded_data) == len(oalex_records_dict_filtered)
        else:
            assert len(loaded_data) == len(oalex_records_dict_filtered) # Check length
    else:
        print(f"Warning: No loading function found for {filename}")
    end_time = time.time()
    print(f"{filename}: {end_time - start_time:.4f} seconds")


File Sizes (MB):
mydata_pickle.pkl: 0.0304 MB
mydata_pickle_gz.pkl.gz: 0.0044 MB
mydata_pickle_bz2.pkl.bz2: 0.0017 MB
mydata_json.json: 0.3546 MB
mydata_json_gz.json.gz: 0.0053 MB
mydata_json_bz2.json.bz2: 0.0022 MB
mydata_h5py.h5: 5.8276 MB
mydata_parquet.parquet: 0.0025 MB

Loading Times (seconds):
mydata_pickle.pkl: 0.0195 seconds
mydata_pickle_gz.pkl.gz: 0.0000 seconds
mydata_pickle_bz2.pkl.bz2: 0.0000 seconds
mydata_json.json: 0.0259 seconds
mydata_json_gz.json.gz: 0.0000 seconds
mydata_json_bz2.json.bz2: 0.0000 seconds
mydata_h5py.h5: 2.8556 seconds
mydata_parquet.parquet: 0.0599 seconds
