# Check out "zarr" for handling the VERGE dataset

In [None]:
import zarr
import numcodecs
from numcodecs import Blosc

import numpy as np
from typing import List, Tuple
import json

In [None]:
# The first draft of this code was produced by Claude.

class ZarrVariableDatasetSeparate:
    """Variable-length training instances, each stored in its own Zarr group.

    Every instance gets a group named after its instance ID that holds a
    ``features`` array and a ``labels`` array.  A per-instance summary
    (shape and row count) is mirrored in the root attribute ``metadata`` so
    that listing instances and computing statistics only needs that attribute.
    """

    def __init__(self, store_path: str, mode: str = 'w'):
        """
        Open the Zarr store backing the dataset.

        Args:
            store_path: Path to zarr store
            mode: 'w' for write, 'r' for read, 'a' for append
        """
        self.store = zarr.open(store_path, mode=mode)
        # Pick up the summary written by a previous session, if there is one.
        self.metadata = self.store.attrs.get('metadata', {})

    def add_instance(self, instance_id: str, features: np.ndarray, labels: np.ndarray):
        """
        Add a training instance, replacing any existing one with the same ID.

        Args:
            instance_id: Unique identifier for this instance
            features: Shape (R, C) features matrix
            labels: Shape (R,) array of class labels

        Raises:
            ValueError: If ``features`` is not 2-D, or ``labels`` is not a
                1-D array with one entry per row of ``features``.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)

        # Validate before touching the store so that a bad call cannot leave
        # a half-written group behind.
        if features.ndim != 2:
            raise ValueError(
                f"features must be 2-D (R, C), got shape {features.shape}"
            )
        if labels.ndim != 1 or labels.shape[0] != features.shape[0]:
            raise ValueError(
                f"labels must have shape ({features.shape[0]},), "
                f"got shape {labels.shape}"
            )
        num_rows, num_cols = features.shape

        # Create group for this instance
        instance_group = self.store.create_group(instance_id, overwrite=True)

        # 'gzip' is not a Blosc codec name; 'zlib' is the DEFLATE-based codec
        # that Blosc provides.  create_dataset hands the compressor straight
        # to the array that is persisted in the group.
        compressor = Blosc(cname='zlib', clevel=1)
        instance_group.create_dataset('features', data=features,
                                      compressor=compressor)
        instance_group.create_dataset('labels', data=labels,
                                      compressor=compressor)

        # Store metadata
        instance_group.attrs['shape'] = features.shape
        instance_group.attrs['num_rows'] = num_rows
        instance_group.attrs['num_cols'] = num_cols

        # Update global metadata
        self.metadata[instance_id] = {
            'shape': features.shape,
            'num_rows': num_rows
        }
        self.store.attrs['metadata'] = self.metadata

    def get_instance(self, instance_id: str) -> Tuple[np.ndarray, np.ndarray]:
        """Get a training instance by ID as ``(features, labels)`` NumPy arrays."""
        instance_group = self.store[instance_id]
        # ``[:]`` reads the full array into memory.
        features = instance_group['features'][:]
        labels = instance_group['labels'][:]
        return features, labels

    def list_instances(self) -> List[str]:
        """Get list of all instance IDs recorded in the metadata."""
        return list(self.metadata)

    def get_stats(self):
        """
        Get dataset statistics.

        Returns:
            An empty dict when no instance is recorded; otherwise a dict with
            ``num_instances``, ``min_rows``, ``max_rows``, ``mean_rows`` and
            ``num_cols`` (taken from the first recorded instance).
        """
        if not self.metadata:
            return {}
        entries = list(self.metadata.values())
        rows = [info['num_rows'] for info in entries]
        return {
            'num_instances': len(entries),
            'min_rows': min(rows),
            'max_rows': max(rows),
            'mean_rows': np.mean(rows),
            'num_cols': entries[0]['shape'][1]
        }


In [None]:
# Open the per-instance dataset under ./zdata in write mode.
fname = './zdata'
dset = ZarrVariableDatasetSeparate(fname, mode='w')

In [None]:
nc = 8  # number of feature columns, shared by every instance

# Write a handful of small random instances.  The loop supplies the index
# ``i`` that the instance ID is built from; without it this cell raises a
# NameError on a fresh kernel.
for i in range(10):
    nr = np.random.randint(4) + 2  # between 2 and 5 rows
    features = np.random.random((nr, nc))
    labels = np.random.randint(5, size=(nr,))  # class labels in 0..4
    instance_id = 'instance-%06d' % i
    print(instance_id, features.shape, labels.shape)
    dset.add_instance(instance_id=instance_id, features=features, labels=labels)


## Based on a ChatGPT suggestion

In [3]:
import zarr
import numpy as np
import os

In [4]:
# Fix the global NumPy seed so the random data below is reproducible.
np.random.seed(seed=1)

In [5]:
# Dataset location and dimensions
root_path = './zdata3'
num_instances = 50  # how many instances to generate (example value)
C = 400             # number of columns, the same for every instance

# Delete whatever a previous run left at this path
if os.path.exists(root_path):
    import shutil
    shutil.rmtree(root_path)

# Open the root group of the new store in write mode
root = zarr.open(root_path, mode='w')



In [6]:
from numcodecs import Blosc
# Blosc with zstd at level 5 and byte shuffle; a good default.
compressor = Blosc(
    cname='zstd',
    clevel=5,
    shuffle=Blosc.SHUFFLE,
)

In [7]:
# Store metadata
root.attrs['num_instances'] = num_instances
root.attrs['num_columns'] = C

# Example: write instances
for i in range(num_instances):
    R = np.random.randint(20, 500)  # Variable number of rows
    features = np.random.rand(R, C).astype('float32')
    labels = np.random.randint(0, 10, size=(R,), dtype='int32')

    # Create a group for this instance
    g = root.create_group(f'instance_{i}')
    g.create_dataset('features', data=features, shape=features.shape, 
                 chunks=(min(100, R), C), dtype='float32', compressor=compressor)
    g.create_dataset('labels', data=labels, shape=labels.shape,
                 chunks=(min(100, R),), dtype='int32', compressor=compressor)


In [2]:
import zarr
import numpy as np
from numcodecs import Blosc
import shutil
import os

# Minimal example: one instance written to a fresh store.
store_path = "my_dataset.zarr"

# Remove the output of any earlier run
if os.path.exists(store_path):
    shutil.rmtree(store_path)

root = zarr.open(store_path, mode='w')

compressor = Blosc(cname='zstd', clevel=5, shuffle=Blosc.SHUFFLE)

# Random float32 features and int32 labels in 0..9
R, C = 100, 422
features = np.random.rand(R, C).astype('float32')
labels = np.random.randint(0, 10, size=R, dtype='int32')

g = root.create_group("instance_0")

# Both arrays are chunked in blocks of 50 rows.
g.create_dataset("features", data=features, shape=features.shape,
                 chunks=(50, C), dtype='float32', compressor=compressor)

# Kept as the last expression so the notebook displays the created array.
g.create_dataset("labels", data=labels, shape=labels.shape,
                 chunks=(50,), dtype='int32', compressor=compressor)


<zarr.core.Array '/instance_0/labels' (100,) int32>