In [1]:
from arrow_types import *
import pyarrow as pa
import pyarrow.parquet as pq


In [2]:
class Use:
    """Plain Python record pairing an identifier with a list of numbers."""

    def __init__(self, id: str, data: list[float]):
        # Both arguments are stored as given; nothing is copied or validated.
        self.id, self.data = id, data

    def to_dict(self) -> dict:
        """Return the record as a mapping with the keys ``"id"`` and ``"data"``."""
        return dict(id=self.id, data=self.data)




In [3]:

class UseType(pa.ExtensionType):
    """Arrow extension type named ``"Use"``.

    The storage type is a struct with two fields: ``id`` (string) and
    ``data`` (list of float32). The type takes no parameters, so its
    serialized form is empty. Scalars of this type are exposed through
    ``UseScalar``.
    """

    def __init__(self):
        storage_type = pa.struct(
            [
                pa.field("id", pa.string()),
                # Stored as 32-bit floats: Python floats are narrowed on
                # conversion and may not round-trip exactly.
                pa.field("data", pa.list_(pa.float32())),
            ]
        )
        super().__init__(storage_type, "Use")

    def __arrow_ext_serialize__(self):
        # No parameters to persist for this type.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # Instantiate through ``cls`` (not a hard-coded class name) so that a
        # subclass deserializes to itself. Both arguments are ignored because
        # the type is parameter-free.
        return cls()

    def __arrow_ext_scalar_class__(self):
        # Scalars taken from arrays of this type are wrapped in UseScalar.
        return UseScalar


class UseScalar(pa.ExtensionScalar):
    """Scalar class for ``UseType`` values; converts them to ``Use`` objects."""

    def as_py(self, **kwargs) -> "Use | None":
        """Return the scalar as a ``Use`` object, or ``None`` for a null value.

        Args:
            **kwargs: Conversion options forwarded unchanged to the ``as_py``
                calls on the ``id`` and ``data`` fields. Newer pyarrow
                releases appear to pass keyword options (e.g.
                ``maps_as_pydicts``) when converting nested values; accepting
                them keeps this override callable in that case.

        Returns:
            A ``Use`` built from the ``id`` and ``data`` storage fields, or
            ``None`` when the scalar holds no value.
        """
        storage = self.value
        # ``value`` is None for a null scalar; indexing it would raise.
        if storage is None:
            return None
        return Use(storage["id"].as_py(**kwargs), storage["data"].as_py(**kwargs))
    
class UseArray(pa.ExtensionArray):
    """Holds the factory that builds ``UseType`` arrays from ``Use`` objects."""

    @classmethod
    def from_Use_list(cls, use_list):
        """Convert an iterable of ``Use`` objects into an array typed as ``UseType``.

        Each object is turned into a dict via its ``to_dict()`` method and the
        resulting list is handed to ``pa.array`` with ``UseType()`` as the type.
        """
        records = []
        for item in use_list:
            records.append(item.to_dict())
        return pa.array(records, type=UseType())
    
# Keep this cell re-runnable: registering the name "Use" a second time raises
# ArrowKeyError. Dropping any earlier registration first also makes the
# registry point at the class object defined by the latest run of this cell.
try:
    pa.unregister_extension_type("Use")
except pa.ArrowKeyError:
    pass  # nothing registered under "Use" yet (first run on a fresh kernel)
pa.register_extension_type(UseType())

In [4]:
# Show the repr of a freshly constructed UseType instance.
UseType()

UseType(StructType(struct<id: string, data: list<item: float>>))

In [5]:

class Test:
    """Python record pairing a ``Use`` object with a scalar value."""

    def __init__(self, use: "Use", sca: float):
        # ``use`` may be None; the matching Arrow field is declared nullable.
        self.use = use
        self.sca = sca

    def to_dict(self):
        """Return the record as nested plain dicts.

        The nested ``use`` object is converted through its own ``to_dict()``
        so the result contains only plain Python values. A ``use`` of
        ``None`` stays ``None``.

        Returns:
            ``{"use": <dict or None>, "sca": <sca>}``
        """
        nested_use = None if self.use is None else self.use.to_dict()
        return {"use": nested_use, "sca": self.sca}

    def to_array(self):
        """Return ``[use, sca]`` with the ``use`` object left unconverted."""
        return [self.use, self.sca]


class TestType(pa.ExtensionType):
    """Arrow extension type named ``"Test"``.

    The storage type is a struct with two nullable fields: ``use`` (the
    ``UseType`` extension type) and ``sca`` (float64). The type takes no
    parameters, so its serialized form is empty. Scalars of this type are
    exposed through ``TestScalar``.
    """

    def __init__(self):
        storage_type = pa.struct(
            [
                pa.field("use", UseType(), nullable=True),
                pa.field("sca", pa.float64(), nullable=True),
            ]
        )
        super().__init__(storage_type, "Test")

    def __arrow_ext_serialize__(self):
        # No parameters to persist for this type.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # Instantiate through ``cls`` (not a hard-coded class name) so that a
        # subclass deserializes to itself. Both arguments are ignored because
        # the type is parameter-free.
        return cls()

    def __arrow_ext_scalar_class__(self):
        # Scalars taken from arrays of this type are wrapped in TestScalar.
        return TestScalar


class TestScalar(pa.ExtensionScalar):
    """Scalar class for ``TestType`` values; converts them to ``Test`` objects."""

    def as_py(self, **kwargs) -> "Test | None":
        """Return the scalar as a ``Test`` object, or ``None`` for a null value.

        Args:
            **kwargs: Conversion options forwarded unchanged to the ``as_py``
                calls on the ``use`` and ``sca`` fields. Newer pyarrow
                releases appear to pass keyword options (e.g.
                ``maps_as_pydicts``) when converting nested values; accepting
                them keeps this override callable in that case.

        Returns:
            A ``Test`` built from the ``use`` and ``sca`` storage fields, or
            ``None`` when the scalar holds no value.
        """
        storage = self.value
        # ``value`` is None for a null scalar; indexing it would raise.
        if storage is None:
            return None
        return Test(storage["use"].as_py(**kwargs), storage["sca"].as_py(**kwargs))
    
# Keep this cell re-runnable: registering the name "Test" a second time raises
# ArrowKeyError. Dropping any earlier registration first also makes the
# registry point at the class object defined by the latest run of this cell.
try:
    pa.unregister_extension_type("Test")
except pa.ArrowKeyError:
    pass  # nothing registered under "Test" yet (first run on a fresh kernel)
pa.register_extension_type(TestType())

In [6]:
# Two sample Use records.
use1 = Use(id="u1", data=[1.2, 3])
use2 = Use(id="u2", data=[1.7, 8.9])

# Arrow array built from both records.
use_arr = UseArray.from_Use_list(use_list=[use1, use2])

# Sample Test records wrapping the Use objects; the scalars are Python ints.
Test1 = Test(use=use1, sca=3)
Test2 = Test(use=use2, sca=4)

In [7]:
# Instance of the "Test" extension type (not referenced again in the cells shown here).
test_type = TestType()

In [8]:

# Struct array with a "use" child (the extension array) and a "sca" child.
sto_test = pa.StructArray.from_arrays(
    [use_arr, pa.array([3.0, 2.1])],
    names=["use", "sca"],
)
# Outer struct array whose single field "Test" holds the struct above.
test_arr = pa.StructArray.from_arrays([sto_test], names=["Test"])

# Take row 0 and convert its nested "use" value to a Python object.
test_arr.take([0])[0]["Test"]["use"].as_py()

<__main__.Use at 0x7f4983bb8d50>

In [9]:
# Python class -> Arrow extension type instance used to encode its objects.
type_mapping = {Use: UseType(), Test: TestType()}


def convert_annotation_list_to_dict(annotation_list):
    """Pivot a list of objects into a mapping of attribute name -> Arrow array.

    Attribute names are read from the instance ``__dict__`` of the first
    object; every object in the list is expected to expose the same
    attributes. For each attribute, the values of all objects are collected
    into one column:

    * If the column's first non-None value is an instance of a class listed
      in the module-level ``type_mapping``, each value is converted with its
      ``to_dict()`` method (None stays None) and the column is built with the
      mapped Arrow type.
    * Otherwise the values are passed to ``pa.array`` as they are and the
      Arrow type is inferred.

    Args:
        annotation_list: Sequence of objects sharing the same attributes.

    Returns:
        A dict mapping each attribute name to a pyarrow array with one entry
        per object; an empty dict when ``annotation_list`` is empty.
    """
    columns = {}

    if len(annotation_list) == 0:
        return columns

    for attr in vars(annotation_list[0]):
        values = [getattr(item, attr) for item in annotation_list]

        # Decide the encoding from the first non-null value so that a leading
        # None does not hide the column's real type.
        sample = next((value for value in values if value is not None), None)
        # isinstance-based lookup also covers subclasses of the mapped classes.
        arrow_type = next(
            (
                mapped_type
                for py_class, mapped_type in type_mapping.items()
                if isinstance(sample, py_class)
            ),
            None,
        )

        if arrow_type is None:
            columns[attr] = pa.array(values)
        else:
            # Build the typed array once; it is not passed through pa.array again.
            columns[attr] = pa.array(
                [None if value is None else value.to_dict() for value in values],
                type=arrow_type,
            )

    return columns

# Build one Arrow array per attribute of the two sample Test objects.
dict_arr = convert_annotation_list_to_dict([Test1,Test2])

In [10]:
# Combine the per-attribute arrays into one struct array whose field names
# are the keys of dict_arr.
arr = pa.StructArray.from_arrays(
    list(dict_arr.values()),
    names=list(dict_arr),
)
arr



<pyarrow.lib.StructArray object at 0x7f4982b0fca0>
-- is_valid: all not null
-- child 0 type: extension<Use<UseType>>
  -- is_valid: all not null
  -- child 0 type: string
    [
      "u1",
      "u2"
    ]
  -- child 1 type: list<item: float>
    [
      [
        1.2,
        3
      ],
      [
        1.7,
        8.9
      ]
    ]
-- child 1 type: int64
  [
    3,
    4
  ]

In [12]:

# Wrap the struct array as the single column 'test' of a table.
table = pa.Table.from_arrays([arr], names = ['test'])

# Write the table to a Parquet file (relative path, resolved against the
# current working directory).
pq.write_table(table,"test_so.parquet")

# Read the file back; `table` is rebound to the copy loaded from disk.
table = pq.read_table("test_so.parquet")

table



pyarrow.Table
test: struct<use: extension<Use<UseType>>, sca: int64>
  child 0, use: extension<Use<UseType>>
  child 1, sca: int64
----
test: [
  -- is_valid: all not null
  -- child 0 type: extension<Use<UseType>>
    -- is_valid: all not null
    -- child 0 type: string
["u1","u2"]
    -- child 1 type: list<item: float>
[[1.2,3],[1.7,8.9]]
  -- child 1 type: int64
[3,4]]

In [13]:
# Select the first row of the table read back from Parquet and convert it to
# a list of Python dicts.
row0 = table.take([0])
row0.to_pylist()

[{'test': {'use': <__main__.Use at 0x7f4990386c50>, 'sca': 3}}]