In [None]:
import micromegas
import datetime
import pandas as pd
import pyarrow as pa
# Never truncate wide cell values when DataFrames are rendered.
pd.options.display.max_colwidth = None

# Shared client used by every query cell below; preserve_dictionary=True is
# passed straight to micromegas.connect (presumably keeps dictionary-encoded
# columns encoded in the results -- confirm against the micromegas docs).
client = micromegas.connect(preserve_dictionary=True)

In [None]:
# Count how many rows of `measures` share each distinct `properties` value,
# keeping only rows whose `properties` array is non-empty.
sql = """
SELECT properties, count(*)
FROM measures
WHERE array_length(properties) > 0
group by properties
"""
df = client.query(sql)
# Bare last expression: rendered as the cell's output.
df

In [None]:
# Consistency check: properties_length() evaluated on the raw `properties`
# column (nb) must equal properties_length() evaluated on the output of
# properties_to_dict() (nbtest) for every row. The 'target' property is also
# read back from the converted column via property_get().
sql = """
WITH md AS(
SELECT properties_to_dict(properties) as dproperties, properties_length(properties) as nb
FROM measures
)
SELECT properties_length(dproperties) as nbtest, nb, property_get(dproperties, 'target')
FROM md
"""
print(f"sql: {sql}")
df = client.query(sql)
# Element-wise comparison; a single mismatching row fails the cell.
assert df["nbtest"].eq(df["nb"]).all()
display(df)

In [None]:
# Fetch the properties_to_dict() output through query_arrow (the result is
# used as an Arrow table by the memory analysis below) and show its schema.
sql = "SELECT properties_to_dict(properties) as properties FROM measures"
print(f"sql: {sql}")
table = client.query_arrow(sql)
print(f"schema: {table.schema}")

In [None]:
def analyze_memory(table):
    """Print a memory report for an Arrow table, column by column.

    The first line shows the total size and row count. Each column then gets
    one line with its size and Arrow type. Dictionary-encoded columns get two
    extra lines: the size of the dictionary values and of the indices, and
    the number of dictionary entries.

    Args:
        table: Arrow table-like object exposing ``nbytes``, ``column_names``,
            ``columns`` and ``len()``.

    Returns:
        None. The report is written to stdout.
    """
    def fmt_bytes(b):
        # B / KB / MB ladder so tiny buffers are not shown as "0.0 KB".
        if b > 1024*1024:
            return f"{b/1024/1024:.2f} MB"
        elif b > 1024:
            return f"{b/1024:.1f} KB"
        else:
            return f"{b} B"

    def is_dictionary_encoded(arrow_type):
        # Dictionary types expose `index_type`. Checking `value_type` alone
        # also matches list-like types, whose arrays have no `.dictionary` /
        # `.indices`, which made the report crash on such columns.
        return hasattr(arrow_type, 'index_type')

    print(f"Total: {fmt_bytes(table.nbytes)} | Rows: {len(table):,}")

    for name, col in zip(table.column_names, table.columns):
        print(f"{name}: {fmt_bytes(col.nbytes)} - {col.type}")

        if not is_dictionary_encoded(col.type):
            continue

        # A chunked column exposes `chunks`; a plain array is handled as a
        # single chunk so both cases share the same arithmetic.
        chunks = col.chunks if hasattr(col, 'chunks') else [col]
        dict_size = sum(chunk.dictionary.nbytes for chunk in chunks)
        idx_size = sum(chunk.indices.nbytes for chunk in chunks)
        # Entries are summed per chunk, so a value present in several chunk
        # dictionaries is counted once per chunk (an upper bound on the
        # number of distinct values).
        unique_count = sum(len(chunk.dictionary) for chunk in chunks)

        print(f"  Dict: {fmt_bytes(dict_size)} | Indices: {fmt_bytes(idx_size)}")
        print(f"  Unique values: {unique_count}")

In [None]:
# Report the Arrow-side memory footprint of `table` (assigned from
# client.query_arrow in an earlier cell, which must have been run first).
analyze_memory(table)

In [None]:
def analyze_pandas_memory(df, description="DataFrame"):
    """Print a memory report for a pandas DataFrame.

    The report contains the total deep memory usage (index included), the
    shape, the average memory per row and a per-column breakdown. For object
    columns whose first value is a sized, non-string object, the average
    element length is printed as well.

    Args:
        df: DataFrame to inspect.
        description: Label printed in the report header.

    Returns:
        None. The report is written to stdout.
    """

    def fmt_bytes(b):
        if b > 1024*1024:
            return f"{b/1024/1024:.2f} MB"
        elif b > 1024:
            return f"{b/1024:.1f} KB"
        else:
            return f"{b} B"

    print(f"\n=== {description} ===")

    # deep=True inspects every object element, so compute it once and reuse
    # it for both the total and the per-column breakdown.
    memory_usage = df.memory_usage(index=True, deep=True)
    total_memory = memory_usage.sum()
    row_count = len(df)

    # Overall info
    print(f"Total memory: {fmt_bytes(total_memory)}")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    if row_count > 0:
        print(f"Memory per row: {total_memory / row_count:.1f} bytes")
    else:
        # Avoid dividing by zero on an empty frame.
        print("Memory per row: n/a (empty DataFrame)")

    # Column breakdown
    print("\nColumn breakdown:")

    # Entry 0 of memory_usage is the index; the columns follow in order.
    column_memory = memory_usage.iloc[1:]

    for position, col in enumerate(df.columns):
        # Positional access keeps this a Series even when labels repeat.
        series = df.iloc[:, position]
        col_memory = column_memory.iloc[position]
        col_dtype = series.dtype
        print(f"{col}: {fmt_bytes(col_memory)} ({col_dtype})")

        # Extra info for object columns (lists, dicts, etc.)
        if col_dtype == 'object' and row_count > 0:
            sample = series.iloc[0]
            if hasattr(sample, '__len__') and not isinstance(sample, str):
                avg_len = series.apply(lambda x: len(x) if hasattr(x, '__len__') else 0).mean()
                print(f"  Average length: {avg_len:.1f}")

In [None]:
# Same query as the Arrow cell above, this time fetched through client.query
# so the pandas-side memory footprint can be measured.
sql = "SELECT properties_to_dict(properties) as properties FROM measures"
print(f"sql: {sql}")
df = client.query(sql)


In [None]:
# Report the pandas-side memory footprint of `df` (assigned from client.query
# in the previous cell, which must have been run first).
analyze_pandas_memory(df)