In [27]:
import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow as pa
import json

# Column level metadata
We know that metadata can be attached to a Parquet file as a whole or to its individual columns. Note that the schema (which contains the metadata) is stored in the footer of the Parquet file.
In BasicMeta, we have seen how to add table-level custom metadata. In this section, we will see how to add column-level metadata.


In [11]:
# Relative path of the CSV file used as input for this notebook.
source_file_path = "../../data/source.csv"

In [12]:
# Parse the source CSV with pyarrow's CSV reader; no schema is applied yet.
raw_table = pv.read_csv(input_file=source_file_path)

##  Table schema
We can define a table schema that contains column-level custom metadata.

In [13]:

# One field per column: name, Arrow type, nullability, and a column-level
# "description" entry stored as custom field metadata.
column_fields = [
    pa.field("name", pa.string(), nullable=False,
             metadata={"description": "the name of user"}),
    pa.field("age", pa.int64(), nullable=True,
             metadata={"description": "the age of user at year 2000"}),
    pa.field("sex", pa.string(), nullable=False,
             metadata={"description": "the sex of user. Value must be F or M"}),
]

# Table-level custom metadata, attached to the schema itself.
schema_meta = {
    "data_provider": "toto",
    "version": "1",
    "collect_date": "2020-10-17T03:59:59+0000",
}

my_schema = pa.schema(column_fields, metadata=schema_meta)


In [14]:
# Cast the raw CSV table to the schema defined above so that it carries
# the declared types, nullability and custom metadata.
table = raw_table.cast(target_schema=my_schema)

In [15]:
# Destination of the parquet file that will hold the column-level metadata.
output_path = "../../data/colum_custom_meta.parquet"

In [16]:
# Persist the table, together with its schema metadata, as a parquet file.
pq.write_table(table, where=output_path)

## Read parquet file metadata

We will try two ways to get the metadata
1. Use ParquetFile method
2. Use read_table

### Use ParquetFile method

In [22]:
# Open the parquet file written above; `pf` is inspected in the next cell.
pf = pq.ParquetFile(source=output_path)

In [23]:
# The Arrow schema of the file shows both the field-level and the
# schema-level metadata.
arrow_schema = pf.schema_arrow
print(arrow_schema)

name: string not null
  -- field metadata --
  description: 'the name of user'
age: int64
  -- field metadata --
  description: 'the age of user at year 2000'
sex: string not null
  -- field metadata --
  description: 'the sex of user. Value must be F or M'
-- schema metadata --
data_provider: 'toto'
version: '1'
collect_date: '2020-10-17T03:59:59+0000'


### Use read_table

In [18]:
# Read the parquet file back as a table and keep only its schema.
parquet_schema = pq.read_table(source=output_path).schema

In [19]:
# The schema's `metadata` attribute only exposes the table-level entries;
# the column-level metadata is not part of it.
table_level_meta = parquet_schema.metadata
print(table_level_meta)

{b'data_provider': b'toto', b'version': b'1', b'collect_date': b'2020-10-17T03:59:59+0000'}


In [25]:
# Column-level metadata lives on each field, so the field has to be fetched
# by name first. The column names are taken from the schema itself rather than
# hard-coded, so this cell keeps working if the schema gains or loses columns.
col_list = parquet_schema.names
for col_name in col_list:
    print(parquet_schema.field(col_name).metadata)

{b'description': b'the name of user'}
{b'description': b'the age of user at year 2000'}
{b'description': b'the sex of user. Value must be F or M'}


# Json file as metadata value

Even though we can put any type in a metadata key/value pair, we recommend using strings. As a result, if you want to use the content of a JSON file as a metadata value, we recommend converting it into a JSON string first. The code below shows an example.

In [34]:
# Load the parquet file written earlier and show the table-level metadata
# it already contains.
table = pq.read_table(source=output_path)
existing_meta = table.schema.metadata
print(existing_meta)

{b'data_provider': b'toto', b'version': b'1', b'collect_date': b'2020-10-17T03:59:59+0000'}


In [36]:
# Build the structured metadata as a plain dict, then serialise it to a JSON
# string so that it can be stored as a single metadata value.
dict_meta_content = {
    "data_provider": "鹏飞",
    "version": "2",
    "collect_date": "2022-10-17T03:59:59+0000",  # ISO-8601
}
meta_val = json.dumps(dict_meta_content)

# Show the type of the serialised value, then the value itself.
print(type(meta_val), meta_val, sep="\n")

<class 'str'>
{"data_provider": "\u9e4f\u98de", "version": "2", "collect_date": "2022-10-17T03:59:59+0000"}


In [42]:
# Build the new metadata by adding the JSON string to the existing entries.
meta_key = "json_meta"
new_meta = {
    # Start from the existing metadata; `or {}` covers a file that has no
    # schema metadata at all, in which case `existing_meta` is None.
    **(existing_meta or {}),
    # The existing keys are bytes, so the new key is encoded as well to keep
    # the key type uniform. Listing it last makes the new value win if the key
    # is already present.
    meta_key.encode(): meta_val.encode(),
}

In [38]:
# Build a table that carries the merged metadata (existing entries + JSON entry).
json_table = table.replace_schema_metadata(metadata=new_meta)

## Write to parquet

In [39]:
# Destination of the parquet file that will hold the JSON metadata value.
json_parquet_output_path = "../../data/json_custom_meta.parquet"

In [40]:
# Write the table with the JSON metadata entry to its own parquet file.
pq.write_table(json_table, where=json_parquet_output_path)

## Read metadata

In [43]:
# Read the file back to check that the JSON metadata entry was stored.
json_meta_table = pq.read_table(source=json_parquet_output_path)

In [44]:
# Metadata keys come back as bytes, so the entry is looked up with the
# encoded form of the key.
stored_meta = json_meta_table.schema.metadata
json_meta_val = stored_meta[meta_key.encode()]

In [45]:
# Parse the stored JSON string back into a Python dict and display it.
decoded_meta = json.loads(json_meta_val)
print(decoded_meta)

{'data_provider': '鹏飞', 'version': '2', 'collect_date': '2022-10-17T03:59:59+0000'}
