# Explore Parquet Files

- 2017 - [Development update: High speed Apache Parquet in Python with Apache Arrow](https://wesmckinney.com/blog/python-parquet-update/)

In [1]:
# Select the inline matplotlib backend so figures render in the cell output.
# NOTE(review): no plotting call is visible in this part of the notebook —
# confirm this magic is still needed.
%matplotlib inline

In [2]:
import os
import json
import numpy as np
import pandas as pd
import fastparquet
import pyarrow as pa
import pyarrow.parquet as pq

from IPython.display import JSON

In [3]:
# Record the library versions this notebook was executed with, one
# "<name> <version>" line per package.
for module in (pd, np, pa, fastparquet):
    print(f"{module.__name__} {module.__version__}")

pandas 1.0.5
numpy 1.18.5
pyarrow 3.0.0
fastparquet 0.4.0


# PyArrow Parquet

- [Reading and Writing the Apache Parquet Format](https://arrow.apache.org/docs/python/parquet.html)

In [7]:
# Open a single part file of the dataset with PyArrow and display the
# metadata stored in its footer.
part_file_path = 'processed_data/gdelt_2020_500MB.snappy.parq/part.0.parquet'
parquet_file = pq.ParquetFile(part_file_path)

parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7fba08472350>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 13
  num_rows: 937936
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 7166

In [8]:
# Open the whole dataset directory (rather than one part file) and display
# the dataset-level metadata.
filepath = "processed_data/gdelt_2020_500MB.snappy.parq"
dataset = pq.ParquetDataset(filepath)

dataset.metadata

<pyarrow._parquet.FileMetaData object at 0x7fba08472650>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 13
  num_rows: 43452639
  num_row_groups: 46
  format_version: 1.0
  serialized_size: 84707

In [9]:
# Display the Parquet schema of the dataset opened above: one entry per
# column with its physical type and, where present, its logical type.
dataset.schema

<pyarrow._parquet.ParquetSchema object at 0x7fba084744b0>
required group field_id=0 schema {
  optional binary field_id=1 event_id (String);
  optional int32 field_id=2 date (Date);
  optional int32 field_id=3 event_date (Date);
  optional int64 field_id=4 event_code;
  optional int64 field_id=5 event_base_code;
  optional int64 field_id=6 event_root_code;
  optional double field_id=7 lat;
  optional double field_id=8 lon;
  optional int64 field_id=9 geo_type;
  optional binary field_id=10 country_code (String);
  optional binary field_id=11 adm1_code (String);
  optional binary field_id=12 source_url (String);
  optional binary field_id=13 netloc (String);
}

# Fastparquet

- [fastparquet documentation](https://fastparquet.readthedocs.io/en/latest/)

In [10]:
# Open a Parquet dataset directory with fastparquet.
# Note: `filepath` is rebound here to a different dataset directory than the
# one used in the PyArrow section.
filepath = "processed_data/gdelt_500MB.snappy.parq/"
pf = fastparquet.ParquetFile(filepath)

In [11]:
# Summary of the opened fastparquet file: metadata file name, column names,
# partition columns and total row count.
pf.info

{'name': 'processed_data/gdelt_500MB.snappy.parq/_metadata',
 'columns': ['event_id',
  'date',
  'event_date',
  'event_code',
  'event_base_code',
  'event_root_code',
  'lat',
  'lon',
  'geo_type',
  'country_code',
  'adm1_code',
  'source_url',
  'netloc'],
 'partitions': [],
 'rows': 640389681}

In [12]:
# Mapping of column name -> numpy dtype that fastparquet reports for each column.
pf.dtypes

OrderedDict([('event_id', dtype('O')),
             ('date', dtype('<M8[ns]')),
             ('event_date', dtype('<M8[ns]')),
             ('event_code', dtype('int64')),
             ('event_base_code', dtype('int64')),
             ('event_root_code', dtype('int64')),
             ('lat', dtype('float64')),
             ('lon', dtype('float64')),
             ('geo_type', dtype('int64')),
             ('country_code', dtype('O')),
             ('adm1_code', dtype('O')),
             ('source_url', dtype('O')),
             ('netloc', dtype('O'))])

In [13]:
# Total number of rows in the dataset (same figure as 'rows' in `pf.info`).
# NOTE(review): this relies on `count` being a plain attribute in the
# fastparquet version pinned above; newer releases may expose it as a method
# that must be called — confirm before upgrading.
pf.count

640389681

In [14]:
# The 'pandas' entry of the file's key/value metadata is a JSON string:
# decode it, then render it as a collapsed, browsable tree.
pandas_metadata = json.loads(pf.key_value_metadata['pandas'])
JSON(pandas_metadata, expanded=False)

<IPython.core.display.JSON object>