In [None]:
import os
import boto3
import botocore
import pyarrow.parquet as pq
import pandas as pd

## Download a Parquet file from S3
See https://kafka-connect-manager.lsst.io/ for more information on the S3 Sink connector.

In [None]:
# Name of the S3 bucket that is opened with `s3.Bucket(...)` in a later cell.
BUCKET_NAME = "efd-sandbox.data"

S3 credentials are added to the `~/.aws/credentials` file and the S3 region to the `~/.aws/config` file, as explained in the boto3 credentials guide: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

In [None]:
# Resource-level S3 handle. No credentials are passed explicitly here, so
# boto3 has to resolve them on its own (e.g. from the `~/.aws` files).
s3 = boto3.resource(service_name="s3")

# Handle for the bucket that the listing and download cells operate on.
bucket = s3.Bucket(name=BUCKET_NAME)

In this example the S3 Sink connector is configured to partition data by time on an hourly basis. The variables below are used to construct the key prefix under which the files for a given hour are stored on S3.

In [None]:
# Components of the hourly partition path. The date parts are kept as
# zero-padded strings because they are interpolated verbatim into the
# S3 key prefix.
topic = "example-002-aggregated"
year, month, day, hour = "2020", "08", "07", "22"

In [None]:
# Key prefix of the hourly partition selected by topic/year/month/day/hour.
partition_prefix = f"topics/{topic}/year={year}/month={month}/day={day}/hour={hour}"

# Print every object stored under that prefix as "<bucket>:<key>".
for obj in bucket.objects.filter(Prefix=partition_prefix):
    print(f"{bucket.name}:{obj.key}")

The S3 Sink connector is configured to invoke file commits to S3 every 10 minutes (see the `rotate_interval_ms` configuration setting).

The object key in S3 is the complete file path. Here we download one of the files.

In [None]:
# Full object key of one of the committed files.
# NOTE(review): this key points at `hour=21`, whereas the listing cell uses
# hour = "22" -- confirm which partition is intended.
KEY = "topics/example-002-aggregated/year=2020/month=08/day=07/hour=21/example-002-aggregated+0+0000097683.snappy.parquet"

# Local file name: the last path component of the key.
FILE = os.path.basename(KEY)

# Fetch the object; FILE is a bare name, so it is written relative to the
# current working directory.
bucket.download_file(Key=KEY, Filename=FILE)

## Use PyArrow to read the Parquet file

In [None]:
# Load the downloaded Parquet file; the result is converted with
# `.to_pandas()` in a later cell.
example_002_aggregated = pq.read_table(source=FILE)

## Convert the Arrow table to a pandas DataFrame

In [None]:
# Convert the table read from the Parquet file into a pandas DataFrame.
df = example_002_aggregated.to_pandas()

# Preview the first five rows; ending the cell on an expression displays it.
df.head(n=5)

## Plotting the aggregated stream

In [None]:
# Draw the mean as a white line; it becomes visible on top of the filled
# min/max band added below.
p = df.plot(x='time', y='mean_value1', c='white', figsize=(15,5))

# Shade the region between the `min_value1` and `max_value1` columns. With
# `data=df` the string arguments are looked up as column names of `df`.
p.fill_between(x='time', y1='min_value1', y2='max_value1', data=df)

# Title and y-axis label so the figure can be read on its own; the trailing
# semicolon keeps the notebook from echoing the last call's return value.
p.set_title("Aggregated stream: mean with min/max band")
p.set_ylabel("value1");