# Run experiments to store data in Azure Blob Storage.

In [17]:
from io import BytesIO
from azure.storage.blob import BlobClient
import os

# Get a BlobClient with the specified blob name

In [18]:
# Connection details are read from the environment so that no secret is
# written into the notebook; a missing variable raises KeyError here.
blob_conn_str = os.environ['BLOB_CONNECTION_STR']
blob_container = os.environ['CONTAINER_NAME']

# Client bound to the single blob this notebook writes and reads back.
bc = BlobClient.from_connection_string(
    conn_str=blob_conn_str,
    container_name=blob_container,
    blob_name='my.parquet',
)
bc

<azure.storage.blob._blob_client.BlobClient at 0x7fb011698320>

# Create a pandas data frame for storage

In [19]:

import pyarrow as pa
import pandas as pd

# Column-oriented sample records: two columns, four rows.
data = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
}

# Explicit string labels replace the default integer index.
row_labels = ('i1', 'i2', 'i3', 'i4')
df = pd.DataFrame(data, index=row_labels)

# Create a pyarrow table from the pandas DataFrame

In [20]:
# Convert the pandas DataFrame `df` into a pyarrow Table.
output_table = pa.Table.from_pandas(df)
# Bare last expression: the notebook displays the table's schema, including
# the pandas metadata attached by the conversion.
output_table.schema

Name: string
Age: int64
__index_level_0__: string
metadata
--------
OrderedDict([(b'pandas',
              b'{"index_columns": ["__index_level_0__"], "column_indexes": ['
              b'{"name": null, "field_name": null, "pandas_type": "unicode",'
              b' "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}]'
              b', "columns": [{"name": "Name", "field_name": "Name", "pandas'
              b'_type": "unicode", "numpy_type": "object", "metadata": null}'
              b', {"name": "Age", "field_name": "Age", "pandas_type": "int64'
              b'", "numpy_type": "int64", "metadata": null}, {"name": null, '
              b'"field_name": "__index_level_0__", "pandas_type": "unicode",'
              b' "numpy_type": "object", "metadata": null}], "creator": {"li'
              b'brary": "pyarrow", "version": "0.15.1"}, "pandas_version": "'
              b'0.25.3"}')])

# Write the pyarrow table as parquet and store it in Azure Blob Storage

In [21]:
import pyarrow.parquet as pq

# Serialize the Arrow table to parquet in an in-memory buffer, then upload
# the resulting bytes. The context manager closes the buffer even when the
# write or the upload raises.
with BytesIO() as outputstream:
    pq.write_table(output_table, outputstream)
    # overwrite=True keeps this cell re-runnable: upload_blob defaults to
    # overwrite=False and raises ResourceExistsError once the blob exists.
    bc.upload_blob(outputstream.getvalue(), overwrite=True)


# Read the table back from the blob store

In [22]:
# Start the download before creating the buffer so that a failure here
# surfaces as the real storage error instead of being masked by cleanup
# code that refers to a buffer which was never created.
input_blob = bc.download_blob()

# Copy the blob into an in-memory buffer and parse it as parquet. The
# context manager closes the buffer even when the download or parse fails.
with BytesIO() as inputstream:
    input_blob.download_to_stream(inputstream)
    # Rewind after writing so the parse does not depend on the reader
    # repositioning the stream on its own.
    inputstream.seek(0)
    input_table = pq.read_table(source=inputstream)

# Assert that the source and target schemas are the same

In [23]:
# Matching schemas alone do not show that the rows survived the round trip,
# so the table contents are compared as well before printing the message.
assert output_table.schema == input_table.schema, 'schema changed during the round trip'
assert output_table.equals(input_table), 'table contents changed during the round trip'
print('Source and target parquet are identical!')

Source and target parquet are identical!


# Display the table content as a pandas DataFrame

In [24]:
# Convert the table read back from the blob into a pandas DataFrame.
pdf = input_table.to_pandas()
# Bare last expression: the notebook renders the frame as the cell output.
pdf

Unnamed: 0,Name,Age
i1,Tom,20
i2,nick,21
i3,krish,19
i4,jack,18
