In [1]:
import awswrangler as wr
import pandas as pd
import numpy as np
import pyarrow as pa
from pyarrow import fs, parquet as pq
from arrow_pd_parser.parse import (
    pa_read_json_to_pandas,
    pa_read_csv_to_pandas,
    pa_read_json,
)
from mojap_metadata import Metadata
from mojap_metadata.converters.glue_converter import GlueConverter
from mojap_metadata.converters.arrow_converter import ArrowConverter
# from dataengineeringutils3 import s3
import os
import boto3
import csv
import datetime

# Setup

1. Create a test folder to work from (need access to `alpha-everyone`)
2. Move that single dataset to S3
3. Define the data's metadata using our Metadata class
4. Read that data back from S3 using `arrow_pd_parser`, ensuring it conforms to our metadata
5. Write the data to S3 into a database folder creating a csv table, jsonl table and parquet table (using awswrangler)
6. Create the Table DDLs using the glueConverter and awswrangler
7. Use aws wrangler to query each table

## 1. Setup

Add some key parameters. Probably the only thing you will need to change is your `foldername`. The second cell does some clean-up using `awswrangler`.

In [2]:
# User-tunable parameters for the whole notebook.
foldername = "isichei" # GH username; used as the S3 prefix and in the database name
region = "eu-west-1" # same region value that the pyarrow S3FileSystem calls below use
bucketname = "alpha-everyone" # bucket that all paths in this notebook live under

In [3]:
# Names and paths derived from the parameters above.
db_name = f"aws_example_{foldername}"
db_base_path = f"s3://{bucketname}/{foldername}/database"
s3_base_path = f"s3://{bucketname}/{foldername}/"
# Clean up: delete anything already stored under our folder so the notebook
# starts from an empty prefix.
if wr.s3.list_objects(s3_base_path):
    print("deleting objs")
    wr.s3.delete_objects(s3_base_path)

deleting objs


# 2. Upload a dataset to S3

Using `boto3`. There are many ways to do each of these steps. Hopefully this tutorial gives you a mix of the approaches that we trust the most, or that have simple functions that others are missing.

In [4]:
# Upload the local CSV to s3://{bucketname}/{foldername}/init-data.csv.
s3_client = boto3.client('s3')
with open("data/init-data.csv", "rb") as f:
    # S3 keys always use "/" as the separator, so build the key explicitly
    # rather than with os.path.join (which would use "\\" on Windows).
    s3_client.upload_fileobj(f, bucketname, f"{foldername}/init-data.csv")

In [5]:
# Quick look at the data we just uploaded to S3.
# Note: this reads the *local* copy, with every column loaded as a string.
# pd.read_json("data/init-data.jsonl", lines=True, dtype=str).head()
pd.read_csv("data/init-data.csv", dtype=str).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141592653589,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345


## 3. Define the metadata

Create the metadata for the data using `mojap-metadata`. 

In [6]:
# 3 Define the metadata
# Plain-dict description of the table: one entry per column with its name and type.
# This dict is reused later to build a second Metadata object for the openCSV table.
meta_dict = {
    "name": "test",
    "description": "Some test data of different types",
    # name and file_format are reassigned on the Metadata object before each table is created
    "file_format": "jsonl",
    "columns":[
        {
            "name": "character_col",
             "type": "string",
            "description": "This col has a tricky comma that messes with the serdes (parser). You'll see how we account for this later."
        },
        {"name": "int_col", "type": "int32"},
        {"name": "long_col", "type": "int64"},
        {"name": "date_col", "type": "date64"},
        {"name": "datetime_col", "type": "timestamp(s)"},
        {"name": "boolean_col", "type": "bool_"},
        {"name": "float_col", "type": "float32"},
        {"name": "double_col", "type": "float64"},
        {"name": "decimal_col", "type": "decimal128(5,3)"}
    ]
}

# Build the Metadata object that the Arrow and Glue converters consume.
meta = Metadata.from_dict(meta_dict)

## 4. Conform the data to pandas

Generate an arrow schema from our metadata (TODO) and then use that to ensure conformance with pandas (using `arrow_pd_parser`).

In [7]:
# pyarrow's S3FileSystem takes "bucket/key" paths (no "s3://" scheme).
# Build the path with "/" explicitly: os.path.join is for local paths and
# would produce backslashes on Windows.
init_data_s3_path = f"{bucketname}/{foldername}/init-data.csv"

# Use the Arrow Converter to get a pyarrow schema
ac = ArrowConverter()
arrow_schema = ac.generate_from_meta(meta)

# Read the data in from S3 this time, using the region configured at the top
# of the notebook instead of a second hard-coded copy of it.
s3 = fs.S3FileSystem(region=region)
with s3.open_input_stream(init_data_s3_path) as f:
    df = pa_read_csv_to_pandas(f, schema=arrow_schema)

print(df.dtypes)
df.head(5)

character_col     string
int_col            Int64
long_col           Int64
date_col          object
datetime_col      object
boolean_col      boolean
float_col        float32
double_col       float64
decimal_col       object
dtype: object


Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


## 5. Write the data to a database

Can do this multiple ways so going to do it multiple ways.

- (a) Going to write the data to a table using `awswrangler`
- (b) Will write the data directly to S3 and use mojap-metadata and boto3 to create the schema

In [8]:
# Both approaches need some shared setup.
# "bucket/prefix" form (no scheme); cells below prepend "s3://" where needed.
# Joined with "/" explicitly because this is an S3 path, not a local one.
database_path = f"{bucketname}/{foldername}/database"

# Create the Glue database if it is not already in the catalog.
databases = wr.catalog.databases()
if db_name not in databases.values:
    wr.catalog.create_database(db_name)
    print(f"Database '{db_name}' created")
else:
    print(f"Database '{db_name}' already exists")

# Drop any tables left over from a previous run so the create_table calls
# further down do not collide with existing tables.
for t in ["csv_wr", "csv_gc", "csv_header", "csv_open", "jsonl_hive", "jsonl_openx", "parquet_table"]:
    wr.catalog.delete_table_if_exists(database=db_name, table=t)

Database 'aws_example_isichei' already exists


In [9]:
# Now let's do (a): let awswrangler write the CSV dataset and register the table for us.
df_copy = df.copy() # wrangler casts the col datatypes, so work on a copy to leave df unchanged
table_path = f"s3://{database_path}/csv_wr"
wr.s3.to_csv(
    df=df_copy,
    path=table_path,
    dataset=True,
    index=False,
    database=db_name,
    table='csv_wr',
    mode="overwrite",
)
# Query the new table through Athena to check it reads back.
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.csv_wr", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


Now checkout the datatypes

In [10]:
# Column name -> type as recorded in the catalog for the wrangler-created table.
wr.catalog.get_table_types(database=db_name, table="csv_wr")

{'character_col': 'string',
 'int_col': 'bigint',
 'long_col': 'bigint',
 'date_col': 'date',
 'datetime_col': 'timestamp',
 'boolean_col': 'boolean',
 'float_col': 'float',
 'double_col': 'double',
 'decimal_col': 'decimal(5,3)'}

So the types here have been inferred by `awswrangler` because we've lost some information in the conversion from arrow -> pandas (e.g. no decimal type in pandas). `awswrangler` is correctly and safely converting your pandas types to glue types to ensure they can be read safely in glue. We can use our metadata and converter to get the column types we want. So let's do that as well...

> This didn't work but leaving in here. Might work one day.

In [11]:
# # Just want columns
# gc = GlueConverter()
# columns, partitions = gc.convert_columns(meta)
# columns[:3]

In [12]:
# # Get it into the data format needed for awswrangler
# dtypes = {}
# for c in columns:
#     dtypes[c["Name"]] = c["Type"].lower()

# wr.s3.to_csv(
#     df=df,
#     path=table_path,
#     dataset=True,
#     index=False,
#     database=db_name,
#     table='csv_wr',
#     mode="overwrite",
#     dtype=dtypes
# )
# wr.athena.read_sql_query(f"SELECT * FROM {db_name}.csv_wr", database=db_name, ctas_approach=True).head()

## 5(b)

So let's move on to using our converter to be exact about the datatypes (let's say we want to try and conform to their origin, as defined in the metadata). Let's write the data somewhere else to compare, specifying a glue schema specific to our metadata.

In [13]:
# Note how we have to write out our CSV to work with the lazy hive schema:
# no header, no quoting, and a backslash escape character.
# wrangler does this under the hood.
# Also worth noting that this is an awful way to write CSVs - do not use these
# parameters (quoting=csv.QUOTE_NONE) for any other CSV reader like R or Python!
table_path = f"s3://{database_path}/csv_gc"
wr.s3.to_csv(df, f"{table_path}/csv_gc.csv", index=False, header=False, escapechar="\\", quoting=csv.QUOTE_NONE)

{'paths': ['s3://alpha-everyone/isichei/database/csv_gc/csv_gc.csv'],
 'partitions_values': {}}

In [14]:
# Create a glue converter and convert our metadata to the GlueAPI specs
gc = GlueConverter()
# Point the shared Metadata object at the table we just wrote.
meta.file_format = "csv"
meta.name = "csv_gc"

# boto_dict is passed as keyword arguments to glue_client.create_table in the next cell.
boto_dict = gc.generate_from_meta(meta, database_name=db_name, table_location=table_path)
boto_dict

{'DatabaseName': 'aws_example_isichei',
 'TableInput': {'Name': 'csv_gc',
  'Description': 'Some test data of different types',
  'Owner': 'owner',
  'Retention': 0,
  'StorageDescriptor': {'Columns': [{'Name': 'character_col',
     'Type': 'string',
     'Comment': "This col has a tricky comma that messes with the serdes (parser). You'll see how we account for this later."},
    {'Name': 'int_col', 'Type': 'int'},
    {'Name': 'long_col', 'Type': 'bigint'},
    {'Name': 'date_col', 'Type': 'date'},
    {'Name': 'datetime_col', 'Type': 'timestamp'},
    {'Name': 'boolean_col', 'Type': 'boolean'},
    {'Name': 'float_col', 'Type': 'float'},
    {'Name': 'double_col', 'Type': 'double'},
    {'Name': 'decimal_col', 'Type': 'decimal(5,3)'}],
   'Location': 's3://alpha-everyone/isichei/database/csv_gc',
   'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
   'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
   'Compressed': False,
   'NumberOfBuckets': -

In [15]:
glue_client = boto3.client("glue")

# Delete the table first (if it exists) so create_table below starts clean.
try:
    _ = glue_client.delete_table(
        DatabaseName=db_name,
        Name="csv_gc"
    )
except glue_client.exceptions.EntityNotFoundException:
    # Nothing to delete; carry on and create the table.
    print("table already deleted")

# Register the table using the spec generated from our metadata.
glue_client.create_table(**boto_dict)

table already deleted


{'ResponseMetadata': {'RequestId': '9e7e004e-37ac-48ef-9cf8-647be54ae58c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Fri, 18 Dec 2020 17:00:04 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '9e7e004e-37ac-48ef-9cf8-647be54ae58c'},
  'RetryAttempts': 0}}

**You'll see we get the same resulting output but we get more exact column types.**

In [16]:
# Query the table created from our own Glue spec.
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.csv_gc", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


In [17]:
# Column types for csv_gc; compare with the csv_wr types shown earlier.
wr.catalog.get_table_types(database=db_name, table="csv_gc")

{'character_col': 'string',
 'int_col': 'int',
 'long_col': 'bigint',
 'date_col': 'date',
 'datetime_col': 'timestamp',
 'boolean_col': 'boolean',
 'float_col': 'float',
 'double_col': 'double',
 'decimal_col': 'decimal(5,3)'}

# 6. Let's make hella tables with the glueConverter

We'll create the following:
- CSV with header
- CSV openSerde - awful for dates (requires UNIX epochs)
- Jsonl (with Hive serde)
- Jsonl (with Openx serde)
- Parquet

### CSV with header

We are actually going to do two things. We are going to specify the separator as a pipe and write with a header.

In [18]:
meta.name = "csv_header"
meta.file_format = "csv"

# let's use the same glueConverter
gc.options.csv.sep = "|"
gc.options.csv.skip_header = True
gc.options.default_db_name = db_name # set db name so we don't have to set it every time
gc.options.default_db_base_path = f"s3://{database_path}/" # set db base path so we don't have to set it every time
full_table_path = f"s3://{database_path}/{meta.name}/{meta.name}.csv"
# The writer arguments are read from the converter options so the file on S3
# and the generated table spec are built from the same settings.
wr.s3.to_csv(
    df,
    full_table_path,
    index=False,
    header=gc.options.csv.skip_header,
    escapechar=gc.options.csv.escape_char,
    sep=gc.options.csv.sep,
    quoting=csv.QUOTE_NONE
) # note header=True and sep="|"; escapechar is the converter's default option ("\\")

# No database name or table location is passed here; the defaults set above are used.
spec = gc.generate_from_meta(meta)
glue_client.create_table(**spec)
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta.name}", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


### CSV with openSerde

We can use a different serde (aka parser) for CSVs. The pro of this parser is that commas in the character cols work when you write out CSVs like a normal person (quoted values). However, the con is that it requires you to write out dates and timestamps as unix timestamps. [More on this con](https://stackoverflow.com/questions/52564194/athena-unable-to-parse-date-using-opencsvserde). The TLDR is that dates should be integer days since 1 January 1970 and timestamps should be an integer number of milliseconds elapsed since midnight 1 January 1970.

> This will actually also fail if your data contains NULLs - so just avoid openCSV to be honest. More on this null value error: https://aws.amazon.com/premiumsupport/knowledge-center/athena-hive-bad-data-error-csv/
So in this example we treat every column except the date and datetime as a string (the decimal becomes a float64), and I fill missing dates and datetimes with 0 just to demonstrate the date types.

In [19]:
# Build a separate Metadata object for the openCSV table so the shared `meta`
# keeps its original column types.
meta_open_csv = Metadata.from_dict(meta_dict)
meta_open_csv.name = "csv_open"
meta_open_csv.file_format = "csv"

# Do unix conversion
df_unix = df.copy()

# Naive 1970-01-01 00:00:00. Same value datetime.datetime.utcfromtimestamp(0)
# returns, but that function is deprecated since Python 3.12.
epoch = datetime.datetime(1970, 1, 1)

# Timestamps -> integer milliseconds since the epoch; dates -> integer days since the epoch.
# Missing values are written as 0.
df_unix.datetime_col = df_unix.datetime_col.apply(lambda x: 0 if pd.isna(x) else int((x - epoch).total_seconds() * 1000.0))
df_unix.date_col = df_unix.date_col.apply(lambda x: 0 if pd.isna(x) else (x -  datetime.date(1970, 1, 1)).days)

# ALSO can't do decimal: declare it as float64. Every other column whose name
# does not start with "date" (i.e. everything but date_col / datetime_col) becomes a string.
for c in meta_open_csv.columns:
    if c["type"].startswith("decimal"):
        c["type"] = "float64"
    elif not c["name"].startswith("date"):
        c["type"] = "string"

# let's use the same glueConverter
gc.options.set_csv_serde("open") # openCSVSerde
gc.options.csv.sep = ","
gc.options.csv.skip_header = False

full_table_path = f"s3://{database_path}/{meta_open_csv.name}/{meta_open_csv.name}.csv"
wr.s3.to_csv(
    df_unix,
    full_table_path,
    index=False,
    header=gc.options.csv.skip_header,
    sep=gc.options.csv.sep
) # note we are using quotes again and a comma sep!

spec = gc.generate_from_meta(meta_open_csv)
glue_client.create_table(**spec)
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta_open_csv.name}", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141592653589,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123456789,12.345


☝ What a palaver - don't do this unless you definitely want this serde because you hate yourself.

### Jsonl with Hive Serde

JSONL files whoop whoop. Hive serde is the default for the glue-converter. Pandas writer is still not quite there for jsonl so you should cast your dates and datetimes yourself into the appropriate ISO format. Also worth noting that currently awswrangler doesn't have an option to add a pandas table to glue for jsonl types (it only supports CSV and parquet). Most likely this will change in the near future.

In [20]:
# Cast dates and timestamps to str (missing values stay as pd.NA)
df_json = df.copy()
df_json.date_col = df_json.date_col.apply(lambda x: pd.NA if pd.isna(x) else str(x)).astype("string")
df_json.datetime_col = df_json.datetime_col.apply(lambda x: pd.NA if pd.isna(x) else str(x)).astype("string")

meta.name = "jsonl_hive"
meta.file_format = "jsonl" # can also write json

# let's use the same glueConverter, switching its JSON serde to hive
gc.options.set_json_serde("hive")
full_table_path = f"s3://{database_path}/{meta.name}/{meta.name}.jsonl"
wr.s3.to_json(
    df_json,
    full_table_path,
    orient="records",
    lines=True
) # one JSON record per line

spec = gc.generate_from_meta(meta)
glue_client.create_table(**spec)
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta.name}", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


In [21]:
# meta.name is still "jsonl_hive" here, so this shows that table's column types.
wr.catalog.get_table_types(database=db_name, table=meta.name)

{'character_col': 'string',
 'int_col': 'int',
 'long_col': 'bigint',
 'date_col': 'date',
 'datetime_col': 'timestamp',
 'boolean_col': 'boolean',
 'float_col': 'float',
 'double_col': 'double',
 'decimal_col': 'decimal(5,3)'}

### Jsonl with Openx Serde

Now the openx serde. Differences between this and the Hive one can be [found here](https://docs.aws.amazon.com/athena/latest/ug/json-serde.html).

In [22]:
meta.name = "jsonl_openx"
meta.file_format = "jsonl" # can also write json

# let's use the same glueConverter, switching its JSON serde to openx
gc.options.set_json_serde("openx")
full_table_path = f"s3://{database_path}/{meta.name}/{meta.name}.jsonl"
# Reuses df_json (dates and timestamps already cast to strings) from the hive example.
wr.s3.to_json(
    df_json,
    full_table_path,
    orient="records",
    lines=True
) # one JSON record per line

spec = gc.generate_from_meta(meta)
glue_client.create_table(**spec)
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta.name}", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


In [23]:
# meta.name is still "jsonl_openx" here, so this shows that table's column types.
wr.catalog.get_table_types(database=db_name, table=meta.name)

{'character_col': 'string',
 'int_col': 'int',
 'long_col': 'bigint',
 'date_col': 'date',
 'datetime_col': 'timestamp',
 'boolean_col': 'boolean',
 'float_col': 'float',
 'double_col': 'double',
 'decimal_col': 'decimal(5,3)'}

### Parquet

Finally! This is the easiest dataset to work with because it has metadata built into the data (that does mean you cannot eye ball it with a text editor but who cares). Strongly recommend using this type because it works well with arrow and glue.

In [24]:
# Define our meta changes again
meta.name = "parquet_table"
meta.file_format = "parquet"

# Full s3:// path of the single parquet file that backs the table.
full_table_path = f"s3://{database_path}/{meta.name}/{meta.name}.snappy.parquet"

So now instead of writing to S3 using wrangler we are going to use arrow instead. This allows us to be super specific about our metadata conformance which is why we are all here.

In [25]:
# Convert the dataframe to an arrow table and then cast to our specific metadata then write to S3
table = pa.Table.from_pandas(df)
table = table.cast(arrow_schema)

# Write the data to S3 and then generate the glue table as normal.
# Use the region configured at the top of the notebook rather than another
# hard-coded copy of it.
s3 = fs.S3FileSystem(region=region)
# pyarrow's S3FileSystem takes "bucket/key" paths, so strip the "s3://" scheme.
with s3.open_output_stream(full_table_path.replace("s3://","")) as f:
    pq.write_table(table, f)

spec = gc.generate_from_meta(meta)
glue_client.create_table(**spec)
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta.name}", database=db_name, ctas_approach=False).head()

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345


In [26]:
# meta.name is "parquet_table" here, so this shows that table's column types.
wr.catalog.get_table_types(database=db_name, table=meta.name)

{'character_col': 'string',
 'int_col': 'int',
 'long_col': 'bigint',
 'date_col': 'date',
 'datetime_col': 'timestamp',
 'boolean_col': 'boolean',
 'float_col': 'float',
 'double_col': 'double',
 'decimal_col': 'decimal(5,3)'}

# 7. Side note: using `arrow_pd_parser` to enforce conformance on our databases

To see why enforcing data conformance is useful, imagine we have a table in our database (let's use `parquet_table`). We produce the table from a CSV extract given to us by some external data supplier. Our friendly data sender then changes the export and we now get the data as a `jsonl` file. Using `arrow_pd_parser` we can ensure conformance between the A -> B data formats, so updates to our database stay consistent 👊

In [27]:
# So our new data is jsonl
# # We could just read it directly into pandas but we don't actually need to; we can
# # just go jsonl -> arrow -> parquet, still ensuring metadata conformance
arrow_table = pa_read_json("data/init-data.jsonl", schema=arrow_schema)
# Second file in the same table folder as the existing parquet file ("bucket/key" form, no "s3://").
new_data_s3_path = f"{database_path}/{meta.name}/{meta.name}_new.snappy.parquet"

# Reuses the `s3` filesystem object created in an earlier cell.
with s3.open_output_stream(new_data_s3_path) as f:
    pq.write_table(arrow_table, f)

The metadata schema for the database is already defined and as we haven't changed the datatypes we don't need to do anything. The new data can now be queried via Athena.

In [28]:
# Query the table again; head(20) so the rows from the newly added file are shown too.
wr.athena.read_sql_query(f"SELECT * FROM {db_name}.{meta.name}", database=db_name, ctas_approach=False).head(20)

Unnamed: 0,character_col,int_col,long_col,date_col,datetime_col,boolean_col,float_col,double_col,decimal_col
0,malcovitch,1.0,2147483648.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
1,"malcovitch, malcovitch",2147483647.0,10000000000.0,2018-01-01,2018-01-01 23:59:59,False,3.141592,3.141593,12.345
2,,1.0,1.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
3,malcovitch,,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
4,malcovitch,1.0,,1900-01-01,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
5,malcovitch,1.0,10000000000.0,,1900-01-01 00:00:00,True,0.123456,0.123457,12.345
6,malcovitch,1.0,10000000000.0,1900-01-01,NaT,True,0.123456,0.123457,12.345
7,malcovitch,1.0,10000000000.0,1900-01-01,1900-01-01 00:00:00,False,0.123456,0.123457,12.345
8,malcovitch,1.0,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,,0.123457,12.345
9,malcovitch,1.0,10000000000.0,1900-01-01,1900-01-01 00:00:00,True,0.123456,,12.345
