In [251]:
import os
import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from decimal import Decimal
from s3fs import S3FileSystem
from datetime import datetime
from dotenv import load_dotenv

In [229]:
load_dotenv()

True


# 1. Create Bucket

In [230]:
# Build a boto3 session from the named AWS profile "mentha" and derive a
# low-level S3 client from that session.
session = boto3.Session(profile_name="mentha")
s3_client = session.client("s3")

In [231]:
# s3_client.list_buckets()["Buckets"]

# s3_client.create_bucket(Bucket="mentha-athena-datastore",
#                         ACL="public-read",
#                         CreateBucketConfiguration={
#                             "LocationConstraint": "ap-northeast-2"
#                         }
# )

s3_client.list_buckets()["Buckets"]

[{'Name': 'aws-glue-assets-305045122135-ap-northeast-2',
  'CreationDate': datetime.datetime(2022, 5, 19, 14, 23, 19, tzinfo=tzutc())},
 {'Name': 'mentha-athena-datastore',
  'CreationDate': datetime.datetime(2022, 10, 2, 5, 52, 13, tzinfo=tzutc())},
 {'Name': 'mentha-athena-query-result',
  'CreationDate': datetime.datetime(2022, 5, 22, 4, 57, 56, tzinfo=tzutc())},
 {'Name': 'mentha-datastore',
  'CreationDate': datetime.datetime(2022, 10, 2, 6, 33, tzinfo=tzutc())},
 {'Name': 'mentha-sample-datastore',
  'CreationDate': datetime.datetime(2022, 5, 22, 5, 40, 57, tzinfo=tzutc())}]

# 2. Convert csv(local) to parquet(s3)

## 2.1. Read csv

In [232]:
def decimal_from_value(value):
    """Convert one raw CSV field into a ``Decimal``, or ``pd.NA`` when it is missing.

    Written to be used as a ``pd.read_csv`` converter, which passes each field
    of the column to this function.

    Args:
        value: Raw field value. ``None``, an empty or whitespace-only string,
            and the literal ``"null"`` are all treated as missing.

    Returns:
        ``pd.NA`` for a missing value, otherwise ``Decimal(value)``.

    Raises:
        decimal.InvalidOperation: If a non-missing string is not a valid
            decimal literal.
    """
    if value is None:
        return pd.NA
    if isinstance(value, str):
        value = value.strip()
        # An empty field would otherwise reach Decimal("") and raise
        # InvalidOperation, aborting the whole CSV load.
        if value in ("", "null"):
            return pd.NA
    return Decimal(value)

# pd.read_csv settings for the event CSV:
#   column_list  - column names, assigned positionally (passed as `names=`)
#   dtype_dict   - pandas dtype per column (passed as `dtype=`); server_datetime
#                  is read as text here and parsed with pd.to_datetime afterwards
#   convert_dict - per-column converter functions (passed as `converters=`);
#                  price goes through decimal_from_value instead of a dtype
event_metadata = {
    "column_list": ["identity_adid", "os", "model", "country", "event_name", "log_id", "server_datetime", "quantity", "price"],
    "dtype_dict": {
        "identity_adid": "str",
        "os": "str",
        "model": "str",
        "country": "str",
        "event_name": "str",
        "log_id": "str",
        "quantity": "Int64",
        "server_datetime": "str"
    },
    "convert_dict": {
        "price": decimal_from_value
    },
}

# pd.read_csv settings for the attribution CSV:
#   column_list - column names, assigned positionally (passed as `names=`)
#   dtype_dict  - pandas dtype per column (passed as `dtype=`); server_datetime
#                 is read as text here and parsed with pd.to_datetime afterwards
# There is no "convert_dict" entry: this table needs no converters.
attribution_metadata = {
    "column_list": ["partner", "campaign", "server_datetime", "tracker_id", "log_id", "attribution_type", "identity_adid"],
    "dtype_dict": {
        "partner": "str",
        "campaign": "str",
        "tracker_id": "str",
        "log_id": "str",
        "attribution_type": "Int64",
        "identity_adid": "str",
        "server_datetime": "str"
    }
}

In [233]:
# Load the raw event CSV. Column names, dtypes and the price converter all
# come from event_metadata, so the schema lives in one place.
df_event = pd.read_csv(
    "data/event.csv",
    converters=event_metadata["convert_dict"],
    dtype=event_metadata["dtype_dict"],
    names=event_metadata["column_list"],
)

In [234]:
# Load the raw attribution CSV using the names and dtypes declared in
# attribution_metadata.
df_attribution = pd.read_csv(
    "data/attribution.csv",
    dtype=attribution_metadata["dtype_dict"],
    names=attribution_metadata["column_list"],
)

### server_datetime 에러 케이스 확인
- event 테이블에서 "0001-01-01 00:00:00.0"인 경우 Null로 대체하자

In [235]:
# Parse the text timestamps in both frames. errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
for _frame in (df_event, df_attribution):
    _frame["server_datetime"] = pd.to_datetime(_frame["server_datetime"], errors="coerce")

In [237]:
# Derive the calendar date from the parsed timestamp; it is used as the
# partition column when the frames are written out as Parquet datasets.
for _frame in (df_event, df_attribution):
    _frame["date"] = _frame["server_datetime"].dt.date

In [238]:
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966166 entries, 0 to 17966165
Data columns (total 10 columns):
 #   Column           Dtype         
---  ------           -----         
 0   identity_adid    object        
 1   os               object        
 2   model            object        
 3   country          object        
 4   event_name       object        
 5   log_id           object        
 6   server_datetime  datetime64[ns]
 7   quantity         Int64         
 8   price            object        
 9   date             object        
dtypes: Int64(1), datetime64[ns](1), object(8)
memory usage: 1.4+ GB


In [239]:
df_event.head(5)

Unnamed: 0,identity_adid,os,model,country,event_name,log_id,server_datetime,quantity,price,date
0,984549936,8.9,8.9,jp,abx:login,c21efdb8-b6e5-4ccc-a474-aff72a62c248,2018-05-18 12:23:15.303,,,2018-05-18
1,885033552,8.9,8.9,gb,abx:login,b4470f3b-4bb9-43ef-9248-25b503fa5660,2018-05-18 12:32:46.395,,,2018-05-18
2,768602461,7.1,7.1,ge,abx:firstopen,372dfecc-a27f-4a16-8e31-eccf34b8855f,2018-05-18 12:34:55.196,,,2018-05-18
3,1666798466,3.4,3.4,gb,abx:end_session,08730bdc-2895-4061-8399-f45df94d3fd0,2018-05-18 12:30:23.945,,,2018-05-18
4,683694696,7.1,7.1,kr,abx:start_session,a9556df7-f6ee-4600-af5b-89a44f18673c,2018-05-18 12:31:14.824,,,2018-05-18


In [240]:
df_attribution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3760185 entries, 0 to 3760184
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   partner           object        
 1   campaign          object        
 2   server_datetime   datetime64[ns]
 3   tracker_id        object        
 4   log_id            object        
 5   attribution_type  Int64         
 6   identity_adid     object        
 7   date              object        
dtypes: Int64(1), datetime64[ns](1), object(6)
memory usage: 233.1+ MB


In [241]:
df_attribution.head(5)

Unnamed: 0,partner,campaign,server_datetime,tracker_id,log_id,attribution_type,identity_adid,date
0,,,2018-05-03 07:19:24.813,,bdb8fc95-4f66-4d1d-8186-d10e86fe6433,0,764796223,2018-05-03
1,,,2018-05-03 10:25:11.034,,67c41325-a700-4f98-ad72-108025e9af8d,0,2126194985,2018-05-03
2,,,2018-05-03 10:26:08.081,,0e41af66-3f17-4bde-91db-806296209ad1,0,738518810,2018-05-03
3,,,2018-05-03 22:38:15.378,,a5f7ed1f-5d4e-4adf-96ce-e94f6820c2c2,0,595719449,2018-05-03
4,,,2018-05-03 04:14:55.453,,1e1aae33-282d-4dc2-9267-22fbd4ee2798,0,302402748,2018-05-03


## 2.2. Convert csv to parquet

In [247]:
# Convert both DataFrames into Arrow tables, in the same order as before
# (event first, then attribution).
event_table, attribution_table = (
    pa.Table.from_pandas(frame) for frame in (df_event, df_attribution)
)
# event_output_path = "s3://mentha-athena-datastore/sample/event"
# attribution_output_path = "s3://mentha-athena-datastore/sample/attribution"

In [243]:
# Write the event table as a local Parquet dataset with one `date=<value>`
# directory per partition value.
# NOTE(review): re-running this cell writes into the same directory again;
# whether existing files are replaced or added to depends on the installed
# pyarrow version - confirm before re-running.
pq.write_to_dataset(
    event_table,
    partition_cols=["date"],
    root_path="data/event/",
)

In [244]:
# Write the attribution table as a local Parquet dataset with one
# `date=<value>` directory per partition value.
# NOTE(review): re-running this cell writes into the same directory again;
# whether existing files are replaced or added to depends on the installed
# pyarrow version - confirm before re-running.
pq.write_to_dataset(
    attribution_table,
    partition_cols=["date"],
    root_path="data/attribution/",
)

In [84]:
# pq.write_table(event_table, "data/event.parquet")
# pq.write_table(attribution_table, "data/attribution.parquet")
# # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_metadata.html

In [67]:
# pq.read_metadata("data/event.parquet")
# read_event_schema = pq.read_schema("data/event.parquet")
# read_event_schema

## 2.3. Upload parquet files to S3
- ref: https://stackoverflow.com/questions/25380774/upload-a-directory-to-s3-with-boto

In [259]:
def upload_folder_to_s3(s3bucket, inputDir, s3Path):
    """Recursively upload every file under a local directory to an S3 bucket.

    The directory layout below ``inputDir`` is mirrored under the ``s3Path``
    key prefix, so ``<inputDir>/date=2018-05-01/part.parquet`` is stored as
    ``<s3Path>/date=2018-05-01/part.parquet``.

    Args:
        s3bucket: Object exposing ``upload_file(local_path, key)``, e.g. a
            boto3 ``Bucket`` resource.
        inputDir: Local directory to walk.
        s3Path: Key prefix inside the bucket.

    Raises:
        Exception: Any error raised while walking or uploading is re-raised
            unchanged after a failure message is printed. Files uploaded
            before the failure are not rolled back.
    """
    import posixpath
    import subprocess

    print("Uploading results to s3 initiated...")
    print("Local Source:", inputDir)
    # Diagnostic listing only. An argument list (no shell) keeps a path that
    # contains spaces or shell metacharacters from being run as a command.
    try:
        subprocess.run(["ls", "-ltR", "--", inputDir], check=False)
    except OSError:
        # `ls` is not available on this platform; the listing is optional.
        pass

    print("Dest  S3path:", s3Path)

    try:
        for path, _subdirs, files in os.walk(inputDir):
            # relpath removes only the leading inputDir component;
            # str.replace would also strip any later occurrence of the same
            # text and silently flatten nested directories.
            rel_dir = os.path.relpath(path, inputDir)
            key_parts = [] if rel_dir == os.curdir else rel_dir.split(os.sep)
            for file in files:
                # S3 keys always use "/", regardless of the local OS separator.
                s3_key = posixpath.normpath(posixpath.join(s3Path, *key_parts, file))
                local_file = os.path.join(path, file)
                print("upload : ", local_file, " to Target: ", s3_key, end="")
                s3bucket.upload_file(local_file, s3_key)
                print(" ...Success")
    except Exception as e:
        print(" ... Failed!! Quitting Upload!!")
        print(e)
        # Bare raise keeps the original traceback intact.
        raise

In [256]:
# Resource-level handle on the target bucket, passed to upload_folder_to_s3.
# Note: boto3.resource(...) is created at module level here, not from the
# `session` object that was built with the "mentha" profile earlier.
s3 = boto3.resource("s3")
s3bucket = s3.Bucket("mentha-athena-datastore")

In [260]:
upload_folder_to_s3(s3bucket, "data/attribution", "sample/attribution")

Uploading results to s3 initiated...
Local Source: data/attribution
total 0
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-29
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=__HIVE_DEFAULT_PARTITION__
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-04
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-05
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-06-09
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-06-08
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-25
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-17
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-12
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-13
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-22
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-19
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-23
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018-05-24
drwxr-xr-x  3 jmjeon  staff  96 Oct  2 17:24 date=2018

In [261]:
upload_folder_to_s3(s3bucket, "data/event", "sample/event")

Uploading results to s3 initiated...
Local Source: data/event
total 0
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-06-09
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-01
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-06-02
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-06-03
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-13
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-06-05
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-25
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=__HIVE_DEFAULT_PARTITION__
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-09
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-24
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-22
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-23
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-05-31
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 date=2018-06-01
drwxr-xr-x  4 jmjeon  staff  128 Oct  2 17:24 

In [252]:
# s3_file.put("data/event/", "mentha-athena-datastore/sample", recursive=True)
# s3_file.put("data/attribution/", "mentha-athena-datastore/sample", recursive=True)

In [85]:
# s3_client.upload_file(Filename="data/event.parquet",
#                       Bucket="mentha-athena-datastore",
#                       Key="sample/event/event.parquet"
# )

In [72]:
# s3_client.upload_file(Filename="data/attribution.parquet",
#                       Bucket="mentha-athena-datastore",
#                       Key="sample/attribution/attribution.parquet"
# )

In [64]:
# Alternative (disabled): write the datasets straight to S3 through s3fs
# instead of uploading the local Parquet files.
# The event call is commented out like the rest of this cell because it cannot
# run on a fresh kernel: `event_output_path` is only defined in a commented-out
# line, and `s3` is bound to a boto3 resource rather than a filesystem object.
# s3 = S3FileSystem()
# pq.write_to_dataset(event_table, root_path=event_output_path, filesystem=s3)
# pq.write_to_dataset(attribution_table, root_path=attribution_output_path, filesystem=s3)

# 3. Athena

## 3.1. Create Athena tables

In [264]:
from pyathena import connect

In [265]:
# Open an Athena connection (query results are staged in the given S3
# location) and take a cursor bound to the data_store schema.
athena_connection = connect(
    s3_staging_dir="s3://mentha-athena-query-result/sample",
    region_name="ap-northeast-2",
    schema_name="data_store",
)
cursor = athena_connection.cursor()

In [266]:
# DDL for the external Athena table over the Parquet files stored under
# s3://mentha-athena-datastore/sample/event/. `date` is declared as a
# partition column (PARTITIONED BY), not as a regular column.
create_event_table = """
CREATE EXTERNAL TABLE IF NOT EXISTS `data_store`.`event` (
  `identity_adid` string,
  `os` string,
  `model` string,
  `country` string,
  `event_name` string,
  `log_id` string,
  `server_datetime` timestamp,
  `quantity` int,
  `price` decimal(10, 1)
)
PARTITIONED BY (date date)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES ('serialization.format' = '1')
LOCATION 's3://mentha-athena-datastore/sample/event/'
TBLPROPERTIES ('has_encrypted_data' = 'false');
"""

# MSCK REPAIR TABLE statement for the event table; executed after the table
# is created so the `date=...` partition directories get registered.
create_event_partition = """
MSCK REPAIR TABLE `data_store`.`event`;
"""

In [267]:
# DDL for the external Athena table over the Parquet files stored under
# s3://mentha-athena-datastore/sample/attribution/. `date` is declared as a
# partition column (PARTITIONED BY), not as a regular column.
create_attribution_table = """
CREATE EXTERNAL TABLE IF NOT EXISTS `data_store`.`attribution` (
  `partner` string,
  `campaign` string,
  `server_datetime` timestamp,
  `tracker_id` string,
  `log_id` string,
  `attribution_type` int,
  `identity_adid` string
)
PARTITIONED BY (date date)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES ('serialization.format' = '1')
LOCATION 's3://mentha-athena-datastore/sample/attribution/'
TBLPROPERTIES ('has_encrypted_data' = 'false');
"""

# MSCK REPAIR TABLE statement for the attribution table; executed after the
# table is created so the `date=...` partition directories get registered.
create_attribution_partition = """
MSCK REPAIR TABLE `data_store`.`attribution`;
"""

In [268]:
cursor.execute(create_event_table)
cursor.execute(create_attribution_table)

<pyathena.cursor.Cursor at 0x7f9051199a90>

In [269]:
cursor.execute(create_event_partition)
cursor.execute(create_attribution_partition)

<pyathena.cursor.Cursor at 0x7f9051199a90>

In [270]:
cursor.execute("show tables")
cursor.fetchall()

[('attribution',), ('event',), ('timestamp_error',), ('whether',)]

In [271]:
cursor.execute("select * from event limit 10")

<pyathena.cursor.Cursor at 0x7f9051199a90>

In [272]:
cursor.fetchall()

[('818187194',
  '7.1',
  '7.1',
  'ge',
  'abx:start_session',
  'b1f3eed7-e038-4973-bc52-90663ac8c48a',
  None,
  None,
  None,
  None),
 ('659691242',
  '7.1',
  '7.1',
  'kr',
  'abx:login',
  'ba8f5ca8-c972-4b5b-98bf-84d0c15b8bd0',
  None,
  None,
  None,
  None),
 ('174649877',
  '6.0',
  '6.0',
  'gb',
  'abx:login',
  'd05369ef-07b7-4347-bd79-075e39b2ae18',
  None,
  None,
  None,
  None),
 ('1010819176',
  '8.9',
  '8.9',
  'gb',
  'abx:start_session',
  '7df56a60-8a6b-4ef6-b949-76189d908ec5',
  None,
  None,
  None,
  None),
 ('1035721498',
  '8.9',
  '8.9',
  'us',
  'abx:start_session',
  '5154bf89-bfe9-431a-a83f-2d35bdda942d',
  None,
  None,
  None,
  None),
 ('1199745791',
  '8.9',
  '8.9',
  'kr',
  'abx:firstopen',
  'fcf35f94-6bb2-41ce-8e7f-9ec4085bebf5',
  None,
  None,
  None,
  None),
 ('1771879785',
  '3.4',
  '3.4',
  'jp',
  'custom:battle',
  'e73dce88-1f8f-4f58-932d-cf2ad78bc71a',
  None,
  None,
  None,
  None),
 ('932314574',
  '8.9',
  '8.9',
  'jp',
  'abx

In [88]:
# session = boto3.Session(profile_name="mentha")
# athena_client = session.client("athena")
# athena_client.list_data_catalogs()
# athena_client.list_databases(CatalogName="AwsDataCatalog")
# athena_client.list_work_groups()
# exec = athena_client.start_query_execution(QueryString="select * from event limit 10",
#                                            QueryExecutionContext={
#                                                "Database": "data_store"
#                                            },
#                                            ResultConfiguration={
#                                                "OutputLocation": "s3://mentha-athena-query-result"
#                                            }
# )
# athena_client.get_query_execution(
#     QueryExecutionId=exec["QueryExecutionId"]
# )
# result = athena_client.get_query_results(
#     QueryExecutionId=exec["QueryExecutionId"]
# )
# result["ResultSet"]
# athena_client.get_query_execution(
#     QueryExecutionId=exec["QueryExecutionId"]
# )
# athena_client.stop_query_execution(
#     QueryExecutionId=exec["QueryExecutionId"]
# )

# References

## 1. DataFrame decimal
- https://stackoverflow.com/questions/38114654/pandas-read-csv-column-dtype-is-set-to-decimal-but-converts-to-string

In [121]:
import pandas as pd
import io
import decimal as D

# Tiny inline CSV used to demonstrate per-column converters.
temp = """a,b,c,d
           1,1,1,1.0"""

# Swap io.StringIO(temp) for a file name once the experiment works.
decimal_columns = {name: D.Decimal for name in ("c", "d")}
df = pd.read_csv(
    io.StringIO(temp),
    dtype={"a": int, "b": float},
    converters=decimal_columns,
)

# Print the Python type that each column's value ends up with, row by row.
for index, row in df.iterrows():
    print(*(type(row[name]) for name in ("a", "b", "c", "d")))

<class 'int'> <class 'float'> <class 'decimal.Decimal'> <class 'decimal.Decimal'>


## 2. Pandas NA
- https://note.nkmk.me/en/python-pandas-nan-none-na/

## 3. Python csv to parquet
- https://www.quora.com/How-do-I-convert-CSV-to-parquet-using-Python-and-without-using-Spark
- Chunk Example: https://stackoverflow.com/questions/26124417/how-to-convert-a-csv-file-to-parquet

In [134]:
## 4. Glue

In [135]:
# {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#             "Sid": "Statement1",
#             "Effect": "Allow",
#             "Principal": "*",
#             "Action": "s3:*",
#             "Resource": "arn:aws:s3:::mentha-query-datastore/*"
#         }
#     ]
# }

## 5. Read parquet file metadata
- https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_metadata.html

In [215]:
# s3_client.list_objects(Bucket="mentha-athena-datastore")