# Exporting Database Tables to S3 in Parquet.

This notebook demonstrates how to export multiple database tables in parallel to Parquet files in S3 using Spark.

Note:
1. Package dependencies: s3fs, pandas, pyarrow, and a database driver (pymysql for MySQL or pg8000 for Postgres).
2. This code uses pymysql. For Postgres, one can use pg8000.
3. Each output file is capped at a maximum number of records (for example 10M; the code below uses a chunk size of 50,000), which keeps container memory demands under control and Parquet file sizes within the recommended range of 128 MB to 1 GB.
4. For timestamp-based exports, use a timestamp column as a filter in the SQL queries that export the data.

In [3]:
# Display the Spark version this notebook was run against.
# NOTE(review): `spark` is not defined in any visible cell -- presumably the
# SparkSession injected by the notebook kernel; confirm before running elsewhere.
spark.version

'2.4.0'

In [2]:
import pymysql.cursors
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
from s3fs import S3FileSystem
from pyspark.sql import Row

def connect2DB(host,user,password,db):
    ''' Open a pymysql connection to schema `db` on `host` and return it.

    The connection uses the utf8mb4 charset and DictCursor as its default
    cursor class, so cursors created from it yield rows as dicts.
    '''
    # Collect the connection settings first so the call itself stays short.
    settings = {
        'host': host,
        'user': user,
        'password': password,
        'db': db,
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    connection = pymysql.connect(**settings)
    return connection

def runSQL(sql,limit=10):
    ''' Run `sql` on the module-level cursor `cur` and return the leading rows.

    Rows are collected in cursor order until the zero-based row index
    exceeds `limit`, so the returned list holds at most limit + 1 rows.
    '''
    cur.execute(sql)
    rows = []
    position = 0
    for record in cur:
        # Stop once the zero-based position has moved past the cap.
        if position > limit:
            break
        rows.append(record)
        position += 1
    return rows

def export2S3(host,user,password,db,tablename,s3Location,chunk_size=50000):
    ''' Export one table to S3 as a Parquet dataset, one file per chunk.

    The table is read page by page with LIMIT/OFFSET queries; every page is
    converted to an Arrow table and written under `s3Location` with snappy
    compression and dictionary encoding.

    Parameters:
        host, user, password, db: connection settings passed to connect2DB.
        tablename: table to export; interpolated into the SQL text as-is.
        s3Location: dataset root handed to pyarrow's write_to_dataset.
        chunk_size: maximum number of rows per query and per output file.

    Returns True once every page has been written.
    Raises ValueError if chunk_size is smaller than 1.

    NOTE(review): the query has no ORDER BY, so page boundaries rely on the
    server returning rows in a stable order between queries -- confirm this
    holds for the exported tables.
    '''
    if chunk_size < 1:
        # LIMIT 0 returns no rows and would never satisfy the exit condition.
        raise ValueError("chunk_size must be a positive integer")

    s3 = S3FileSystem()
    conn = connect2DB(host,user,password,db)
    try:
        offset = 0
        while True:
            sql = "Select * from %s limit %d offset %d"%(tablename,chunk_size,offset)
            df = pd.read_sql(sql,conn)
            row_count = df.shape[0]
            # Skip the empty trailing page produced when the row count is an
            # exact multiple of chunk_size; the very first page is still
            # written so an empty table produces an output file as before.
            if row_count > 0 or offset == 0:
                table = pa.Table.from_pandas(df)
                pq.write_to_dataset(table, s3Location, filesystem=s3,
                                    use_dictionary=True, compression='snappy')
            if row_count < chunk_size:
                break
            offset += chunk_size
    finally:
        # Release the database connection on success and on failure alike.
        conn.close()
    return True

In [4]:
# Connection settings -- replace the <...> placeholders before running.
host='aurora-3.abc.us-west-2.rds.amazonaws.com'
user='<user>'
password='<password>'
db='<schema>'
bucket='<bucket>'

# connect to database; `cur` is the module-level cursor that runSQL() reads
conn=connect2DB(host,user,password,db)
cur = conn.cursor(pymysql.cursors.DictCursor)

# get ALL tables in database. runSQL() caps its result through `limit`
# (default 10), which would silently drop tables on larger schemas, so the
# full result set is fetched straight from the cursor instead.
cur.execute("Show tables")
output=cur.fetchall()

# each row is a single-entry dict; its only value is the table name
tables=[list(t.values())[0] for t in output]
tables

['CUSTOMER', 'CUSTOMER_DIM', 'CUSTOMER_SITE', 'PRODUCT', 'PRODUCT_CATEGORY', 'PRODUCT_DIM', 'SALES_ORDER', 'SALES_ORDER_ALL', 'SALES_ORDER_DETAIL', 'SALES_ORDER_FACT', 'SALES_ORDER_V']

In [6]:
def _export_table_row(row):
    ''' Export the table named in `row` to its own prefix under the bucket. '''
    name = row['table']
    target = 's3://{0}/mysql/{1}/{2}'.format(bucket,db,name)
    return export2S3(host,user,password,db,name,target)

# One single-column row per table name.
df = sc.parallelize(tables).map(lambda name: Row(name)).toDF(["table"])
# repartition to get max parallelism from Spark cluster
df = df.repartition(10)
# Run the exports on the executors and gather one status flag per table.
df.rdd.map(_export_table_row).collect()

[True, True, True, True, True, True, True, True, True, True, True]