In [1]:
import boto3
import pandas as pd

# S3 bucket holding the data files and the key prefix they are stored under.
bucket = '<bucket>'
prefix = 'root'

# Column type lookup keyed by "<schema>.<table>.<column>".
# Swap this literal for an external lookup (for example a file kept in S3)
# when one is available. Values should be Athena supported types.
Type_Translation = {
    "project2.table1.col1": "string",
    "project2.table1.col2": "bigint",
    "project2.table1.col3": "bigint",
    "project2.table1.col4": "bigint",
    "project2.table1.col5": "bigint",
    "project2.table2.col1": "string",
    "project2.table2.col2": "bigint",
    "project2.table2.col3": "bigint",
    "project1.table3.col10": "string",
    "project1.table3.col20": "float",
    "project1.table3.col30": "float",
    "project1.table3.col40": "float",
    "project1.table3.col50": "float",
}

In [2]:
def create_database(glue_client,database_name):
    """Register a database called ``database_name`` in the Data Catalog.

    Only the name is supplied to Glue. Nothing is returned; any exception
    raised by the client propagates to the caller.
    """
    database_input = {'Name': database_name}
    glue_client.create_database(DatabaseInput=database_input)
        
def does_database_exist(glue_client,database_name):
    """Report whether ``database_name`` is present in the Data Catalog.

    The lookup is a ``get_database`` call: an EntityNotFoundException from
    the Glue client means the database is absent and yields False; a call
    that completes yields True. Any other exception propagates.
    """
    try:
        glue_client.get_database(Name=database_name)
    except glue_client.exceptions.EntityNotFoundException:
        found = False
    else:
        found = True
    return found
    
def does_table_exist(glue_client,table):
    """Report whether a table is present in the Data Catalog.

    ``table`` is an indexable whose first element is the database (schema)
    name and whose second element is the table name; further elements are
    ignored. An EntityNotFoundException from ``get_table`` yields False, a
    call that completes yields True, and any other exception propagates.
    """
    database_name, table_name = table[0], table[1]
    try:
        glue_client.get_table(DatabaseName=database_name,Name=table_name)
    except glue_client.exceptions.EntityNotFoundException:
        found = False
    else:
        found = True
    return found
    
def create_table(glue_client,bucket,table):
    """Create this table in the Data Catalog

    The table is registered as an external CSV table (comma delimited, one
    header line skipped) located at s3://<bucket>/<prefix>/<schema>/<table>,
    where ``prefix`` is the module-level setting.

    Parameters:
        glue_client: Glue client used to issue ``create_table``.
        bucket: name of the S3 bucket holding the data.
        table: 4-item sequence ``(schema, tablename, filename, columns)``.
            ``columns`` is the list of header names; each is stripped of
            surrounding whitespace before use. ``filename`` is not used here.

    Raises:
        KeyError: if one or more columns have no entry in ``Type_Translation``
            under the key "<schema>.<tablename>.<column>". All missing keys are
            listed in the message and nothing is sent to Glue.
    """
    schema=table[0]
    tablename=table[1]
    filepath="s3://"+'/'.join([bucket,prefix,schema,tablename])

    # Type lookup to build columns. Collect every unmapped column up front so a
    # single failure names all of them instead of only the first one hit.
    column_names=[k.strip() for k in table[3]]
    lookup_keys=['.'.join([schema,tablename,name]) for name in column_names]
    missing=[key for key in lookup_keys if key not in Type_Translation]
    if missing:
        raise KeyError("No type defined in Type_Translation for: %s" % ', '.join(missing))
    columns_array=[{'Name':name,'Type':Type_Translation[key]}
                   for name,key in zip(column_names,lookup_keys)]

    # Assuming CSV tables with headers in files.
    glue_client.create_table(
        DatabaseName=schema,
        TableInput={
            'Name': tablename,
            'StorageDescriptor': {
                'Columns': columns_array,
                'Location': filepath,
                'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
                'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                'Compressed': False,
                'SerdeInfo': {
                    'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                    'Parameters': {'field.delim': ',', 'serialization.format': ','}
                }
            },
            'TableType': "EXTERNAL_TABLE",
            'Parameters': {
                'classification': 'csv',
                'delimiter': ',',
                'skip.header.line.count': '1'
            }
        }
    )

In [3]:
# Get all files from S3 with size > 0
# and create tables in the Glue Catalog

s3 = boto3.client('s3')
glue = boto3.client('glue')

paginator = s3.get_paginator('list_objects_v2')

# Paginator ensures we can pull more than 1000 objects
page_iterator = paginator.paginate(Bucket=bucket,Prefix=prefix)

# Databases and tables already handled during this run. A table made of many
# files then costs one catalog lookup and one header read, not one per file.
known_databases = set()
known_tables = set()

for page in page_iterator:
    # A page that holds no keys has no 'Contents' entry
    for item in page.get('Contents', []):
        if item['Size'] <= 0:
            continue

        # Assuming bucket/prefix/schema/table/filename path in S3.
        # The listing prefix is a plain string match, so keys that are too
        # shallow, nested deeper, or under a sibling such as "<prefix>xyz/"
        # can show up here; they do not map to a table location and are skipped.
        key_tail = item['Key'].split("/")[-3:]
        if len(key_tail) != 3 or item['Key'] != '/'.join([prefix]+key_tail):
            print ("Skipping key outside prefix/schema/table/filename layout: %s"%item['Key'])
            continue
        schema,table,filename = key_tail

        # Create Database if not exists
        if schema not in known_databases:
            if not does_database_exist(glue,schema):
                create_database(glue,schema)
                print ("Database created: %s"%schema)
            known_databases.add(schema)

        # Create Table if not exists. Check the catalog before touching the
        # file so nothing is read for tables that are already defined.
        if (schema,table) in known_tables:
            continue
        if does_table_exist(glue,(schema,table)):
            known_tables.add((schema,table))
            continue

        # Only the header row is needed to obtain the column names
        df=pd.read_csv("s3://"+'/'.join([bucket,item['Key']]),nrows=0)
        columns=list(df.columns)

        # Table tuple
        t=(schema,table,filename,columns)

        create_table(glue,bucket,t)
        known_tables.add((schema,table))
        print ("Table created: %s"%(t[0]+'.'+t[1]))

Database created: project1
Table created: project1.table3
Database created: project2
Table created: project2.table1
Table created: project2.table2
