# Create Temporary Tables

This tutorial demonstrates how to create temporary tables in Athena using pydbtools.

## Setup

Just run this script to create the source database so we can use it for our example.

In [None]:
import os
import pandas as pd
import awswrangler as wr
import pydbtools as pydb

In [None]:
# Set up your own testing area: put your GitHub username here.
foldername = "mratford"  # GH username
# Normalise the name (hyphens become underscores, everything lower-case)
# because it is embedded in the S3 path and the database name further down.
foldername = foldername.replace("-", "_").lower()

In [None]:
# Names and S3 locations used throughout this tutorial.
bucketname = "alpha-everyone"
s3_base_path = f"s3://{bucketname}/{foldername}/"

db_name = f"aws_example_{foldername}"
# The source tables live in a sub-folder of our own area, so derive the path
# from s3_base_path instead of spelling the bucket/folder prefix out twice.
source_db_base_path = f"{s3_base_path}source_db/"

# Start from a clean slate: delete all the s3 files under our base path
if wr.s3.list_objects(s3_base_path):
    print("deleting objs")
    wr.s3.delete_objects(s3_base_path)

# Delete the database if it exists (left over from a previous run)
df_dbs = wr.catalog.databases(None)
if db_name in df_dbs["Database"].to_list():
    print(f"{db_name} found deleting")
    wr.catalog.delete_database(name=db_name)

# Setup source database
# Create the database
wr.catalog.create_database(db_name)

# Iterate through the tables in data/ and write them to our db using awswrangler
for table_name in ["department", "employees", "sales"]:
    df = pd.read_csv(f"data/{table_name}.csv")
    # S3 URIs always use "/" as the separator, so build the table location
    # with plain string formatting. os.path.join is meant for local filesystem
    # paths and picks its separator from the operating system.
    # source_db_base_path already ends with "/".
    table_path = f"{source_db_base_path}{table_name}"
    wr.s3.to_parquet(
        df=df,
        path=table_path,
        index=False,
        dataset=True,  # True allows the other params below i.e. overwriting to db.table
        database=db_name,
        table=table_name,
        mode="overwrite",
    )

## Task

We are going to create a table that shows total sales per employee using all 3 tables

In [None]:
# Preview the first five rows of the employees source table.
pydb.read_sql_query(
    f"SELECT * FROM {db_name}.employees LIMIT 5",
    ctas_approach=False,
)

In [None]:
# Preview the first five rows of the department source table.
pydb.read_sql_query(
    f"SELECT * FROM {db_name}.department LIMIT 5",
    ctas_approach=False,
)

In [None]:
# Preview the first five rows of the sales source table.
pydb.read_sql_query(
    f"SELECT * FROM {db_name}.sales LIMIT 5",
    ctas_approach=False,
)

pydbtools provides a `create_temp_table` function that lets you create tables which you can then refer to through the `__temp__` database.

**First create a total_sales table:**

In [None]:
# Aggregate the sales table down to one row per employee, summing their sales.
sql = f"""
SELECT employee_id, sum(sales) as total_sales
FROM {db_name}.sales
GROUP BY employee_id
"""
# Show the rendered SQL (with db_name substituted) before running it.
print(sql)
# Store the query result as the temporary table `total_sales`; the final
# query in this tutorial reads it back as `__temp__.total_sales`.
pydb.create_temp_table(sql, table_name="total_sales")

**Then create a table of employee names from the sales department:**

In [None]:
# Join employees to their department and keep only department_id = 1.
# NOTE(review): department 1 is presumably the sales department (per the
# heading above) -- confirm against data/department.csv.
sql = f"""
SELECT e.employee_id, e.forename, e.surname, d.department_name
FROM {db_name}.employees AS e
LEFT JOIN {db_name}.department AS d
ON e.department_id = d.department_id
WHERE e.department_id = 1
"""
# Show the rendered SQL (with db_name substituted) before running it.
print(sql)
# Store the query result as the temporary table `sales_employees`; the final
# query in this tutorial reads it back as `__temp__.sales_employees`.
pydb.create_temp_table(sql, table_name="sales_employees")

**Finally, join the two temporary tables to return our final table:**

In [None]:
# Combine the two temporary tables: every column of sales_employees plus the
# matching total_sales figure. The INNER JOIN keeps only employees that have
# a row in total_sales.
# The f-prefix is kept from the original; this query has no placeholders.
sql = f"""
SELECT se.*, ts.total_sales
FROM __temp__.sales_employees AS se
INNER JOIN __temp__.total_sales AS ts
ON se.employee_id = ts.employee_id
"""
# Show the SQL before running it.
print(sql)
# Run the query and display at most the first 10 rows of the result.
pydb.read_sql_query(sql, ctas_approach=False).head(10)

### Creating a temporary table from a dataframe

You can also use an existing dataframe as a table in the temporary database and run SQL queries on it.

In [None]:
# Load the sales data from the local CSV file into a dataframe.
df = pd.read_csv("data/sales.csv")

# Make the dataframe available as the table `sales` in the temporary database.
pydb.dataframe_to_temp_table(df, "sales")

# Query the temporary table with SQL: total sales for each quarter.
pydb.read_sql_query(
    "select qtr, sum(sales) as sales from __temp__.sales group by qtr"
)

In [None]:
### Clean up

# Remove everything this tutorial wrote under our S3 prefix, if anything is left.
remaining_objects = wr.s3.list_objects(s3_base_path)
if remaining_objects:
    print("deleting objs")
    wr.s3.delete_objects(s3_base_path)

# Drop the example database from the catalog if it is still registered.
catalog_db_names = wr.catalog.databases(None)["Database"].to_list()
if db_name in catalog_db_names:
    print(f"{db_name} found deleting")
    wr.catalog.delete_database(name=db_name)