# Delete databases, tables, partitions and their data

`pydbtools` now has functions to remove databases, database tables, and partitions, plus the underlying data on S3.

## Setup

First run the following cells to set up the database tables.

Import the necessary libraries.

In [1]:
import os
import pandas as pd
import awswrangler as wr
import pydbtools as pydb

Set up your testing area. 

**Important:** substitute your own GitHub username below.

In [2]:
# Personal testing area: set this to your own GitHub username.
gh_username = "mratford"

# Lower-case the name and swap hyphens for underscores; the result is reused
# below to build both the S3 prefix and the database name.
foldername = gh_username.lower().replace("-", "_")

In [3]:
bucketname = "alpha-everyone"
s3_base_path = f"s3://{bucketname}/{foldername}/"

db_name = f"aws_example_{foldername}"
# Derived from s3_base_path so the two prefixes cannot drift apart.
source_db_base_path = f"{s3_base_path}source_db/"

# Start from a clean slate: delete every S3 object already under the testing area.
if wr.s3.list_objects(s3_base_path):
    print("deleting objs")
    wr.s3.delete_objects(s3_base_path)

# Delete the database if it is left over from a previous run.
# limit=None asks for the full listing rather than a capped one.
df_dbs = wr.catalog.databases(limit=None)
if db_name in df_dbs["Database"].to_list():
    print(f"{db_name} found deleting")
    wr.catalog.delete_database(name=db_name)

# Set up the source database.
wr.catalog.create_database(db_name)

# Iterate through the unpartitioned tables in data/ and write them to our db
# using awswrangler.
for table_name in ["department", "employees"]:
    df = pd.read_csv(f"data/{table_name}.csv")
    # S3 URIs always use "/" as the separator, so build the path with plain
    # string formatting instead of os.path.join (which is OS-dependent).
    table_path = f"{source_db_base_path}{table_name}/"
    wr.s3.to_parquet(
        df=df,
        path=table_path,
        index=False,
        dataset=True,  # True allows the other params below i.e. overwriting to db.table
        database=db_name,
        table=table_name,
        mode="overwrite",
    )

# For the sales table partition the data by employee_id and qtr
# (reduce the size of the table for legibility).
df = pd.read_csv("data/sales.csv").query('employee_id < 5')
# Trailing slash kept consistent with the table paths written above.
table_path = f"{source_db_base_path}sales/"
partition_info = wr.s3.to_parquet(
    df=df,
    path=table_path,
    index=False,
    dataset=True,
    partition_cols=["employee_id", "qtr"],
    database=db_name,
    table="sales",
    mode="overwrite",
)

## Deleting a table

Show the tables in the database.

In [4]:
# Materialise the catalog listing once so later cells can reuse table_info.
catalog_tables = wr.catalog.get_tables(database=db_name)
table_info = list(catalog_tables)
[tbl["Name"] for tbl in table_info]

['department', 'employees', 'sales']

Show the data for the `department` table.

In [5]:
# Pick the catalog entry for the department table out of the listing above.
dept_info = next(tbl for tbl in table_info if tbl["Name"] == "department")

# The storage descriptor records where the table's data lives on S3.
storage = dept_info["StorageDescriptor"]
dept_location = storage["Location"]

# Show the objects currently stored at that location.
wr.s3.list_objects(dept_location)

['s3://alpha-everyone/mratford/source_db/department/d998e99f143d4a6aab4e9395f550fdaf.snappy.parquet']

Now delete the `department` table.

In [6]:
# Remove the department table from the database together with its data.
pydb.delete_table_and_data(
    database=db_name,
    table="department",
)

Check that it's no longer in the database.

In [7]:
# Refresh the listing; department should no longer appear.
table_info = [*wr.catalog.get_tables(database=db_name)]
[entry["Name"] for entry in table_info]

['employees', 'sales']

Check that the data no longer exists.

In [8]:
# An empty list here means the table's S3 objects were removed as well.
wr.s3.list_objects(path=dept_location)

[]

## Deleting a partition

Show the partitions from the `sales` table.

In [9]:
# Maps each partition's S3 location to its [employee_id, qtr] values.
wr.catalog.get_partitions(
    database=db_name,
    table="sales",
)

{'s3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=2/': ['2',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=1/': ['1',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=3/': ['1',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=2/': ['4',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=2/': ['1',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=1/': ['3',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=2/': ['3',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=4/': ['2',
  '4'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=3/': ['2',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=4/': ['1',
  '4'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=4/': ['3',
  '4'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=3/': ['3',
  '3'],
 ...}


Check the data for one of the partitions.

In [10]:
# S3 prefix holding employee 1's quarter 4 partition.
qtr4_prefix = f"{source_db_base_path}sales/employee_id=1/qtr=4/"
wr.s3.list_objects(qtr4_prefix)

['s3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=4/7ce1d90a1ee147b3ab68954f52f26037.snappy.parquet']

Use an SQL-like expression to delete the partitions and data for quarter 4.

In [11]:
# Drop every partition whose qtr value is 4, along with its data.
pydb.delete_partitions_and_data(
    database=db_name,
    table="sales",
    expression="qtr = 4",
)

Check that the partition no longer exists.

In [12]:
# No qtr=4 location should be left in the partition listing.
wr.catalog.get_partitions(
    table="sales",
    database=db_name,
)

{'s3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=2/': ['2',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=1/': ['1',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=3/': ['1',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=2/': ['4',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=2/': ['1',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=1/': ['3',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=2/': ['3',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=3/': ['2',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=3/': ['3',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=1/': ['2',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=3/': ['4',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=1/': ['4',
  '1']}


Check that the data no longer exists.

In [13]:
# The same prefix inspected before the deletion; it should now be empty.
wr.s3.list_objects(
    path=f"{source_db_base_path}sales/employee_id=1/qtr=4/"
)

[]

Using a more complex query, delete quarters 1 and 2 for employee 3.

In [14]:
# Expressions can combine several partition columns: this one selects
# quarters 1 and 2 of employee 3.
emp3_early_quarters = "employee_id = 3 and qtr < 3"
pydb.delete_partitions_and_data(
    database=db_name,
    table="sales",
    expression=emp3_early_quarters,
)

# Show which partitions remain.
wr.catalog.get_partitions(database=db_name, table="sales")

{'s3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=2/': ['2',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=1/': ['1',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=3/': ['1',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=2/': ['4',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=1/qtr=2/': ['1',
  '2'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=3/': ['2',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=3/qtr=3/': ['3',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=2/qtr=1/': ['2',
  '1'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=3/': ['4',
  '3'],
 's3://alpha-everyone/mratford/source_db/sales/employee_id=4/qtr=1/': ['4',
  '1']}

See [the documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_partitions) for details on the expression syntax.

## Deleting a database

In [15]:
# The database should be listed in the catalog at this point.
# limit=None requests the full listing; with the default limit the check
# could miss db_name when the catalog holds many databases.
db_name in wr.catalog.databases(limit=None)["Database"].to_list()

True

In [16]:
# Delete the whole database, including the underlying data on S3.
pydb.delete_database_and_data(db_name)

In [17]:
# After the deletion the database must no longer be listed.
# limit=None requests the full listing, so a False result really means the
# database is gone rather than merely cut off by the default listing limit.
db_name in wr.catalog.databases(limit=None)["Database"].to_list()

False

In [18]:
# An empty list confirms that no objects are left under the database prefix.
wr.s3.list_objects(path=source_db_base_path)

[]

In [19]:
# Display the S3 prefix that was checked in the previous cell.
source_db_base_path

's3://alpha-everyone/mratford/source_db/'

## Deleting temporary database tables

It might be useful during development to get rid of the temporary database or one of its tables if something has gone wrong. This can be accomplished by using `__temp__` as the database name in one of the functions above.

In [20]:
# Set up the source database again (it was deleted in the previous section).
# exist_ok=True: do not fail if the database already exists.
wr.catalog.create_database(db_name, exist_ok=True)

# Iterate through the tables in data/ and write them to our db using awswrangler.
for table_name in ["department", "employees", "sales"]:
    df = pd.read_csv(f"data/{table_name}.csv")
    # S3 URIs always use "/" as the separator, so build the path with plain
    # string formatting instead of os.path.join (which is OS-dependent).
    table_path = f"{source_db_base_path}{table_name}/"
    wr.s3.to_parquet(
        df=df,
        path=table_path,
        index=False,
        dataset=True,  # True allows the other params below i.e. overwriting to db.table
        database=db_name,
        table=table_name,
        mode="overwrite",
    )

# Total sales per employee, stored as a temporary table named total_sales.
sql = f"""
SELECT employee_id, sum(sales) as total_sales
FROM {db_name}.sales
GROUP BY employee_id
"""
print(sql)
pydb.create_temp_table(sql, table_name="total_sales")


SELECT employee_id, sum(sales) as total_sales
FROM aws_example_mratford.sales
GROUP BY employee_id



In [21]:
# Temporary tables are addressed through the __temp__ database name.
temp_table_query = "select * from __temp__.total_sales"
pydb.read_sql_query(temp_table_query)

Unnamed: 0,employee_id,total_sales
0,47,1489.52
1,24,2248.35
2,35,2624.64
3,18,1759.39
4,42,1688.76
5,43,2073.25
6,45,2778.84
7,28,2071.77
8,1,2911.65
9,101,817.45


In [22]:
# Passing "__temp__" as the database name targets the temporary database,
# so this removes it together with its tables.
pydb.delete_database_and_data("__temp__")

In [23]:
# Querying the deleted temporary table is expected to fail; a successful
# query means the temporary database was not removed.
try:
    df = pydb.read_sql_query("select * from __temp__.total_sales")
except wr.exceptions.QueryFailed:
    print("Query failed correctly.")
else:
    print("Error, temporary database not deleted correctly.")

Query failed correctly.
