# DuckDB Python API

- Load the Sakila data the same way as via the terminal, but from Python


In [5]:
import duckdb
from pathlib import Path

In [6]:
# Location of the DuckDB database file that this cell (re)builds.
duckdb_path = "data/sakila.duckdb"

# Make sure the target folder exists, then delete any previous database file
# so that every run of this cell rebuilds the database from scratch.
Path(duckdb_path).parent.mkdir(parents=True, exist_ok=True)
Path(duckdb_path).unlink(missing_ok=True)

# Python counterpart of ingesting via the terminal: duckdb ..... < sql/ingest..
# Both context managers (connection and script file) are closed on exit.
# The script is read as UTF-8 explicitly instead of relying on the
# platform-default encoding.
with duckdb.connect(duckdb_path) as conn, open("sql/load_sakila.sql", encoding="utf-8") as ingest_scripts:
    # run the content of the ingestion script against the new database
    conn.sql(ingest_scripts.read())
    # materialise query results as pandas DataFrames while the connection is open
    description = conn.sql("DESC;").df()
    film = conn.sql("FROM film;").df()


# check out the created dataframes
film.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2021-03-06 15:52:00
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2021-03-06 15:52:00
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2021-03-06 15:52:00
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2021-03-06 15:52:00
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2021-03-06 15:52:00


In [7]:
# preview the first three rows of the DESC result (one row per table)
description.head(3)

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,sakila,main,actor,"[actor_id, first_name, last_name, last_update]","[DOUBLE, VARCHAR, VARCHAR, TIMESTAMP]",False
1,sakila,main,address,"[address_id, address, address2, district, city...","[BIGINT, VARCHAR, VARCHAR, VARCHAR, BIGINT, VA...",False
2,sakila,main,category,"[category_id, name, last_update]","[BIGINT, VARCHAR, TIMESTAMP]",False


## Read all the tables into DataFrames


In [8]:
# Gather every entry listed in the DESC result into a single dictionary:
# key = the entry's name, value = its full content as a pandas DataFrame.
with duckdb.connect(duckdb_path) as conn:
    dfs = {
        # the "name" column of `description` provides the names to query
        name: conn.sql(f"FROM {name};").df()
        for name in description["name"]
    }

In [9]:
# check the resulting dictionary

# list the keys to see how many DataFrames were loaded and what they are called
dfs.keys()

dict_keys(['actor', 'address', 'category', 'city', 'country', 'customer', 'customer_list', 'film', 'film_actor', 'film_category', 'film_list', 'film_text', 'inventory', 'language', 'payment', 'rental', 'sales_by_film_category', 'sales_by_store', 'staff', 'staff_list', 'store'])

In [10]:
# look up a single DataFrame by its table name
dfs["actor"]

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1.0,PENELOPE,GUINESS,2021-03-06 15:51:59
1,2.0,NICK,WAHLBERG,2021-03-06 15:51:59
2,3.0,ED,CHASE,2021-03-06 15:51:59
3,4.0,JENNIFER,DAVIS,2021-03-06 15:51:59
4,5.0,JOHNNY,LOLLOBRIGIDA,2021-03-06 15:51:59
...,...,...,...,...
195,196.0,BELA,WALKEN,2021-03-06 15:52:00
196,197.0,REESE,WEST,2021-03-06 15:52:00
197,198.0,MARY,KEITEL,2021-03-06 15:52:00
198,199.0,JULIA,FAWCETT,2021-03-06 15:52:00


In [None]:
# each value for each key in the dictionary is a pandas DataFrame

type(dfs["actor"])

pandas.core.frame.DataFrame

In [14]:
# preview only the first three rows of the actor DataFrame
dfs["actor"].head(3)

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1.0,PENELOPE,GUINESS,2021-03-06 15:51:59
1,2.0,NICK,WAHLBERG,2021-03-06 15:51:59
2,3.0,ED,CHASE,2021-03-06 15:51:59


## Register the DataFrames in DuckDB

- registered DataFrames can be queried and joined with SQL


In [None]:
# 1st parameter - name under which the table is registered (string)
# 2nd parameter - source of the table (here: a pandas DataFrame)
duckdb.register("actor", dfs["actor"])

In [20]:
# Register several DataFrames in one go instead of writing one call per table
table_names = [
    "film",
    "film_actor",
    "film_category",
    "actor",
    "category",
]

for registered_name in table_names:
    # the DataFrame is registered under the same name it has in `dfs`
    duckdb.register(registered_name, dfs[registered_name])

# list what is now registered
duckdb.sql("DESC;").df()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,temp,main,actor,"[actor_id, first_name, last_name, last_update]","[DOUBLE, VARCHAR, VARCHAR, TIMESTAMP]",True
1,temp,main,category,"[category_id, name, last_update]","[BIGINT, VARCHAR, TIMESTAMP]",True
2,temp,main,film,"[film_id, title, description, release_year, la...","[BIGINT, VARCHAR, VARCHAR, VARCHAR, BIGINT, BI...",True
3,temp,main,film_actor,"[actor_id, film_id, last_update]","[BIGINT, BIGINT, TIMESTAMP]",True
4,temp,main,film_category,"[film_id, category_id, last_update]","[BIGINT, BIGINT, TIMESTAMP]",True


## Join the registered tables


In [19]:
# Combine the registered tables into one wide DataFrame: one row per
# film / actor / category match. The LEFT JOINs keep every film, including
# films that have no matching actor or category.
#
# The actor name is built with `||` rather than CONCAT: CONCAT skips NULL
# arguments, so a film without an actor would get the string ' ' as its actor,
# while `||` propagates the NULL and leaves the name missing.
#
# actor_id is stored as DOUBLE in the actor table, hence the cast to INT.
films_joined = duckdb.sql("""
    SELECT
        a.first_name || ' ' || a.last_name AS actor,
        a.actor_id::INT AS actor_id,
        f.title,
        f.description,
        f.release_year,
        f.rental_duration,
        f.rating,
        c.name AS category
    FROM film f
    LEFT JOIN film_actor fa ON f.film_id = fa.film_id
    LEFT JOIN actor a ON a.actor_id = fa.actor_id
    LEFT JOIN film_category fc ON fc.film_id = f.film_id
    LEFT JOIN category c ON fc.category_id = c.category_id
""").df()

films_joined.head(2)

Unnamed: 0,actor,actor_id,title,description,release_year,rental_duration,rating,category
0,PENELOPE GUINESS,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,PG,Documentary
1,PENELOPE GUINESS,1,ANACONDA CONFESSIONS,A Lacklusture Display of a Dentist And a Denti...,2006,3,R,Animation


## EDA with Pandas

- a light introduction to pandas


## Visualization with pandas dataframe
