In [1]:
import duckdb

Example of Jupyter "magic command":

In [2]:
%pwd

'/Users/p_park/spring_2024/eds_213_data/bren-meds213-spring-2024-class-data/week3'

In [3]:
%ls

01_ASDN_Readme.txt        [0m[01;32mbuild-database[0m*          species.csv
ASDN_Bird_eggs.csv        database.db              sql-continued.sql
ASDN_Bird_nests.csv       database.db.wal          theloop.sh
ASDN_Camp_assignment.csv  personnel.csv            week6a_file.ipynb
[01;34mHW[0m/                       schema-build-script.sql  week6b_file.ipynb
addition.sh               script_2.sql
asdn-er-diagram.png       site.csv


To install Duckdb module:

In [4]:
#%pip install duckdb

1. Create a connection and a cursor

In [5]:
# Open a connection to the DuckDB database file in the current directory.
# The recorded output of this cell shows the call failing with an IO lock
# error because another process already held a lock on database.db.
conn = duckdb.connect("database.db")

IOException: IO Error: Could not set lock on file "/Users/p_park/spring_2024/eds_213_data/bren-meds213-spring-2024-class-data/week3/database.db": Conflicting lock is held in /opt/python/3.7.13/bin/python3.7 (PID 1647904). See also https://duckdb.org/docs/connect/concurrency

In [None]:
# Display the connection object as the cell output.
conn

In [None]:
# Create a cursor from the connection; the queries in the cells below are
# executed through it.
cur = conn.cursor()

Now let's do something with our cursor

In [None]:
# Send a query through the cursor; the rows are retrieved separately with
# one of the fetch calls shown below.
cur.execute("SELECT * FROM Site LIMIT 5")

Now we want results... three ways of getting them.

1. All results at once

In [None]:
# Retrieve every row of the most recently executed query at once.
cur.fetchall()

Cursors don't store anything; they just transfer queries to the database and get results back.

In [None]:
# Call fetchall() a second time without re-executing the query.
# NOTE(review): presumably this returns no rows because the previous call
# already consumed the result -- confirm against the cell output.
cur.fetchall()

In [None]:
# Query a single column so that each result row carries one value.
cur.execute("SELECT Nest_ID FROM Bird_nests LIMIT 10")

In [None]:
# Retrieve all rows of the Nest_ID query; each row is a tuple, even though
# only one column was selected.
cur.fetchall()

In [None]:
# Re-run the single-column query, then flatten the result rows into a plain
# list of Nest_ID values.
# How to read the comprehension: every row from cur.fetchall() is a
# one-element tuple; unpack it and keep that element.
cur.execute("SELECT Nest_ID FROM Bird_nests LIMIT 10")
[nest_id for (nest_id,) in cur.fetchall()]

2. Get the one result, or the next result

In [None]:
# A COUNT(*) query yields a single row, so fetchone() is enough; the result
# is still a row (a tuple), not a bare number.
cur.execute("SELECT COUNT(*) FROM Bird_nests")
cur.fetchone()

In [None]:
# Same query as the previous cell, but index into the returned row to get
# the count itself.
cur.execute("SELECT COUNT(*) FROM Bird_nests")
cur.fetchone()[0]

3. Using an iterator - but Duckdb doesn't support iterators :(

In [None]:
# Demonstration: the usual DB-API idiom of looping directly over the cursor
# does not work with DuckDB. The failure is caught and displayed so that this
# cell illustrates the problem without aborting a Restart & Run All.
cur.execute("SELECT Nest_ID FROM Bird_nests LIMIT 10")
try:
    for row in cur:
        print(f"got {row[0]}")
except TypeError as err:
    # this does not work: the cursor object does not support iteration
    print(f"Iterating over the cursor failed: {err}")

In [None]:
# Workaround for the missing iterator support: pull rows one at a time with
# fetchone() and stop when it returns None instead of a row.
cur.execute("SELECT Nest_ID FROM Bird_nests LIMIT 10")
while True:
    row = cur.fetchone()
    if row is None:  # identity test is the idiomatic way to check for None
        break
    # do something with row
    print(f"got nest ID {row[0]}")

Can do things other than SELECT!

In [None]:
# execute() is not limited to SELECT: here it creates a temporary table
# populated with ten rows of Bird_nests.
# NOTE(review): re-running this cell in the same session presumably fails
# because temp_table already exists -- confirm.
cur.execute("""
    CREATE TEMP TABLE temp_table AS
    SELECT * FROM Bird_nests LIMIT 10
""")

In [None]:
# Query the temporary table created in the previous cell.
cur.execute("SELECT * FROM temp_table")

In [None]:
# Retrieve the rows of temp_table selected by the previous cell.
cur.fetchall()

A note on fragility

For example:
INSERT INTO Site VALUES ('abcd', 'FOO', 35.7, -119.5, '?')

A less fragile way of expressing the same thing:
INSERT INTO Site (Code, Site_name, Latitude, Longitude, Something_else)
    VALUES ('abcd', 'FOO', 35.7, -119.5, '?')
    
In the same vein: SELECT * is fragile

In [None]:
# Fragile form: SELECT * returns whatever columns the table has, in table
# order, so code that indexes into the rows depends on the table definition.
cur.execute("SELECT * FROM Site LIMIT 3")
cur.fetchall()

In [None]:
# Less fragile form: name the columns explicitly so the shape and order of
# each returned row is fixed by the query rather than by the table definition.
# The column list must not end with a comma before FROM; a trailing comma is
# not standard SQL and is only accepted by lenient parsers.
cur.execute("SELECT Site_name, Code, Latitude, Longitude FROM Site LIMIT 3")
cur.fetchall()

An extended example: Question we're trying to answer: How many nests do we have for each species?

Approach: first get all species. Then execute a count query for each species.

A digression: string interpolation in Python

In [None]:
# Three ways of interpolating values into a string.

# 1. %-formatting with a single placeholder and a single value.
s = "My name is %s"
print(s % "Patty")

#    With several placeholders, the values are supplied as a tuple.
s = "My name is %s and my professor's name is %s"
print(s % ("Patty", "Greg"))

# 2. f-string: the expression in braces is evaluated in place.
name = "Patty"
print(f"My name is {name}")
# 3. str.format(): each {} is filled from the arguments in order.
print("My name is {}".format("Patty"))

In [None]:
# Count the nests for each of the first three species codes.
# This version builds every COUNT query by %-interpolating the species code
# into the SQL text. It is kept on purpose as the fragile approach; the
# parameter-binding version is the one to prefer.
query = """
    SELECT COUNT(*) FROM Bird_nests
    WHERE Species = '%s'
"""
cur.execute("SELECT Code FROM Species LIMIT 3")
# One secondary cursor serves every per-species query; the finally clause
# closes it even if one of the queries raises.
cur2 = conn.cursor()
try:
    for row in cur.fetchall():  # duckdb workaround: the cursor is not iterable
        code = row[0]
        prepared_query = query % code
        cur2.execute(prepared_query)
        print(f"Species {code} has {cur2.fetchone()[0]} nests")
finally:
    cur2.close()

In [None]:
# Count the nests for each of the first three species codes.
# The query contains a ? placeholder and the species code is passed to
# execute() as a bind parameter, so no manual string interpolation or
# quoting is needed.
query = """
    SELECT COUNT(*) FROM Bird_nests
    WHERE Species = ?
"""
cur.execute("SELECT Code FROM Species LIMIT 3")
# One secondary cursor serves every per-species query; the finally clause
# closes it even if one of the queries raises.
cur2 = conn.cursor()
try:
    for row in cur.fetchall():  # duckdb workaround: the cursor is not iterable
        code = row[0]
        cur2.execute(query, [code])  # <---- value supplied as a bind parameter
        print(f"Species {code} has {cur2.fetchone()[0]} nests")
finally:
    cur2.close()

Let's illustrate the danger with a different example

In [None]:
# Insert a row by %-interpolating the values into the SQL text. The values
# are wrapped in single quotes by the template, so this only produces valid
# SQL when the values themselves contain no single quote.
# NOTE(review): re-running this cell presumably inserts the row again (or
# fails if Abbreviation is a key) -- confirm.
abbrev = "TS"
name = "Taylor Swift"
cur.execute("""
    INSERT INTO Personnel (Abbreviation, Name)
    VALUES ('%s', '%s')
    """ % (abbrev, name)
           )

In [None]:
# Show the last three rows returned for Personnel to check the insert.
# NOTE(review): the query has no ORDER BY, so which rows come last is not
# guaranteed by the SQL itself.
cur.execute("SELECT * FROM Personnel")
cur.fetchall()[-3:]

In [None]:
# Demonstration of the danger of string-built SQL: the apostrophe in the name
# ends the quoted SQL literal early, so the interpolated statement is no
# longer valid SQL. The database error is caught and displayed so that this
# cell illustrates the failure without aborting a Restart & Run All.
abbrev = "CO"
name = "Conan O'Brien"
insert_sql = """
    INSERT INTO Personnel (Abbreviation, Name)
    VALUES ('%s', '%s')
    """ % (abbrev, name)
try:
    cur.execute(insert_sql)
except duckdb.Error as err:
    print(f"INSERT failed: {err}")

In [None]:
# Show the last three rows returned for Personnel again, to see whether the
# previous INSERT had any effect.
# NOTE(review): the query has no ORDER BY, so which rows come last is not
# guaranteed by the SQL itself.
cur.execute("SELECT * FROM Personnel")
cur.fetchall()[-3:]

In [None]:
# NOTE(review): this cell applies the % string-formatting operator to the
# query text instead of passing the list as a second argument to execute().
# execute() therefore receives a single argument and the ? placeholders are
# never bound, so this call is expected to fail. The corrected form, with
# the values passed as a separate argument, appears in a later cell.
abbrev = "CO"
name = "Conan O'Brien"
cur.execute("""
    INSERT INTO Personnel (Abbreviation, Name)
    VALUES (?, ?)
    """ % [abbrev, name]
           )

In [None]:
"""
   INSERT INTO Personnel (Abbreviation, Name)
   VALUES ('%s', '%s')
   """ % (abbrev, name)

In [None]:
# Safe version: the SQL text contains only ? placeholders and the values are
# passed to execute() as a separate list, so the apostrophe in the name never
# becomes part of the SQL text.
# NOTE(review): re-running this cell presumably inserts the row again (or
# fails if Abbreviation is a key) -- confirm.
abbrev = "CO"
name = "Conan O'Brien"
cur.execute("""
   INSERT INTO Personnel (Abbreviation, Name)
   VALUES (?, ?)
   """,
    [abbrev, name])

In [None]:
# Show the last three rows returned for Personnel to check the
# parameterized insert.
# NOTE(review): the query has no ORDER BY, so which rows come last is not
# guaranteed by the SQL itself.
cur.execute("SELECT * FROM Personnel")
cur.fetchall()[-3:]