In [1]:
import pandas as pd
import duckdb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Define (but do not run yet) the DuckDB scripts that build the demo tables.
# Each script drops and recreates its table, inserts the sample rows, and
# ends with a SELECT of the whole table.
customer_table_creation_query = """
drop table if exists customers;

create table customers (customer_id int, customer_name varchar);

insert into customers values
  (1, 'Thomas'),
  (2, 'Thierry'),
  (3, 'Marc');
SELECT * FROM customers
"""

# Same pattern for orders; note that no order is inserted for customer_id 2.
order_table_creation_query = """
drop table if exists orders;

create table orders (customer_id int, product varchar);

insert into orders values
  (1, 'chocolate'),
  (3, 'tea');
SELECT * FROM orders
"""

In [3]:
# Run each creation script in DuckDB (customers first, then orders) and pull
# the rows returned by its trailing SELECT into a pandas DataFrame.
customers, orders = (
    duckdb.sql(creation_script).df()
    for creation_script in (
        customer_table_creation_query,
        order_table_creation_query,
    )
)

# 1. Inner join

## 1a. SQL

In [4]:
# Inner join: only customers that have a matching row in orders are kept.
# USING(customer_id) joins on the column both tables share, which is why
# customer_id can be selected without a table prefix.
sql_query = """ 
    SELECT 
        customer_id,
        customer_name,
        product
    FROM customers
    INNER JOIN orders USING(customer_id)
    """

# Last expression of the cell, so the query result is displayed.
duckdb.sql(sql_query)

┌─────────────┬───────────────┬───────────┐
│ customer_id │ customer_name │  product  │
│    int32    │    varchar    │  varchar  │
├─────────────┼───────────────┼───────────┤
│           1 │ Thomas        │ chocolate │
│           3 │ Marc          │ tea       │
└─────────────┴───────────────┴───────────┘

## 1b. Pandas

In [5]:
# pandas equivalent of the SQL inner join: keep only customer_id values
# present in both frames.
customers.merge(orders, on="customer_id", how="inner")

Unnamed: 0,customer_id,customer_name,product
0,1,Thomas,chocolate
1,3,Marc,tea


# 2. Left join

## 2a. SQL

In [6]:
# Left join: every customer is kept, and product is NULL when the customer
# has no matching order. The query has no ORDER BY, so the row order of the
# result is not guaranteed and may differ from the pandas output.
sql_query = """ 
    SELECT 
        customer_id,
        customer_name,
        product
    FROM customers
    LEFT JOIN orders USING(customer_id)
    """

# Last expression of the cell, so the query result is displayed.
duckdb.sql(sql_query)

┌─────────────┬───────────────┬───────────┐
│ customer_id │ customer_name │  product  │
│    int32    │    varchar    │  varchar  │
├─────────────┼───────────────┼───────────┤
│           1 │ Thomas        │ chocolate │
│           3 │ Marc          │ tea       │
│           2 │ Thierry       │ NULL      │
└─────────────┴───────────────┴───────────┘

## 2b. Pandas

In [7]:
# pandas equivalent of the SQL left join: every row of customers is kept (in
# its original order) and product is missing where no order matches.
customers.merge(orders, on="customer_id", how="left")

Unnamed: 0,customer_id,customer_name,product
0,1,Thomas,chocolate
1,2,Thierry,
2,3,Marc,tea


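# 3. Caution with pandas

In SQL, `NULL` never equals `NULL`, so rows with a missing join key are never matched. `pd.merge` behaves differently: rows whose key is missing on both sides are matched against each other, as the example below shows.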

In [12]:
# Redefine both creation scripts (nothing is executed here) with one extra
# row per table whose customer_id is NULL, i.e. a row with a missing join key.
customer_table_creation_query = """
drop table if exists customers;

create table customers (customer_id int, customer_name varchar);

insert into customers values
  (1, 'Thomas'),
  (2, 'Thierry'),
  (3, 'Marc'),
  (NULL, 'Alice');
SELECT * FROM customers
"""

# Orders likewise gains a row with a NULL customer_id.
order_table_creation_query = """
drop table if exists orders;

create table orders (customer_id int, product varchar);

insert into orders values
  (1, 'chocolate'),
  (3, 'tea'),
  (NULL, 'water');
SELECT * FROM orders
"""

In [13]:
# Re-run the (redefined) creation scripts in DuckDB, customers first and then
# orders, and replace both DataFrames with the rows each script returns.
customers, orders = (
    duckdb.sql(creation_script).df()
    for creation_script in (
        customer_table_creation_query,
        order_table_creation_query,
    )
)

In [14]:
# Pitfall: pandas matches missing join keys with each other, so the customer
# with no customer_id (Alice) is paired with the order with no customer_id
# (water). A SQL inner join would drop both rows, since NULL never equals NULL.
customers.merge(orders, on="customer_id", how="inner")

Unnamed: 0,customer_id,customer_name,product
0,1.0,Thomas,chocolate
1,3.0,Marc,tea
2,,Alice,water


# Summary

<img src="assets/join.png" alt="Join summary" width="1000" />
