In [1]:
import pandas as pd
import duckdb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Setup script: drop and recreate the `product` table, load five sample rows,
# and finish with a SELECT so the last statement yields the table's contents.
# Nothing is executed here; the script only runs when it is passed to
# duckdb.sql() in the next cell.
table_creation_query = """
drop table if exists product;

create table product (product_id int, product_code varchar, price float);

insert into product values
  (1, 'PH.CHAR.A', 800),
  (2, 'TK.TAB.B', 600),
  (3, 'PH.COM.C', 400),
  (4, 'TK.FAC.A', 100),
  (5, 'TK.COM.C', 50);
  
SELECT * FROM product"""

In [3]:
# Execute the setup script, then convert the relation produced by its final
# SELECT into a pandas DataFrame (`df`) that the rest of the notebook filters.
product_relation = duckdb.sql(table_creation_query)
df = product_relation.df()

# 1. Filtering product codes that match a list of values

## 1a. SQL

In [4]:
# `df` in the FROM clause is the pandas DataFrame built above; DuckDB queries
# it by its Python variable name.
# IN keeps only the rows whose product_code equals one of the listed values.
sql_query = """ 
    SELECT 
        product_id,
        product_code,
        price
    FROM df
    WHERE product_code IN ('PH.CHAR.A', 'PH.COM.C')
    """

duckdb.sql(sql_query)

┌────────────┬──────────────┬───────┐
│ product_id │ product_code │ price │
│   int32    │   varchar    │ float │
├────────────┼──────────────┼───────┤
│          1 │ PH.CHAR.A    │ 800.0 │
│          3 │ PH.COM.C     │ 400.0 │
└────────────┴──────────────┴───────┘

## 1b. Pandas

In [5]:
# Pandas counterpart of SQL's IN: build a boolean mask that is True where
# product_code is one of the wanted codes, then select those rows.
wanted_codes = ['PH.CHAR.A', 'PH.COM.C']
code_in_list = df["product_code"].isin(wanted_codes)
df[code_in_list]

Unnamed: 0,product_id,product_code,price
0,1,PH.CHAR.A,800.0
2,3,PH.COM.C,400.0


# 2. Filtering product codes that contain 'COM'

## 2a. SQL

In [6]:
# `df` in the FROM clause is the pandas DataFrame built above.
# In LIKE, % matches any run of characters (including none), so '%COM%'
# keeps the product codes that contain 'COM' anywhere.
sql_query = """ 
    SELECT 
        product_id,
        product_code,
        price
    FROM df
    WHERE product_code LIKE '%COM%'"""

duckdb.sql(sql_query)

┌────────────┬──────────────┬───────┐
│ product_id │ product_code │ price │
│   int32    │   varchar    │ float │
├────────────┼──────────────┼───────┤
│          3 │ PH.COM.C     │ 400.0 │
│          5 │ TK.COM.C     │  50.0 │
└────────────┴──────────────┴───────┘

## 2b. Pandas

In [7]:
# Pandas counterpart of LIKE '%COM%': keep rows whose code contains "COM".
# regex=False makes this a literal substring test, as LIKE is. By default
# str.contains treats its pattern as a regular expression, which would misread
# characters such as the "." that appears in every product code.
df[df["product_code"].str.contains("COM", regex=False)]

Unnamed: 0,product_id,product_code,price
2,3,PH.COM.C,400.0
4,5,TK.COM.C,50.0


# 3. Filtering product codes that start with 'PH'

## 3a. SQL

In [8]:
# `df` in the FROM clause is the pandas DataFrame built above.
# 'PH%' anchors the pattern at the start: the code must begin with 'PH',
# and % then matches whatever follows.
sql_query = """ 
    SELECT 
        product_id,
        product_code,
        price
    FROM df
    WHERE product_code LIKE 'PH%'
    """

duckdb.sql(sql_query)

┌────────────┬──────────────┬───────┐
│ product_id │ product_code │ price │
│   int32    │   varchar    │ float │
├────────────┼──────────────┼───────┤
│          1 │ PH.CHAR.A    │ 800.0 │
│          3 │ PH.COM.C     │ 400.0 │
└────────────┴──────────────┴───────┘

## 3b. Pandas

In [9]:
# Pandas counterpart of LIKE 'PH%': keep rows whose code begins with "PH".
df.loc[df["product_code"].str.startswith("PH")]

Unnamed: 0,product_id,product_code,price
0,1,PH.CHAR.A,800.0
2,3,PH.COM.C,400.0


# 4. Filtering product codes that end with 'A'

## 4a. SQL

In [10]:
# `df` in the FROM clause is the pandas DataFrame built above.
# '%A' anchors the pattern at the end: % matches any leading characters,
# and the code must finish with 'A'.
sql_query = """ 
    SELECT 
        product_id,
        product_code,
        price
    FROM df
    WHERE product_code LIKE '%A'
    """

duckdb.sql(sql_query)

┌────────────┬──────────────┬───────┐
│ product_id │ product_code │ price │
│   int32    │   varchar    │ float │
├────────────┼──────────────┼───────┤
│          1 │ PH.CHAR.A    │ 800.0 │
│          4 │ TK.FAC.A     │ 100.0 │
└────────────┴──────────────┴───────┘

## 4b. Pandas

In [11]:
# Pandas counterpart of LIKE '%A': keep rows whose code finishes with "A".
ends_with_a = df["product_code"].str.endswith("A")
df.loc[ends_with_a]

Unnamed: 0,product_id,product_code,price
0,1,PH.CHAR.A,800.0
3,4,TK.FAC.A,100.0


# Summary

<img src="assets/string_filtering_sqltopandas.png" width=1000 />
