In [1]:
import pandas as pd
import duckdb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Multi-statement SQL script: (re)create the demo `orders` table, insert five
# sample rows, and end with a SELECT that returns every row of the table.
# This cell only defines the script text; it is executed in the next cell.
table_creation_query = """
drop table if exists orders;

create table orders (order_id int, product varchar, amount float);

insert into orders values
  (1, 'tea', 800),
  (2, 'chocolate', 600),
  (3, 'coffee', 400),
  (4, 'tea', 100),
  (5, 'chocolate', 50);
  
SELECT * FROM orders"""

In [3]:
# Run the script in DuckDB and convert the result of its final SELECT
# (all rows of the `orders` table) into a pandas DataFrame.
df = duckdb.sql(table_creation_query).df()

# 1. Distinct values

## 1a. SQL

In [4]:
# DISTINCT keeps one row per unique `product` value.
# `FROM df` queries the pandas DataFrame `df` directly by its variable name.
sql_query = """ 
    SELECT 
        DISTINCT product
    FROM df
    """

# Last expression of the cell, so the query result is displayed as the output.
duckdb.sql(sql_query)

┌───────────┐
│  product  │
│  varchar  │
├───────────┤
│ tea       │
│ chocolate │
│ coffee    │
└───────────┘

## 1b. Pandas

In [5]:
# pandas counterpart of SELECT DISTINCT: the distinct values of the column,
# returned as a NumPy array.
df["product"].unique()

array(['tea', 'chocolate', 'coffee'], dtype=object)

# 2. Count of distinct values

## 2a. SQL

In [6]:
# COUNT(DISTINCT ...) returns how many different non-NULL `product` values exist.
sql_query = """ 
    SELECT 
        COUNT(DISTINCT product) AS product_count
    FROM df
    """

# Last expression of the cell, so the query result is displayed as the output.
duckdb.sql(sql_query)

┌───────────────┐
│ product_count │
│     int64     │
├───────────────┤
│             3 │
└───────────────┘

## 2b. Pandas

In [7]:
# pandas counterpart of COUNT(DISTINCT product): number of distinct values
# (missing values are ignored by default, as in SQL).
df["product"].nunique()

3

# 3. Count of total values

## 3a. SQL

In [8]:
# COUNT(order_id) counts the rows whose `order_id` is not NULL
# (COUNT(*) would count every row regardless of NULLs).
sql_query = """ 
    SELECT 
        COUNT(order_id) AS order_count
    FROM df
    """

# Last expression of the cell, so the query result is displayed as the output.
duckdb.sql(sql_query)

┌─────────────┐
│ order_count │
│    int64    │
├─────────────┤
│           5 │
└─────────────┘

## 3b. Pandas

In [9]:
# pandas counterpart of COUNT(order_id): Series.count() counts only the
# non-missing entries, exactly like SQL COUNT(column).
# (.size would also include missing values, i.e. it corresponds to COUNT(*);
# the two only agree when the column has no missing values.)
df["order_id"].count()

5

# Summary

<img src="assets/distinct_and_count_sqltopandas.png" width=1000 /> 