In [4]:
# Source: https://ibis-project.org/posts/pydata-performance/
# Directory (relative to the notebook's working directory) that holds the
# parquet dataset read by the pandas cell below.
DATA = "pypi-data"

In [5]:
import psutil
import platform

# Report the host's CPU and memory so benchmark numbers can be put in context.
GIB = 1024 ** 3  # bytes per GiB; psutil reports memory sizes in bytes

# platform.processor() returns an empty string when the name cannot be
# determined, so fall back to the machine architecture instead of printing
# a blank field.
cpu_info = platform.processor() or platform.machine()
cpu_count = psutil.cpu_count(logical=False)
logical_cpu_count = psutil.cpu_count(logical=True)

print("\nCPU Information:")
print(f"Processor: {cpu_info}")
print(f"Physical Cores: {cpu_count}")
print(f"Logical Cores: {logical_cpu_count}")

memory_info = psutil.virtual_memory()

print("\nMemory Information:")
print(f"Total Memory: {memory_info.total / GIB} GB")
print(f"Available Memory: {memory_info.available / GIB} GB")
# The value is scaled to GB like the two lines above; the previous "bytes"
# label was wrong.
print(f"Used Memory: {memory_info.used / GIB} GB")
print(f"Memory Utilization: {memory_info.percent}%")


CPU Information:
Processor: 
Physical Cores: 14
Logical Cores: 14

Memory Information:
Total Memory: 100.01587677001953 GB
Available Memory: 55.996585845947266 GB
Used Memory: 43.21443557739258 bytes
Memory Utilization: 44.0%


In [3]:
# Pandas
import pandas as pd

# Read the whole parquet file under DATA into an in-memory pandas DataFrame.
dataset_path = f"{DATA}/dataset-44ce2ca1.parquet"
df = pd.read_parquet(dataset_path)

In [7]:
# Total bytes reported by pandas for all columns plus the index, shown in GiB.
df.memory_usage().sum() / 1024 ** 3

np.float64(10.202267900109291)

In [8]:
# Stage 1: keep only files whose path ends in a native-language extension
# (assembly, C/C++, Rust, Fortran, Go) and that are not inside a test
# directory or a vendored site-packages tree.
# `na=False` treats a missing path as "no match" so a null value cannot
# propagate into the boolean mask.
# `df` is rebound at every stage so the previous, larger frame can be
# released as soon as the stage finishes.
df = df[
    (
        df.path.str.contains(
            r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$",
            na=False,
        )
        & ~df.path.str.contains(
            r"(?:^|/)test(?:|s|ing)|/site-packages/", na=False
        )
    )
]

# Stage 2: for every (upload month, raw extension) pair collect the distinct
# project names as a list.
# NOTE(review): the extraction regex is lowercase-only, so a path such as
# "x.F90" passes the filter above but yields a missing `ext`; groupby drops
# missing keys by default, so those rows are not counted. Confirm whether
# that is intended before changing it, since it alters the results.
# The former `.sort_index(level="month", ascending=False)` was dropped: the
# second groupby below re-sorts by its keys, so the sort had no effect on
# the final frame.
df = (
    df.groupby(
        [
            df.uploaded_on.dt.to_period("M").dt.to_timestamp().rename("month"),
            df.path.str.extract(r"\.([a-z0-9]+)$", 0, expand=False).rename("ext"),
        ]
    )
    .agg({"project_name": lambda s: list(set(s))})
    .rename(columns={"project_name": "projects"})
)

# Stage 3: map raw extensions onto language names, then count the distinct
# projects per (month, language).
df = (
    df.reset_index()
    .assign(
        ext=lambda t: t.ext.str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
        .str.replace("^f.*$", "Fortran", regex=True)
        .str.replace("rs", "Rust")
        .str.replace("go", "Go")
        .str.replace("asm", "Assembly")
        # Kept from the original; `ext` is never empty here because the
        # extraction regex requires at least one character.
        .replace("", None)
    )
    .groupby(["month", "ext"])
    # Union the per-extension project lists directly into one set.
    # `sum(s, [])` built a fresh list on every step, which is quadratic in
    # the number of lists being merged.
    .agg({"projects": lambda s: len(set().union(*s))})
)

In [None]:
# Total bytes reported by pandas for all columns plus the index, shown in GiB.
df.memory_usage().sum() / 1024 ** 3

In [73]:
# List the parquet files in ./data with sizes scaled to megabyte blocks.
# Note this is the "data" directory, not the DATA ("pypi-data") directory
# used by the pandas cells.
!ls -l --block-size=M data

total 6813M
-rw-r--r-- 1 root root  121M Nov  5 23:49 customer.parquet
-rw-r--r-- 1 root root 2679M Nov  5 23:49 lineitem.parquet
-rw-r--r-- 1 root root    1M Nov  5 23:49 nation.parquet
-rw-r--r-- 1 root root  606M Nov  5 23:49 orders.parquet
-rw-r--r-- 1 root root   67M Nov  5 23:49 part.parquet
-rw-r--r-- 1 root root  431M Nov  5 23:49 partsupp.parquet
-rw-r--r-- 1 root root    1M Nov  5 23:49 region.parquet
-rw-r--r-- 1 root root    8M Nov  5 23:49 supplier.parquet
-rw-r--r-- 1 root root 2904M Nov  5 23:52 tmp_lineitem.parquet


In [75]:
# Top Supplier query
%%time
from sqlframe.duckdb import DuckDBSession
from sqlframe.duckdb import functions as F

session = DuckDBSession()


UsageError: Line magic function `%%time` not found.


In [76]:
%%time
lineitem = session.read.parquet("data/lineitem.parquet")
supplier = session.read.parquet("data/supplier.parquet")

# Step 1: Create revenue equivalent in PySpark
revenue = (
    lineitem
    .filter((col("l_shipdate") >= '1996-01-01') & (col("l_shipdate") < '1996-04-01'))
    .groupBy("l_suppkey")
    .agg(F.sum(col("l_extendedprice") * (1 - col("l_discount"))).alias("total_revenue"))
)
revenue.show()
result = (
    supplier.join(revenue, supplier.s_suppkey == revenue.l_suppkey)
    .filter(revenue.total_revenue == max_revenue)
    .select(supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue.total_revenue)
    .orderBy(supplier.s_suppkey)
)

# Show the final result
result.show()

+-----------+---------------+
| l_suppkey | total_revenue |
+-----------+---------------+
|   19636   |   904636.854  |
|   31982   |  1280955.943  |
|   30062   |  532740.5822  |
|   99027   |  1088981.9935 |
|   59626   |  546398.3081  |
|   83742   |  955721.6044  |
|   74983   |  1080551.4908 |
|   71151   |  830950.5638  |
|   77668   |  1100900.0705 |
|   98833   |  1363390.9186 |
|    1378   |  585642.9369  |
|   18046   |  670246.1742  |
|   53357   |  781010.2984  |
|   24719   |  596890.9937  |
|    823    |  1137543.7678 |
|   39201   |  684858.0793  |
|   25482   |  871666.0449  |
|   63616   |  988547.5536  |
|   59045   |  582534.5339  |
|   37840   |  932711.3246  |
+-----------+---------------+
+-----------+--------------------+------------------------------------------+-----------------+---------------+
| s_suppkey |       s_name       |                s_address                 |     s_phone     | total_revenue |
+-----------+--------------------+----------------------