In [2]:
# Source: https://ibis-project.org/posts/pydata-performance/
# Constant carried over alongside the source link above; none of the cells
# shown in this notebook read DATA.
# NOTE(review): presumably a leftover dataset/directory name from the linked
# post — confirm whether it is still needed.
DATA = "pypi-data"

In [42]:
import psutil
import platform

# Report the host's CPU and memory so benchmark numbers further down can be
# put in context. Uses `platform` for the processor string and `psutil` for
# core counts and memory statistics.
cpu_info = platform.processor()
cpu_count = psutil.cpu_count(logical=False)
logical_cpu_count = psutil.cpu_count(logical=True)

print("\nCPU Information:")
print(f"Processor: {cpu_info}")
print(f"Physical Cores: {cpu_count}")
print(f"Logical Cores: {logical_cpu_count}")

memory_info = psutil.virtual_memory()

# psutil reports sizes in bytes; each value below is divided by 1024 three
# times, so all three figures are printed in the same gigabyte unit.
print("\nMemory Information:")
print(f"Total Memory: {memory_info.total / 1024 / 1024 / 1024} GB")
print(f"Available Memory: {memory_info.available / 1024 / 1024 / 1024} GB")
# Fixed: this line was labelled "bytes" although the value is scaled like the
# two lines above.
print(f"Used Memory: {memory_info.used / 1024 / 1024 / 1024} GB")
print(f"Memory Utilization: {memory_info.percent}%")


CPU Information:
Processor: aarch64
Physical Cores: 14
Logical Cores: 14

Memory Information:
Total Memory: 100.01587295532227 GB
Available Memory: 30.458988189697266 GB
Used Memory: 68.74537658691406 bytes
Memory Utilization: 69.5%


In [None]:
import pandas as pd

# Read both parquet tables into pandas DataFrames in a single unpacking step.
lineitem, supplier = map(
    pd.read_parquet,
    ("data/lineitem.parquet", "data/supplier.parquet"),
)



In [44]:
# Footprint reported by memory_usage() for each frame, converted from bytes
# to GiB; displayed as a (lineitem, supplier) tuple.
tuple(
    frame.memory_usage().sum() / 1024 / 1024 / 1024
    for frame in (lineitem, supplier)
)

(np.float64(7.1508947648108006), np.float64(0.0048430003225803375))

In [41]:
import pandas as pd

# Top-supplier query in pandas: find the supplier(s) with the highest
# discounted revenue for shipments in the half-open window
# [1996-01-01, 1996-04-01).

# Load the data from parquet files
lineitem = pd.read_parquet("data/lineitem.parquet")
supplier = pd.read_parquet("data/supplier.parquet")

# Convert 'l_shipdate' to datetime if it's not already
lineitem['l_shipdate'] = pd.to_datetime(lineitem['l_shipdate'])

# Step 1: Create revenue equivalent in Pandas
# Filter lineitem DataFrame for the specified date range
start_date = pd.to_datetime("1996-01-01")
end_date = pd.to_datetime("1996-04-01")

filtered_lineitem = lineitem[
    (lineitem['l_shipdate'] >= start_date) & (lineitem['l_shipdate'] < end_date)
]

# Discounted price per row, computed once for the whole filtered frame.
# The previous version did this inside a per-group lambda that looked the
# discount up again with `.loc[x.index, ...]` for every supplier, which is
# slow and depends on the index labels being unique.
line_revenue = (
    filtered_lineitem['l_extendedprice'] * (1 - filtered_lineitem['l_discount'])
)

# Group by 'l_suppkey' and calculate total revenue.
# Grouping by the key Series names the resulting index 'l_suppkey', so
# reset_index() yields the columns ['l_suppkey', 'total_revenue'].
revenue = (
    line_revenue
    .groupby(filtered_lineitem['l_suppkey'])
    .sum()
    .rename('total_revenue')
    .reset_index()
)

# Display the revenue DataFrame
print(revenue)

# Step 2: Find maximum total revenue
max_revenue = revenue['total_revenue'].max()

# Step 3: Merge supplier with revenue where total_revenue matches maximum revenue
# (ties are kept: every supplier whose revenue equals the maximum is returned)
result = (
    supplier.merge(revenue, left_on='s_suppkey', right_on='l_suppkey')
    .loc[lambda df: df['total_revenue'] == max_revenue]
    .loc[:, ['s_suppkey', 's_name', 's_address', 's_phone', 'total_revenue']]
    .sort_values(by='s_suppkey')
)

# Show the final result
print(result)

       l_suppkey total_revenue
0              1   868343.4422
1              2   673741.8693
2              3  1073019.5287
3              4   906208.2675
4              5   524485.6030
...          ...           ...
99995      99996   916657.6366
99996      99997  1090466.5479
99997      99998   892919.3656
99998      99999  1124121.2915
99999     100000  1260249.9725

[100000 rows x 2 columns]
       s_suppkey              s_name  \
69997      69998  Supplier#000069998   

                                      s_address          s_phone total_revenue  
69997  117W54YtKASNfzkZRS8P32856BMj0HKf895vdukw  16-386-278-9829  2194132.8166  


In [None]:
# Footprint of `df` as reported by memory_usage(), converted from bytes to GiB.
# NOTE(review): `df` is not assigned in any cell visible in this notebook —
# confirm which frame this was meant to measure before re-running.
df.memory_usage().sum() / 1024 / 1024 / 1024

np.float64(6.612390279769897e-07)

In [8]:
# List the files under data/ with their sizes scaled to megabyte-sized blocks.
# NOTE(review): --block-size is a GNU ls option; this likely fails with
# BSD/macOS ls — confirm the target environment.
!ls -l --block-size=M data

total 6813M
-rw-r--r-- 1 root root  121M Nov  5 23:49 customer.parquet
-rw-r--r-- 1 root root 2679M Nov  5 23:49 lineitem.parquet
-rw-r--r-- 1 root root    1M Nov  5 23:49 nation.parquet
-rw-r--r-- 1 root root  606M Nov  5 23:49 orders.parquet
-rw-r--r-- 1 root root   67M Nov  5 23:49 part.parquet
-rw-r--r-- 1 root root  431M Nov  5 23:49 partsupp.parquet
-rw-r--r-- 1 root root    1M Nov  5 23:49 region.parquet
-rw-r--r-- 1 root root    8M Nov  5 23:49 supplier.parquet
-rw-r--r-- 1 root root 2904M Nov  5 23:52 tmp_lineitem.parquet


In [None]:
# Top Supplier query
# Set up a PySpark-style session that is executed by DuckDB through SQLFrame.
#
# Order matters here: activate() has to run BEFORE anything is imported from
# pyspark, so that the pyspark.sql imports below resolve to SQLFrame's
# implementation. Importing pyspark first (as this cell previously did) only
# works when a previous run of the kernel had already activated SQLFrame.
from sqlframe import activate

activate(engine="duckdb")

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Create a SparkSession object
session = SparkSession.builder.master("local").getOrCreate()
# Echo the session so the cell output shows which backend is in use.
session


<sqlframe.duckdb.session.DuckDBSession at 0xffff7c2e6540>

In [None]:
%%time
# Top-supplier query expressed with the PySpark DataFrame API.
lineitem = session.read.parquet("data/lineitem.parquet")
supplier = session.read.parquet("data/supplier.parquet")

# Step 1: Create revenue equivalent in PySpark
# Rows shipped inside the half-open window [1996-01-01, 1996-04-01).
shipped_in_window = (col("l_shipdate") >= "1996-01-01") & (
    col("l_shipdate") < "1996-04-01"
)
# Price after discount for a single line item.
discounted_price = col("l_extendedprice") * (1 - col("l_discount"))

revenue = (
    lineitem.filter(shipped_in_window)
    .groupBy("l_suppkey")
    .agg(F.sum(discounted_price).alias("total_revenue"))
)
revenue.show()

# Step 2: pull the single highest per-supplier revenue back as a scalar.
max_revenue = revenue.agg(F.max("total_revenue")).first()[0]

# Step 3: keep only the supplier(s) whose revenue equals that maximum.
supplier_revenue = supplier.join(revenue, supplier.s_suppkey == revenue.l_suppkey)
top_suppliers = supplier_revenue.filter(revenue.total_revenue == max_revenue)
result = top_suppliers.select(
    supplier.s_suppkey,
    supplier.s_name,
    supplier.s_address,
    supplier.s_phone,
    revenue.total_revenue,
).orderBy(supplier.s_suppkey)

# Show the final result
result.show()

+-----------+---------------+
| l_suppkey | total_revenue |
+-----------+---------------+
|   68202   |  634940.4735  |
|   27439   |  556925.5293  |
|   55808   |  959961.5108  |
|   98477   |  934817.8029  |
|   10380   |  781137.0141  |
|   31287   |   463818.166  |
|   98036   |  896149.1473  |
|   81698   |  748562.8137  |
|   98815   |  1161245.9785 |
|   36030   |  498307.5766  |
|   95993   |  890734.4854  |
|   28151   |  847368.1963  |
|   13656   |  1181352.0785 |
|   33508   |   651779.105  |
|   41089   |   618858.75   |
|   79610   |  1277050.4168 |
|   49508   |  1070704.7353 |
|   90484   |  895834.1947  |
|    7119   |  911817.1375  |
|   38689   |  832264.8655  |
+-----------+---------------+
+-----------+--------------------+------------------------------------------+-----------------+---------------+
| s_suppkey |       s_name       |                s_address                 |     s_phone     | total_revenue |
+-----------+--------------------+----------------------