In [1]:
import os
from pathlib import Path

from dotenv import load_dotenv

import polars as pl

from src.utils import polars_read_uc

In [None]:
# ---- Notebook configuration -------------------------------------------------
# Three-part name of the UC table to read. The values below are placeholders:
# replace them with your own catalog, schema and table.
CATALOG_NAME = "your_uc_catalog"
SCHEMA_NAME = "your_schema"
TABLE_NAME = "your_table"

# Path to the project's dotenv file, passed to `load_dotenv` further down.
DOTENV_PATH = "devops/.env"

In [3]:
# Load the variables declared in the dotenv file located at DOTENV_PATH.
load_dotenv(dotenv_path=Path(DOTENV_PATH))

# Fully qualified table name: <catalog>.<schema>.<table>
table_uc_name = ".".join((CATALOG_NAME, SCHEMA_NAME, TABLE_NAME))

1. Read table without column pruning or predicate pushdown

In [4]:
# Baseline read: only the table name is passed, with no `columns` or
# `partitions` restriction.
my_pl_df_all = polars_read_uc(table_uc_name)

In [None]:
# Sum `some_column_to_agg` per `some_groupby_column` group on the unrestricted
# read; the summed column is exposed under the name `some_alias`.
some_agg_from_all = my_pl_df_all.group_by("some_groupby_column").agg(
    pl.col("some_column_to_agg").sum().alias("some_alias")
)

2. Read table with column pruning

In [None]:
# Same table, but request only the two columns the aggregation below uses.
my_pl_df_column_pruned = polars_read_uc(
    table_uc_name,
    columns=["some_groupby_column", "some_column_to_agg"],
)

In [None]:
# Same aggregation as on the full read, applied to the column-pruned frame:
# sum `some_column_to_agg` per `some_groupby_column`, named `some_alias`.
some_agg_from_column_pruned = my_pl_df_column_pruned.group_by("some_groupby_column").agg(
    pl.col("some_column_to_agg").sum().alias("some_alias")
)

3. Read table with column pruning and predicate pushdown (assumes the table is partitioned)

In [None]:
# Request only the two needed columns and additionally pass a partition filter
# as a (column, operator, value) tuple.
my_pl_df_small = polars_read_uc(
    table_uc_name,
    partitions=[("some_partition_col", "=", "some_partition_val")],
    columns=["some_groupby_column", "some_column_to_agg"],
)

In [None]:
# Same aggregation again, on the column-pruned and partition-filtered frame:
# sum `some_column_to_agg` per `some_groupby_column`, named `some_alias`.
some_agg_from_small = my_pl_df_small.group_by("some_groupby_column").agg(
    pl.col("some_column_to_agg").sum().alias("some_alias")
)

Compare sizes:

In [9]:
def get_pl_size_in_gb(pl_df):
    """Return the estimated size of ``pl_df`` expressed in units of 1024**3.

    Args:
        pl_df: Any object exposing an ``estimated_size()`` method; assumed to
            be a Polars DataFrame, whose estimate is reported in bytes.

    Returns:
        The value of ``pl_df.estimated_size()`` divided by 1024**3, i.e. the
        size in binary gigabytes (GiB) when the estimate is in bytes.
    """
    bytes_per_gb = 1024 ** 3
    size_estimate = pl_df.estimated_size()
    return size_estimate / bytes_per_gb

# Report the estimated size of each of the three reads, one line per frame.
for label, frame in (
    ("Size all", my_pl_df_all),
    ("Size column pruning", my_pl_df_column_pruned),
    ("Size column pruning and predicate pushdown", my_pl_df_small),
):
    print(f"{label}: {get_pl_size_in_gb(frame)}")

Size all: 2.122442901134491
Size column pruning: 0.10247126780450344
Size column pruning and predicate pushdown: 0.04640738479793072
