# A little bit of polars select and pl_series_hash
Note: I'm not using buckaroo here because, for simple static examples, buckaroo isn't the best fit.

In [1]:
import polars as pl
import polars.selectors as cs
from pl_series_hash import hash_xx

In [2]:
# Small three-column frame (integers, floats, strings) used by the examples below.
df = pl.DataFrame(
    {
        'a': [10, 20, 30],
        'b': [10.5, 20.5, 30.5],
        'c': ['foo', 'bar', 'baz'],
    }
)
df

a,b,c
i64,f64,str
10,10.5,"""foo"""
20,20.5,"""bar"""
30,30.5,"""baz"""


`pl.all()` selects every column, so the expression that follows is applied to each one.

`.hash()` is built into polars. It applies an element-wise hash function; notice that the output has the same shape as the input.

In [3]:
# Hash every column element-wise: one output row per input row.
df.select(
    pl.all().hash()
)

a,b,c
u64,u64,u64
204224335298753530,17900678026959284394,14073873166397345198
14537498149072497771,34515047721616441,6674390471925201470
14622530892095474156,835753504180906049,11934769846273094804


`pl_series_hash` is the namespace created by my polars plugin

`hash_xx()` returns a single value per column (notice the one-row output)

In [4]:
# Reduce each column to one hash through the plugin's `pl_series_hash` namespace.
df.select(
    pl.all().pl_series_hash.hash_xx()
)

a,b,c
u64,u64,u64
13038993034761730339,18022455248480596878,1505513022777147474


# Multiple expressions

In [5]:
# Two groups of expressions evaluated in a single select():
#   * hash_xx() over every column, output columns prefixed with 'pl_hash_'
#   * the mean of each numeric column (picked with polars.selectors), prefixed with "mean_"
# `cs` (polars.selectors) is imported in the import cell at the top of the
# notebook so that all dependencies are declared in one place.
df.select([
    pl.all().pl_series_hash.hash_xx().name.prefix('pl_hash_'),
    cs.numeric().mean().name.prefix("mean_"),
])

pl_hash_a,pl_hash_b,pl_hash_c,mean_a,mean_b
u64,u64,u64,f64,f64
13038993034761730339,18022455248480596878,1505513022777147474,20.0,20.5


# LazyFrames
Polars LazyFrames are pointers to a data source that queries can be executed against. Crucially, the entire dataframe isn't read into memory.


In [6]:
# Per-column row counts for the small in-memory frame. This one is printed
# because only the last expression of a cell is displayed automatically.
print(
    df.select([pl.all().len()])
)
# The same expression against a lazily scanned parquet file; the query only
# runs when .collect() is called.
(
    pl.scan_parquet("../citibike-trips-2016-04.parq")
    .select(pl.all().len())
    .collect()
)

shape: (1, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ u32 ┆ u32 ┆ u32 │
╞═════╪═════╪═════╡
│ 3   ┆ 3   ┆ 3   │
└─────┴─────┴─────┘


tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000


In [7]:
# you can also slice: take rows 0..4999 of the lazy scan, then count per column
(
    pl.scan_parquet("../citibike-trips-2016-04.parq")
    .slice(0, 5000)
    .select(pl.all().len())
    .collect()
)

tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000


# The bug in my polars plugin

In [1]:
# Show the on-disk size of the single-column parquet file.
# NOTE: PULocationID.parq is written by the sink_parquet() cell further down,
# so on a fresh checkout that cell has to run before this one.
!ls -alhstr ./PULocationID.parq

220680 -rw-r--r--@ 1 paddy  staff   108M Dec  8 11:13 ./PULocationID.parq


In [None]:
from buckaroo.read_utils import read_df, read
import polars as pl
from pl_series_hash import hash_xx
# Load the trip data through buckaroo's read_df helper and display it.
# NOTE(review): later cells call .sink_parquet() / .collect() on `ldf`, so this
# presumably returns a polars LazyFrame — confirm against buckaroo.read_utils.
ldf = read_df("../2024-01-05_tripdata.parq")
ldf

In [None]:
%timeit ldf.select([pl.col('PULocationID')]).sink_parquet("PULocationID.parq")

In [None]:
%timeit ldf.select([pl.col('PULocationID').pl_series_hash.hash_xx()]).collect()

In [None]:
%timeit ldf.select([pl.col('PULocationID').mean()]).collect()

In [None]:
%timeit ldf.select([pl.col('PULocationID').hash()]).collect()

In [None]:
# Time md5sum over the parquet file written by the sink_parquet() cell above.
# NOTE(review): presumably an outside-of-polars reference point for the hash timings — confirm.
!time md5sum PULocationID.parq

In [None]:
# PULocationID.parq is 108 MB
# 1.52 seconds
# TODO: do the calcs