In [30]:
import pandas as pd 
import numpy as np
import polars as pl
import time
from datetime import datetime

In [17]:
# Time how long pandas takes to load the parquet file.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and may jump if the system
# clock is adjusted.
start = time.perf_counter()
df_pandas = pd.read_parquet('data/data.parquet.gzip')
end = time.perf_counter()
# Elapsed seconds, rounded to milliseconds for display.
pd_read = round(end - start, 3)
print(f"time using pandas: {pd_read}")

time using pandas: 0.111


In [18]:
# Print a summary of the pandas frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice      100000 non-null  int64         
 1   date         100000 non-null  datetime64[ns]
 2   customer_id  100000 non-null  object        
 3   stockcode    100000 non-null  object        
 4   price        100000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 3.8+ MB


In [21]:
# Time how long polars takes to load the same parquet file.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock time and may jump if the system
# clock is adjusted.
start = time.perf_counter()
df = pl.read_parquet('data/data.parquet.gzip')
end = time.perf_counter()
# Store the polars timing under its own name so the pandas measurement
# (pd_read) is not overwritten and both remain available for comparison.
pl_read = round(end - start, 3)
print(f"time using polars: {pl_read}")

time using polars: 0.049


In [26]:
# Show the polars DataFrame using the notebook's rich repr (bare last
# expression of the cell).
df

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
3,2019-07-05 00:00:00,"""d18185""","""z5""",18.4
4,2019-02-26 00:00:00,"""c18331""","""z2""",5.5
5,2019-02-10 00:00:00,"""b16309""","""y7""",18.4
6,2020-09-15 00:00:00,"""a19555""","""x4""",50.45
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
8,2020-12-14 00:00:00,"""d20934""","""y2""",50.45
9,2020-10-05 00:00:00,"""a22528""","""y6""",12.99
10,2019-12-04 00:00:00,"""b21872""","""z6""",12.99


In [23]:
# Keep only the rows whose price is at most 5.5.
price_at_most_5_5 = pl.col("price") <= 5.5
filter_df = df.filter(price_at_most_5_5)
print(filter_df)

shape: (19824, 5)
┌─────────┬─────────────────────┬─────────────┬───────────┬───────┐
│ invoice ┆ date                ┆ customer_id ┆ stockcode ┆ price │
│ ---     ┆ ---                 ┆ ---         ┆ ---       ┆ ---   │
│ i64     ┆ datetime[ns]        ┆ str         ┆ str       ┆ f64   │
╞═════════╪═════════════════════╪═════════════╪═══════════╪═══════╡
│ 4       ┆ 2019-02-26 00:00:00 ┆ c18331      ┆ z2        ┆ 5.5   │
│ 7       ┆ 2020-03-21 00:00:00 ┆ e21922      ┆ z5        ┆ 5.5   │
│ 11      ┆ 2020-04-09 00:00:00 ┆ b21716      ┆ x3        ┆ 5.5   │
│ 16      ┆ 2020-06-11 00:00:00 ┆ c16529      ┆ z1        ┆ 5.5   │
│ …       ┆ …                   ┆ …           ┆ …         ┆ …     │
│ 99980   ┆ 2020-05-19 00:00:00 ┆ e16366      ┆ y6        ┆ 5.5   │
│ 99983   ┆ 2020-08-13 00:00:00 ┆ d16743      ┆ y3        ┆ 5.5   │
│ 99986   ┆ 2020-08-12 00:00:00 ┆ b15704      ┆ y3        ┆ 5.5   │
│ 99988   ┆ 2019-06-25 00:00:00 ┆ a21538      ┆ z2        ┆ 5.5   │
└─────────┴─────────────────────┴─────────────┴───────────┴───────┘

In [27]:
# Combine two predicates on the same column with `&`. Each comparison is
# parenthesised because `&` binds tighter than `<=` in Python.
# NOTE(review): both comparisons are upper bounds, so `invoice <= 100` is
# already implied by `invoice <= 5` and has no effect on the result. If a
# range was intended (e.g. `>= 5` and `<= 100`), the first operator needs
# changing — confirm the intent before editing.
invoice_col = pl.col("invoice")
multi_filter_df = df.filter((invoice_col <= 5) & (invoice_col <= 100))
print(multi_filter_df)

shape: (5, 5)
┌─────────┬─────────────────────┬─────────────┬───────────┬───────┐
│ invoice ┆ date                ┆ customer_id ┆ stockcode ┆ price │
│ ---     ┆ ---                 ┆ ---         ┆ ---       ┆ ---   │
│ i64     ┆ datetime[ns]        ┆ str         ┆ str       ┆ f64   │
╞═════════╪═════════════════════╪═════════════╪═══════════╪═══════╡
│ 1       ┆ 2020-07-25 00:00:00 ┆ d18734      ┆ y7        ┆ 50.45 │
│ 2       ┆ 2020-01-17 00:00:00 ┆ c21086      ┆ x7        ┆ 25.3  │
│ 3       ┆ 2019-07-05 00:00:00 ┆ d18185      ┆ z5        ┆ 18.4  │
│ 4       ┆ 2019-02-26 00:00:00 ┆ c18331      ┆ z2        ┆ 5.5   │
│ 5       ┆ 2019-02-10 00:00:00 ┆ b16309      ┆ y7        ┆ 18.4  │
└─────────┴─────────────────────┴─────────────┴───────────┴───────┘


In [32]:
# Parse the two calendar-day bounds of the date window.
date_start, date_end = (
    datetime.strptime(day_text, "%Y-%m-%d").date()
    for day_text in ('2020-01-17', '2020-07-25')
)

# Keep the rows whose `date` falls inside the window. The displayed result
# includes rows dated exactly on both bounds.
# `pl_filtered` and `filter_date` are two names bound to the same frame.
pl_filtered = df.filter(pl.col("date").is_between(date_start, date_end))
filter_date = pl_filtered
filter_date

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
11,2020-04-09 00:00:00,"""b21716""","""x3""",5.5
13,2020-07-05 00:00:00,"""a19508""","""y5""",18.4
15,2020-02-11 00:00:00,"""b20365""","""z7""",25.3
16,2020-06-11 00:00:00,"""c16529""","""z1""",5.5
24,2020-04-05 00:00:00,"""c19798""","""y2""",50.45
28,2020-06-05 00:00:00,"""e18669""","""z4""",50.45
29,2020-04-18 00:00:00,"""c15976""","""z2""",25.3


In [33]:
# Build the same date-window filter lazily: scan_parquet only records the
# query, so the optimized plan can be inspected.
# NOTE(review): newer polars releases deprecate describe_optimized_plan() in
# favour of LazyFrame.explain() — confirm against the installed version.
date_window = pl.col("date").is_between(date_start, date_end)
lazy_filter_df = pl.scan_parquet("data/data.parquet.gzip").filter(date_window)
optimized_plan = lazy_filter_df.describe_optimized_plan()
print(optimized_plan)


  PARQUET SCAN data/data.parquet.gzip
  PROJECT */5 COLUMNS
  SELECTION: [([(col("date")) >= (2020-01-17 00:00:00)]) & ([(col("date")) <= (2020-07-25 00:00:00)])]
