In [2]:
import pandas as pd 
import numpy as np
import polars as pl
import time
from datetime import datetime

import plotly.express as px


In [3]:
# Time how long pandas takes to load the parquet file.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can jump if the
# system clock is adjusted while the cell runs.
start = time.perf_counter()
df_pandas = pd.read_parquet('data/data.parquet.gzip')
end = time.perf_counter()
pd_read = round(end - start, 3)  # elapsed seconds, rounded to milliseconds
print(f"time using pandas: {pd_read}")

time using pandas: 0.626


In [4]:
# Summarise the pandas frame: column dtypes, non-null counts and memory usage.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice      100000 non-null  int64         
 1   date         100000 non-null  datetime64[ns]
 2   customer_id  100000 non-null  object        
 3   stockcode    100000 non-null  object        
 4   price        100000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 3.8+ MB


In [5]:
# Time how long polars takes to load the same parquet file.
# perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring short durations.
start = time.perf_counter()
df = pl.read_parquet('data/data.parquet.gzip')
end = time.perf_counter()
# Stored under its own name so the pandas timing kept in `pd_read` is not
# overwritten and both measurements remain available for comparison.
pl_read = round(end - start, 3)  # elapsed seconds, rounded to milliseconds
print(f"time using polars: {pl_read}")

time using polars: 0.064


In [6]:
# Display the polars DataFrame that was just loaded (rich notebook output).
df

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
3,2019-07-05 00:00:00,"""d18185""","""z5""",18.4
4,2019-02-26 00:00:00,"""c18331""","""z2""",5.5
5,2019-02-10 00:00:00,"""b16309""","""y7""",18.4
6,2020-09-15 00:00:00,"""a19555""","""x4""",50.45
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
8,2020-12-14 00:00:00,"""d20934""","""y2""",50.45
9,2020-10-05 00:00:00,"""a22528""","""y6""",12.99
10,2019-12-04 00:00:00,"""b21872""","""z6""",12.99


In [7]:
# Keep only the rows whose price is at most 5.5, then print the result as a
# plain-text table (shape header included).
filter_df = df.filter(pl.col("price") <= 5.5)
print(filter_df)

shape: (19824, 5)
┌─────────┬─────────────────────┬─────────────┬───────────┬───────┐
│ invoice ┆ date                ┆ customer_id ┆ stockcode ┆ price │
│ ---     ┆ ---                 ┆ ---         ┆ ---       ┆ ---   │
│ i64     ┆ datetime[ns]        ┆ str         ┆ str       ┆ f64   │
╞═════════╪═════════════════════╪═════════════╪═══════════╪═══════╡
│ 4       ┆ 2019-02-26 00:00:00 ┆ c18331      ┆ z2        ┆ 5.5   │
│ 7       ┆ 2020-03-21 00:00:00 ┆ e21922      ┆ z5        ┆ 5.5   │
│ 11      ┆ 2020-04-09 00:00:00 ┆ b21716      ┆ x3        ┆ 5.5   │
│ 16      ┆ 2020-06-11 00:00:00 ┆ c16529      ┆ z1        ┆ 5.5   │
│ …       ┆ …                   ┆ …           ┆ …         ┆ …     │
│ 99980   ┆ 2020-05-19 00:00:00 ┆ e16366      ┆ y6        ┆ 5.5   │
│ 99983   ┆ 2020-08-13 00:00:00 ┆ d16743      ┆ y3        ┆ 5.5   │
│ 99986   ┆ 2020-08-12 00:00:00 ┆ b15704      ┆ y3        ┆ 5.5   │
│ 99988   ┆ 2019-06-25 00:00:00 ┆ a21538      ┆ z2        ┆ 5.5   │
└─────────┴─────────────────────┴─────────────┴───────────┴───────┘

In [8]:
# Combine two predicates on `invoice` with `&` (each comparison must be
# parenthesised because `&` binds tighter than `<=`).
# NOTE(review): both predicates are upper bounds, so `<= 100` never removes a
# row that `<= 5` keeps. If an invoice range of 5..100 was intended, the first
# comparison should probably be `>=` — confirm the intent before changing it.
multi_filter_df = df.filter((pl.col("invoice") <= 5) & (pl.col("invoice") <= 100))
print(multi_filter_df)

shape: (5, 5)
┌─────────┬─────────────────────┬─────────────┬───────────┬───────┐
│ invoice ┆ date                ┆ customer_id ┆ stockcode ┆ price │
│ ---     ┆ ---                 ┆ ---         ┆ ---       ┆ ---   │
│ i64     ┆ datetime[ns]        ┆ str         ┆ str       ┆ f64   │
╞═════════╪═════════════════════╪═════════════╪═══════════╪═══════╡
│ 1       ┆ 2020-07-25 00:00:00 ┆ d18734      ┆ y7        ┆ 50.45 │
│ 2       ┆ 2020-01-17 00:00:00 ┆ c21086      ┆ x7        ┆ 25.3  │
│ 3       ┆ 2019-07-05 00:00:00 ┆ d18185      ┆ z5        ┆ 18.4  │
│ 4       ┆ 2019-02-26 00:00:00 ┆ c18331      ┆ z2        ┆ 5.5   │
│ 5       ┆ 2019-02-10 00:00:00 ┆ b16309      ┆ y7        ┆ 18.4  │
└─────────┴─────────────────────┴─────────────┴───────────┴───────┘


In [9]:
# Bounds of the date window, parsed from ISO-style strings into `date` objects.
# Both names are reused later by the lazy parquet query.
date_start = datetime.strptime('2020-01-17', "%Y-%m-%d").date()
date_end = datetime.strptime('2020-07-25', "%Y-%m-%d").date()

# Keep the rows whose `date` falls inside the window; the recorded output shows
# rows on both boundary dates, i.e. the bounds are inclusive.
filter_date = df.filter(
    pl.col("date").is_between(date_start, date_end)
)
# Second name bound to the very same frame (kept so existing references work).
pl_filtered = filter_date
filter_date

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
11,2020-04-09 00:00:00,"""b21716""","""x3""",5.5
13,2020-07-05 00:00:00,"""a19508""","""y5""",18.4
15,2020-02-11 00:00:00,"""b20365""","""z7""",25.3
16,2020-06-11 00:00:00,"""c16529""","""z1""",5.5
24,2020-04-05 00:00:00,"""c19798""","""y2""",50.45
28,2020-06-05 00:00:00,"""e18669""","""z4""",50.45
29,2020-04-18 00:00:00,"""c15976""","""z2""",25.3


In [10]:
# Histogram of stockcode values, passing the polars column directly to plotly.
fig = px.histogram(x=filter_date["stockcode"])
fig.show()

In [11]:
# Convert the filtered polars frame to pandas, printing the type before and
# after to make the conversion visible.
print(type(filter_date))
filter_date_pandas = filter_date.to_pandas()
print(type(filter_date_pandas))

<class 'polars.dataframe.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [12]:
# Same histogram as before, this time from the pandas frame with the column
# referenced by name.
fig = px.histogram(filter_date_pandas, x="stockcode")
fig.show()

In [13]:
# Preview the first rows of the date-filtered polars frame.
filter_date.head()

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
11,2020-04-09 00:00:00,"""b21716""","""x3""",5.5
13,2020-07-05 00:00:00,"""a19508""","""y5""",18.4


In [14]:
# Number of rows per day inside the date window, newest day first.
time_series = (
    filter_date
    .groupby("date")
    .agg(pl.count())
    .sort("date", descending=True)
)
time_series.head()

date,count
datetime[ns],u32
2020-07-25 00:00:00,131
2020-07-24 00:00:00,135
2020-07-23 00:00:00,122
2020-07-22 00:00:00,149
2020-07-21 00:00:00,145


In [15]:
# Line chart of the per-day row counts, fed straight from the polars columns.
fig = px.line(x=time_series["date"], y=time_series["count"], title='Time-series - polars')
fig.show()

In [16]:
# Two tiny frames sharing the key column "b" (values 3 and 4 in both),
# used to demonstrate a join.
data = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
data2 = pl.DataFrame({"c": ['h', 's'], "b": [3, 4]})

In [17]:
# Join the two frames on their shared column "b". No `how` is given, so the
# library's default join type applies; both key values match here, so both
# rows appear in the result.
df_join = data.join(data2, on="b")
print(df_join)

shape: (2, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1   ┆ 3   ┆ h   │
│ 2   ┆ 4   ┆ s   │
└─────┴─────┴─────┘


In [18]:
# Rebuild the two frames so that only b=3 is present on both sides
# (the right frame has 5 instead of 4), to demonstrate a left join.
data = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
data2 = pl.DataFrame({"c": ['h', 's'], "b": [3, 5]})

In [19]:
# Left join on "b": every row of `data` is kept. b=4 has no partner in
# `data2`, so its `c` value is null in the result.
df_left_join = data.join(data2, on="b", how="left")
print(df_left_join)

shape: (2, 3)
┌─────┬─────┬──────┐
│ a   ┆ b   ┆ c    │
│ --- ┆ --- ┆ ---  │
│ i64 ┆ i64 ┆ str  │
╞═════╪═════╪══════╡
│ 1   ┆ 3   ┆ h    │
│ 2   ┆ 4   ┆ null │
└─────┴─────┴──────┘


In [20]:
# Remote CSV of historical US legislators; it is fetched over the network
# each time this cell runs.
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"

# Columns that should be read as Categorical instead of plain strings.
dtypes = {
    column: pl.Categorical
    for column in ("first_name", "gender", "type", "state", "party")
}

# Read the CSV with the dtype overrides, then parse `birthday` into a Date.
# NOTE(review): strict=False presumably turns unparseable values into nulls
# rather than raising — confirm against the polars docs for the version in use.
dataset = (
    pl.read_csv(url, dtypes=dtypes)
    .with_columns(
        pl.col("birthday").str.strptime(pl.Date, strict=False)
    )
)

dataset.head()

last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,twitter_id,facebook,youtube,youtube_id,mastodon,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,cat,str,str,str,str,date,cat,cat,cat,i64,i64,cat,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,str
"""Bassett""","""Richard""",,,,,1745-04-02,"""M""","""sen""","""DE""",,2.0,"""Anti-Administr…",,,,,,,,,,,,"""B000226""",,,,,,401222,,,,507,"""Richard Basset…"
"""Bland""","""Theodorick""",,,,,1742-03-21,"""M""","""rep""","""VA""",9.0,,,,,,,,,,,,,,"""B000546""",,,,,,401521,,,,786,"""Theodorick Bla…"
"""Burke""","""Aedanus""",,,,,1743-06-16,"""M""","""rep""","""SC""",2.0,,,,,,,,,,,,,,"""B001086""",,,,,,402032,,,,1260,"""Aedanus Burke"""
"""Carroll""","""Daniel""",,,,,1730-07-22,"""M""","""rep""","""MD""",6.0,,,,,,,,,,,,,,"""C000187""",,,,,,402334,,,,1538,"""Daniel Carroll…"
"""Clymer""","""George""",,,,,1739-03-16,"""M""","""rep""","""PA""",-1.0,,,,,,,,,,,,,,"""C000538""",,,,,,402671,,,,1859,"""George Clymer"""


In [21]:
# Build the same date-window filter as a lazy query on the parquet file.
# `date_start` / `date_end` come from the earlier date-filter cell; the query
# is only materialised later via `.collect()`.
lazy_filter_df = (
    pl.scan_parquet("data/data.parquet.gzip")
    .filter(
        pl.col("date").is_between(date_start,date_end),
    )
)
# Print the optimized query plan (scan, projection and the pushed-down
# date selection) without running the query.
print(lazy_filter_df.describe_optimized_plan())


  PARQUET SCAN data/data.parquet.gzip
  PROJECT */5 COLUMNS
  SELECTION: [([(col("date")) >= (2020-01-17 00:00:00)]) & ([(col("date")) <= (2020-07-25 00:00:00)])]


In [22]:
# `.head()` on the lazy query without `.collect()`: no table output was
# recorded for this cell; the rows are materialised in the following cell.
lazy_filter_df.head()

In [23]:
# Execute the lazy query and display the resulting DataFrame.
lazy_filter_df.collect()

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""y7""",50.45
2,2020-01-17 00:00:00,"""c21086""","""x7""",25.3
7,2020-03-21 00:00:00,"""e21922""","""z5""",5.5
11,2020-04-09 00:00:00,"""b21716""","""x3""",5.5
13,2020-07-05 00:00:00,"""a19508""","""y5""",18.4
15,2020-02-11 00:00:00,"""b20365""","""z7""",25.3
16,2020-06-11 00:00:00,"""c16529""","""z1""",5.5
24,2020-04-05 00:00:00,"""c19798""","""y2""",50.45
28,2020-06-05 00:00:00,"""e18669""","""z4""",50.45
29,2020-04-18 00:00:00,"""c15976""","""z2""",25.3


In [32]:
# Lazy pipeline over the parquet file: upper-case the stockcode column, keep
# only rows with price above 50, then collect the result into a DataFrame.
q4 = (
    pl.scan_parquet("data/data.parquet.gzip")
    .with_columns(pl.col("stockcode").str.to_uppercase())
    .filter(pl.col("price") > 50)
    .collect()
)

# Preview the first rows of the collected result.
q4.head()

invoice,date,customer_id,stockcode,price
i64,datetime[ns],str,str,f64
1,2020-07-25 00:00:00,"""d18734""","""Y7""",50.45
6,2020-09-15 00:00:00,"""a19555""","""X4""",50.45
8,2020-12-14 00:00:00,"""d20934""","""Y2""",50.45
14,2020-11-25 00:00:00,"""d17853""","""X1""",50.45
19,2020-10-13 00:00:00,"""c20050""","""X3""",50.45


In [33]:
h6 = q4.clone()