## Polars

 - Polars is very fast and memory efficient
 - written in Rust, uses Apache Arrow
 - Can handle more data than fits in memory ! (Hybrid Streaming)
 - Lazy | eager execution, Multi-threaded
 - SIMD (Single Instruction / Multiple Data)
 - Query optimization, Powerful expression API (Rust | Python | NodeJS | ...)
 - https://www.pola.rs/
 - https://pypi.org/project/polars/ 
 - https://github.com/pola-rs/polars
 - https://pola-rs.github.io/polars-book/user-guide/ - I used this to make this notebook
 - https://towardsdatascience.com/pandas-vs-polars-a-syntax-and-speed-comparison-5aa54e27497e 
 - https://pola-rs.github.io/polars-book/user-guide/howcani/selecting_data/selecting_data_expressions.html 

In [33]:
# pip install polars
# pip install 'polars[all]'
# pip install 'polars[numpy,pandas,pyarrow]' # install a subset only

import os, sys
import polars as pl
import numpy as np

In [34]:
print("polars version",pl.__version__)
print("numpy  version",np.__version__)

polars version 1.6.0
numpy  version 2.1.1


In [35]:
# get online data into Polaris DataFrame
os.makedirs("./data", exist_ok=True)
local_fname = "data/iris.csv"
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

if os.path.exists(local_fname):
    df = pl.read_csv(local_fname)
else:
    df = pl.read_csv(csv_url)
    
col_names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width','Class']
df.columns = col_names

# write it into local CSV file

df.write_csv("data/iris.csv")
df.write_parquet("data/iris.parquet")

# df2 = pl.read_csv("data/iris.csv")
# df2 = pl.scan_csv("data/iris.csv")         # lazy
# df2 = pl.read_parquet("data/iris.parquet")
# df2 = pl.scan_parquet("data/iris.parquet") # lazy

df_orig = df.clone()

print("type(df) =",type(df))
print("--------------------------------")
print(f"rows: {len(df)}")
display(df.head(3))  # first 3 rows
display(df.tail(3))  # last 3 rows 
display(df[1:2])     # row #1 

type(df) = <class 'polars.dataframe.frame.DataFrame'>
--------------------------------
rows: 149


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
4.9,3.0,1.4,0.2,"""Iris-setosa"""
4.7,3.2,1.3,0.2,"""Iris-setosa"""
4.6,3.1,1.5,0.2,"""Iris-setosa"""


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
6.5,3.0,5.2,2.0,"""Iris-virginica"""
6.2,3.4,5.4,2.3,"""Iris-virginica"""
5.9,3.0,5.1,1.8,"""Iris-virginica"""


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
4.7,3.2,1.3,0.2,"""Iris-setosa"""


In [36]:
# select subset of columns:
# df[['Sepal_Length', 'Class']] 

df2 = df.select(pl.col(['Sepal_Length', 'Class']))
print(f"rows: {len(df2)}")
df2.head()

rows: 149


Sepal_Length,Class
f64,str
4.9,"""Iris-setosa"""
4.7,"""Iris-setosa"""
4.6,"""Iris-setosa"""
5.0,"""Iris-setosa"""
5.4,"""Iris-setosa"""


In [37]:
# df.filter() instead of df.query('Sepal_Length > 5')

df2 = df.filter(pl.col('Sepal_Length') > 5 ) 
# df2 = df.filter(    df['Sepal_Length'] > 5 ) # same effect
print(f"rows: {len(df2)}")
df2.head()

rows: 117


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
5.4,3.9,1.7,0.4,"""Iris-setosa"""
5.4,3.7,1.5,0.2,"""Iris-setosa"""
5.8,4.0,1.2,0.2,"""Iris-setosa"""
5.7,4.4,1.5,0.4,"""Iris-setosa"""
5.4,3.9,1.3,0.4,"""Iris-setosa"""


In [38]:
# use multiple filters at once
df2 = df.filter( (pl.col('Sepal_Length') > 5) & (pl.col('Sepal_Width') < 3) )
print(f"rows: {len(df2)}")
df2.head()

rows: 51


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
5.5,2.3,4.0,1.3,"""Iris-versicolor"""
6.5,2.8,4.6,1.5,"""Iris-versicolor"""
5.7,2.8,4.5,1.3,"""Iris-versicolor"""
6.6,2.9,4.6,1.3,"""Iris-versicolor"""
5.2,2.7,3.9,1.4,"""Iris-versicolor"""


In [39]:
# Adding new column:
# in Pandas: df_pd["new_col"] = df_pd["col"] * 10

# in Polars
df2 = df.with_columns([(pl.col("Sepal_Length") * 10).alias("mycol")])
print(f"rows: {len(df2)}")
df2.head()

# You can add multiple columns in one expression:
# df2 = df.with_columns([(pl.col("col1") * 10).alias("new_col1"), ...])

rows: 149


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class,mycol
f64,f64,f64,f64,str,f64
4.9,3.0,1.4,0.2,"""Iris-setosa""",49.0
4.7,3.2,1.3,0.2,"""Iris-setosa""",47.0
4.6,3.1,1.5,0.2,"""Iris-setosa""",46.0
5.0,3.6,1.4,0.2,"""Iris-setosa""",50.0
5.4,3.9,1.7,0.4,"""Iris-setosa""",54.0


In [40]:
# groupby & aggregate
# Pandas: df_pd.groupby('col1')['col2'].agg('mean')
# in polars use df.group_by()

# Polars
df2 = df.group_by('Class').agg([pl.col('Sepal_Length').mean()])  # As suggested in Polars docs
df3 = df.group_by('Class').agg([pl.mean('Sepal_Length')])        # Shorter
print(f"rows: {len(df2)}")
display(df2)
print("--------------------------------")
print(f"rows: {len(df3)}")
display(df3)

rows: 3


Class,Sepal_Length
str,f64
"""Iris-setosa""",5.004082
"""Iris-virginica""",6.588
"""Iris-versicolor""",5.936


--------------------------------
rows: 3


Class,Sepal_Length
str,f64
"""Iris-setosa""",5.004082
"""Iris-versicolor""",5.936
"""Iris-virginica""",6.588


In [41]:
# alternative way - read, filter, group_by, aggregate
q = (
    pl.scan_csv(local_fname)
    .filter(pl.col("Sepal_Length") > 5)
    .group_by("Class")
    .agg(pl.all().mean())
)

df = q.collect()
display(df)

Class,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
str,f64,f64,f64,f64
"""Iris-virginica""",6.622449,2.983673,5.573469,2.032653
"""Iris-versicolor""",5.997872,2.804255,4.317021,1.346809
"""Iris-setosa""",5.32381,3.72381,1.514286,0.280952


In [42]:
# Missing Data
# https://pola-rs.github.io/polars-book/user-guide/howcani/missing_data.html
#   Pandas:  NaN  ,         .fillna()     df['col2'].fillna(-999)
#   Polars:  None = null ,  .fill_null()
#            NaN            .fill_nan()
# 
# df.null_count() - counts number of nulls (None) values in each column
#                   and returns it as a 1-row DataFrame
#
# NaN values are NOT considered to be missing data
#            and NOT counted with the null_count() method
#
# NaN values are filled with fill_nan(), but not fill_null()
#
# Polars has is_nan() and fill_nan() methods 
#            which work similar to is_null() and fill_null() methods. 

dfn = pl.DataFrame( {"c1": [1.0, None], "c2": [np.nan, 2.0]})
display(dfn)
display(dfn.dtypes)
print("--------------------------------")
display(dfn.null_count())
print("--------------------------------")
dfn2 = dfn.fill_nan(None)
display(dfn2.dtypes)
print("--------------------------------")
display(dfn2.null_count())

c1,c2
f64,f64
1.0,
,2.0


[Float64, Float64]

--------------------------------


c1,c2
u32,u32
1,0


--------------------------------


[Float64, Float64]

--------------------------------


c1,c2
u32,u32
1,1


In [43]:
# mean() of data with None/null values - excludes None/null from calculation 
# mean() of data with np.NaN - result in NaN
# Solution - substitute NaN with None

dfn2 = dfn.with_columns([
    pl.col("c1").fill_nan(None).alias("c1_"),
    pl.col("c2").fill_nan(None).alias("c2_"),
])

display(dfn2)
print("--------------------------------")
print("mean() :")
display(dfn2.mean())
print("--------------------------------")
print("fill nulls with -999 :")
dfn3 = dfn.with_columns(pl.col('c1').fill_null(pl.lit(-999))) # As suggested in Polars docs
dfn3 = dfn.with_columns(pl.col('c1').fill_null(-999))         # Shorter
display(dfn3)
print("--------------------------------")
dfn4 = dfn2.fill_null(-999)
display(dfn4)

c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,
,2.0,,2.0


--------------------------------
mean() :


c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,2.0


--------------------------------
fill nulls with -999 :


c1,c2
f64,f64
1.0,
-999.0,2.0


--------------------------------


c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,-999.0
-999.0,2.0,-999.0,2.0


### Manipulating data in a Polars DataFrame

https://towardsdatascience.com/manipulating-values-in-polars-dataframes-1087d88dd436

In [46]:
matrix = [
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
    (10, 11, 12),
    (13, 14, 15),
    (16, 17, 18)
]
df = pl.DataFrame(matrix, schema=list('abc'), orient="row")  # "schema" used instead of "columns"
df

a,b,c
i64,i64,i64
1,2,3
4,5,6
7,8,9
10,11,12
13,14,15
16,17,18


In [55]:
# apply() and map()

# pl.col('a').apply(lambda x: x*2)     # use apply and lambda
# pl.col('a') * 2                      # use simple column expression

myseries = df["a"]
transformed_series = myseries.map_elements(lambda x: x ** 2 if x % 2 == 0 else x, return_dtype=int)
df2 = df.clone()
df2 = df2.with_columns(transformed_series.alias("a"))
df2                    

a,b,c
i64,i64,i64
1,2,3
16,5,6
7,8,9
100,11,12
13,14,15
256,17,18


# Below code needs to be changed (new polars uses different syntax) XXXXXXX

In [None]:
df2 = df.select(
    pl.col('a').apply(lambda x: x*2 if x>=5 else x)
)
print(type(df2))
print(df2)

print("------------------------------")
df2 = df.select(
    pl.col('a').map_elements(lambda x: x*2) # can not use "if" on a Series
)
print(type(df2))
print(df2)

AttributeError: 'Expr' object has no attribute 'apply'

In [None]:
# changing only column 'a' - returning all columns

q = (
    df
    .lazy()
    .select(
        [
            pl.col('a').apply(lambda x: x*2),
            pl.exclude('a')
        ]
    )
)

df2 = q.collect()
df2

In [None]:
# multiply column "a" by 2 
# store the result in new column "a2"

q = (
    df
    .lazy()
    .select(
        [
            pl.col('*'),
            pl.col('a').apply(lambda x: x*2).alias("a2"),
        ]
    )
)
q.collect()

In [None]:
# applying function to rows

def test(x):
    print(x) # print row as tuple
    return x # return row as tuple
  
df2 = df.apply(test) # apply() works row by row
print("---------------------------")
display(df2)

In [None]:
# perform an integer division of all the numbers in a row by 2 
# if the sum of the numers in this row sum(x) > 10:

df.apply(lambda x: tuple([i // 2 for i in x]) if sum(x) > 10 else x)

In [None]:
# in Python multiplying tuple causes repeating
aa = (1,2,3)
aa*2             # (1, 2, 3, 1, 2, 3)

In [None]:
# duplicate all columns 
# (x*2 converts a tuple of 3 numbers to 6 numbers):

df.apply(lambda x: x*2)

In [None]:
# calculate totals by column:

df.sum()

In [None]:
# add a row with totals at the bottom of the DataFrame

pl.concat([df, df.sum()])

In [None]:
# sum by row

df.sum(axis=1)

In [None]:
# add the sum by row as a new column:

df.select(
    [
        pl.col('*'),
        df.select(pl.col('*')).sum(axis=1).alias('sum')
    ]
)

### Polars: change a value in a dataframe if a condition is met in another column

https://stackoverflow.com/questions/75984983/polars-change-a-value-in-a-dataframe-if-a-condition-is-met-in-another-column 

This is similar to the "UPDATE" operation in SQL,
<br>or using "mask" in Pandas 

In [None]:
# change value in column "a" based on value in column "b"
# here we use the **kwargs input of with_columns 
# to allow the column named to be on the left of an equal sign
# as though it were a parameter to a function

df2 = (df 
    .with_columns(
        a = pl.when(pl.col('b') == 5)
                .then(555)
                .otherwise(pl.col('a')))
)

display(df2)

In [None]:
# You can also use alias()
# Note: we can check values in more than one column
# Note: use parenthesis around (pl.when/then/otherwise) to avoid unexpected results. 

df2 = (df 
    .with_columns(
        (pl.when( (pl.col('b') >= 5) & (pl.col('c') <= 9) )
                .then(555)
                .otherwise(pl.col('a'))
        ).alias('a')
                )
      )

display(df2)

### Cheatsheet for Pandas to Polars

https://www.rhosignal.com/posts/polars-pandas-cheatsheet/

In [None]:
df = pl.DataFrame({'grp': [1, 2, 1, 2, 1, 2],
                   'x': list(range(6, 0, -1)),
                   'y': list(range(4, 10)),
                   'z': [3, 4, 5, 6, 7, None],
                   "index" : list('abcdef')})
print(df.columns)
print(len(df))
display(df)

In [None]:
# get/set value in single cell 
print(df[1, 1]) # 5
df[1, 1] = 55
print(df[1, 1]) # 5
print(df)

In [None]:
# Row slicing by location
df[1:3]

In [None]:
# Column slicing by location (removed "grp" column)
df[:, 1:]

In [None]:
# Row indexing by label
df.filter(pl.col("index") == "c")

In [None]:
# Column indexing by label
print(df[:, "x"])
print(df.select("x"))

In [None]:
# Column indexing by labels
print( df[:, ['x', 'z']] )
print( df.select(['x', 'z']) )

In [None]:
# Column slicing by label
df[:, "x":"z"]

In [None]:
# Mixed indexing
df.filter(pl.col("index") == "c")[0, 1]

In [None]:
# Reduce multiple values (mean, sum, ...)
df['z'].mean()

In [None]:
df.select(pl.col("z").mean())

In [None]:
# Add new column
df.with_columns((pl.col("z") + 1).alias("z1"))

In [None]:
# Rename columns
df.rename({"x": "x_new"})

In [None]:
# Drop columns
df.drop(['x','y'])

In [None]:
# Sort rows by a column
df.sort("x")

In [None]:
# Drop rows with missing values
df.drop_nulls()

In [None]:
# Select unique rows
df.unique()

### concat, extend, or vstack

https://www.rhosignal.com/posts/polars-extend-vstack/    

- pl.concat([df1,df2]) copies all the data to a single new location
- df1.vstack(df2) does not copy data, it just links the new DataFrame to the existing two locations in memory
- df1.extend(df2) copies the data from df2 and appends it to the data for df1

In [None]:
pl.concat([df,df]) 

In [None]:
# you can not "extend()" a DataFrame with itself.
# so first we make a copy using the "clone()" command
df2 = df.clone()
df2.extend(df)

In [None]:
# counting number of nulls int he whole DataFrame

# Create a sample DataFrame
data = {
    "A": [1, 2, None, 4, 5],
    "B": [None, 2, 3, 4, None],
    "C": [1, 2, 3, None, 5],
}

df = pl.DataFrame(data)

# print number of nulls in each column:
dfn = df.null_count()
display(dfn)
print("-----------------")
# calculate total number of nulls in DataFrame
df_nulls = 0
for col in dfn.columns:
    df_nulls += dfn[col][0]
print(f"Total number of nulls: {df_nulls}")

In [None]:
# There is no transpose function built-in
# but you can write your own.
# You can first convert DataFrame into a dict of columns using df.to_dict()
# Or you can go through columns using df[col].to_list()
# Or you can jsut take values one by one from the DataFrame like that:

def T(df):
    t_data = {}
    for i in range(df.height):        
        row_vals = []
        for c in range(df.width):
            row_vals.append(df[i,c])
        t_data[str(i)] = row_vals
    return pl.DataFrame(t_data)

# Create a sample DataFrame
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
    "C": [7, 8, 9],
}

df = pl.DataFrame(data)

# Transpose the DataFrame
transposed_df = T(df)
print(transposed_df)

In [None]:
# get column values
df['A'].to_list()

In [None]:
# get column as a series
s = df['A']
display(s)
print("--------------------------")
print("convert Polars column/Series to numpy using s.to_numpy()")
num_arr = df['A'].to_numpy()
print(type(num_arr),num_arr)
print("convert Polars column/Series to list")
mylist = df['A'].to_list()
print(type(mylist),mylist)
print("--------------------------")
print("convert whole Polars DataFrame to Numpy using df.to_numpy()")
print(df.to_numpy())
print("--------------------------")
print("convert whole Polars DataFrame to list of lists")
list_of_lists = list([list(row) for row in df])
print(list_of_lists)
print("convert whole Polars DataFrame to list of tuples")
list_of_tuples = list([tuple(row) for row in df])
print(list_of_tuples)

In [None]:
# converting between Pandas and Polars
# needs "pip install pyarrow"

import polars as pl
import pandas as pd

pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
print(pd_df)
print("----------------------------")
df_pl = pl.from_pandas(pd_df)
print(df_pl)
print("----------------------------")
pd_df2 = df_pl.to_pandas()
print(pd_df2)

In [None]:
# joining dataframes in Polars is similar 
# to how things work with join/merge in Pandas
# "left_on" and "right_on" parameters can use different columns 
# "how" parameter can have values "left", "right", "inner", "outer"

d1 = {"id":         [1, 2, 3, 4], "name": ["Alice", "Bob", "Charlie", "David"]}
d2 = {"account_id": [1, 2, 3, 4], "age": [25, 30, 35, 28]}

df1 = pl.DataFrame(d1)
df2 = pl.DataFrame(d2)

joined_df = df1.join(df2, left_on="id", right_on="account_id")
print(joined_df)