## Polars

 - Polars is very fast and memory efficient
 - written in Rust, uses Apache Arrow
 - Can handle more data than fits in memory ! (Hybrid Streaming)
 - Lazy | eager execution, Multi-threaded
 - SIMD (Single Instruction / Multiple Data)
 - Query optimization, Powerful expression API (Rust | Python | NodeJS | ...)
 - https://www.pola.rs/
 - https://pypi.org/project/polars/ 
 - https://github.com/pola-rs/polars
 - https://pola-rs.github.io/polars-book/user-guide/
 - https://towardsdatascience.com/pandas-vs-polars-a-syntax-and-speed-comparison-5aa54e27497e 
 - https://pola-rs.github.io/polars-book/user-guide/howcani/selecting_data/selecting_data_expressions.html 

In [1]:
# pip install polars
# pip install 'polars[all]'
# pip install 'polars[numpy,pandas,pyarrow]' # install a subset only

import os, sys
import polars as pl
import numpy as np

In [2]:
# get online data into Polaris DataFrame
os.makedirs("./data", exist_ok=True)
local_fname = "data/iris.csv"
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

if os.path.exists(local_fname):
    df = pl.read_csv(local_fname)
else:
    df = pl.read_csv(csv_url)
    
col_names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width','Class']
df.columns = col_names

# write it into local CSV file

df.write_csv("data/iris.csv")
df.write_parquet("data/iris.parquet")

# df2 = pl.read_csv("data/iris.csv")
# df2 = pl.scan_csv("data/iris.csv")         # lazy
# df2 = pl.read_parquet("data/iris.parquet")
# df2 = pl.scan_parquet("data/iris.parquet") # lazy

df_orig = df.clone()

print(type(df))
print("--------------------------------")
print(f"rows: {len(df)}")
display(df.head(3))  # first 3 rows
display(df.tail(3))  # last 3 rows 
display(df[1:2])     # row #1 

<class 'polars.internals.dataframe.frame.DataFrame'>
--------------------------------
rows: 149


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
4.9,3.0,1.4,0.2,"""Iris-setosa"""
4.7,3.2,1.3,0.2,"""Iris-setosa"""
4.6,3.1,1.5,0.2,"""Iris-setosa"""


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
6.5,3.0,5.2,2.0,"""Iris-virginica..."
6.2,3.4,5.4,2.3,"""Iris-virginica..."
5.9,3.0,5.1,1.8,"""Iris-virginica..."


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
4.7,3.2,1.3,0.2,"""Iris-setosa"""


In [3]:
# select subset of columns:
# df[['Sepal_Length', 'Class']] 

df2 = df.select(pl.col(['Sepal_Length', 'Class']))
print(f"rows: {len(df2)}")
df2.head()

rows: 149


Sepal_Length,Class
f64,str
4.9,"""Iris-setosa"""
4.7,"""Iris-setosa"""
4.6,"""Iris-setosa"""
5.0,"""Iris-setosa"""
5.4,"""Iris-setosa"""


In [4]:
# df.filter() instead of df.query('Sepal_Length > 5')

df2 = df.filter(pl.col('Sepal_Length') > 5 ) 
# df2 = df.filter(    df['Sepal_Length'] > 5 ) # same effect
print(f"rows: {len(df2)}")
df2.head()

rows: 117


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
5.4,3.9,1.7,0.4,"""Iris-setosa"""
5.4,3.7,1.5,0.2,"""Iris-setosa"""
5.8,4.0,1.2,0.2,"""Iris-setosa"""
5.7,4.4,1.5,0.4,"""Iris-setosa"""
5.4,3.9,1.3,0.4,"""Iris-setosa"""


In [5]:
# use multiple filters at once
df2 = df.filter( (pl.col('Sepal_Length') > 5) & (pl.col('Sepal_Width') < 3) )
print(f"rows: {len(df2)}")
df2.head()

rows: 51


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
f64,f64,f64,f64,str
5.5,2.3,4.0,1.3,"""Iris-versicolo..."
6.5,2.8,4.6,1.5,"""Iris-versicolo..."
5.7,2.8,4.5,1.3,"""Iris-versicolo..."
6.6,2.9,4.6,1.3,"""Iris-versicolo..."
5.2,2.7,3.9,1.4,"""Iris-versicolo..."


In [6]:
# Adding new column:
# in Pandas: df_pd["new_col"] = df_pd["col"] * 10

# in Polars
df2 = df.with_columns([(pl.col("Sepal_Length") * 10).alias("mycol")])
print(f"rows: {len(df2)}")
df2.head()

# You can add multiple columns in one expression:
# df2 = df.with_columns([(pl.col("col1") * 10).alias("new_col1"), ...])

rows: 149


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class,mycol
f64,f64,f64,f64,str,f64
4.9,3.0,1.4,0.2,"""Iris-setosa""",49.0
4.7,3.2,1.3,0.2,"""Iris-setosa""",47.0
4.6,3.1,1.5,0.2,"""Iris-setosa""",46.0
5.0,3.6,1.4,0.2,"""Iris-setosa""",50.0
5.4,3.9,1.7,0.4,"""Iris-setosa""",54.0


In [7]:
# groupby & aggregate
# Pandas: df_pd.groupby('col1')['col2'].agg('mean')

# Polars
df2 = df.groupby('Class').agg([pl.col('Sepal_Length').mean()])  # As suggested in Polars docs
df3 = df.groupby('Class').agg([pl.mean('Sepal_Length')])        # Shorter
print(f"rows: {len(df2)}")
display(df2)
print("--------------------------------")
print(f"rows: {len(df3)}")
display(df3)

rows: 3


Class,Sepal_Length
str,f64
"""Iris-virginica...",6.588
"""Iris-versicolo...",5.936
"""Iris-setosa""",5.004082


--------------------------------
rows: 3


Class,Sepal_Length
str,f64
"""Iris-setosa""",5.004082
"""Iris-virginica...",6.588
"""Iris-versicolo...",5.936


In [8]:
# alternative way - read, filter, groupby, aggregate
q = (
    pl.scan_csv(local_fname)
    .filter(pl.col("Sepal_Length") > 5)
    .groupby("Class")
    .agg(pl.all().mean())
)

df = q.collect()
display(df)

Class,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
str,f64,f64,f64,f64
"""Iris-setosa""",5.32381,3.72381,1.514286,0.280952
"""Iris-virginica...",6.622449,2.983673,5.573469,2.032653
"""Iris-versicolo...",5.997872,2.804255,4.317021,1.346809


In [9]:
# Missing Data
# https://pola-rs.github.io/polars-book/user-guide/howcani/missing_data.html
#   Pandas:  NaN  ,         .fillna()     df['col2'].fillna(-999)
#   Polars:  None = null ,  .fill_null()
#            NaN            .fill_nan()
# 
# df.null_count() - counts number of nulls (None) values in each column
#                   and returns it as a 1-row DataFrame
#
# NaN values are NOT considered to be missing data
#            and NOT counted with the null_count() method
#
# NaN values are filled with fill_nan(), but not fill_null()
#
# Polars has is_nan() and fill_nan() methods 
#            which work similar to is_null() and fill_null() methods. 

dfn = pl.DataFrame( {"c1": [1.0, None], "c2": [np.nan, 2.0]})
display(dfn)
display(dfn.dtypes)
print("--------------------------------")
display(dfn.null_count())
print("--------------------------------")
dfn2 = dfn.fill_nan(None)
display(dfn2.dtypes)
print("--------------------------------")
display(dfn2.null_count())

c1,c2
f64,f64
1.0,
,2.0


[Float64, Float64]

--------------------------------


c1,c2
u32,u32
1,0


--------------------------------


[Float64, Float64]

--------------------------------


c1,c2
u32,u32
1,1


In [10]:
# mean() of data with None/null values - excludes None/null from calculation 
# mean() of data with np.NaN - result in NaN
# Solution - substitute NaN with None

dfn2 = dfn.with_columns([
    pl.col("c1").fill_nan(None).alias("c1_"),
    pl.col("c2").fill_nan(None).alias("c2_"),
])

display(dfn2)
print("--------------------------------")
print("mean() :")
display(dfn2.mean())
print("--------------------------------")
print("fill nulls with -999 :")
dfn3 = dfn.with_columns(pl.col('c1').fill_null(pl.lit(-999))) # As suggested in Polars docs
dfn3 = dfn.with_columns(pl.col('c1').fill_null(-999))         # Shorter
display(dfn3)
print("--------------------------------")
dfn4 = dfn2.fill_null(-999)
display(dfn4)

c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,
,2.0,,2.0


--------------------------------
mean() :


c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,2.0


--------------------------------
fill nulls with -999 :


c1,c2
f64,f64
1.0,
-999.0,2.0


--------------------------------


c1,c2,c1_,c2_
f64,f64,f64,f64
1.0,,1.0,-999.0
-999.0,2.0,-999.0,2.0


### Manipulating data in a Polars DataFrame

https://towardsdatascience.com/manipulating-values-in-polars-dataframes-1087d88dd436

In [11]:
matrix = [
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
    (10, 11, 12),
    (13, 14, 15),
    (16, 17, 18)
]
df = pl.DataFrame(matrix, schema=list('abc'))  # "schema" used instead of "columns"
df

a,b,c
i64,i64,i64
1,2,3
4,5,6
7,8,9
10,11,12
13,14,15
16,17,18


In [12]:
# apply() and map()

# pl.col('a').apply(lambda x: x*2)     # use apply and lambda
# pl.col('a') * 2                      # use simple column expression

df2 = df.select(
    pl.col('a').apply(lambda x: x*2 if x>=5 else x)
)
print(type(df2))
print(df2)

print("------------------------------")
df2 = df.select(
    pl.col('a').map(lambda x: x*2) # can not use "if" on a Series
)
print(type(df2))
print(df2)

<class 'polars.internals.dataframe.frame.DataFrame'>
shape: (6, 1)
┌─────┐
│ a   │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 4   │
│ 14  │
│ 20  │
│ 26  │
│ 32  │
└─────┘
------------------------------
<class 'polars.internals.dataframe.frame.DataFrame'>
shape: (6, 1)
┌─────┐
│ a   │
│ --- │
│ i64 │
╞═════╡
│ 2   │
│ 8   │
│ 14  │
│ 20  │
│ 26  │
│ 32  │
└─────┘


In [13]:
# changing only column 'a' - returning all columns

q = (
    df
    .lazy()
    .select(
        [
            pl.col('a').apply(lambda x: x*2),
            pl.exclude('a')
        ]
    )
)

df2 = q.collect()
df2

a,b,c
i64,i64,i64
2,2,3
8,5,6
14,8,9
20,11,12
26,14,15
32,17,18


In [14]:
# multiply column "a" by 2 
# store the result in new column "a2"

q = (
    df
    .lazy()
    .select(
        [
            pl.col('*'),
            pl.col('a').apply(lambda x: x*2).alias("a2"),
        ]
    )
)
q.collect()

a,b,c,a2
i64,i64,i64,i64
1,2,3,2
4,5,6,8
7,8,9,14
10,11,12,20
13,14,15,26
16,17,18,32


In [15]:
# applying function to rows

def test(x):
    print(x) # print row as tuple
    return x # return row as tuple
  
df2 = df.apply(test) # apply() works row by row
print("---------------------------")
display(df2)

(1, 2, 3)
(4, 5, 6)
(7, 8, 9)
(10, 11, 12)
(13, 14, 15)
(16, 17, 18)
---------------------------


column_0,column_1,column_2
i64,i64,i64
1,2,3
4,5,6
7,8,9
10,11,12
13,14,15
16,17,18


In [16]:
# perform an integer division of all the numbers in a row by 2 
# if the sum of the numers in this row sum(x) > 10:

df.apply(lambda x: tuple([i // 2 for i in x]) if sum(x) > 10 else x)

column_0,column_1,column_2
i64,i64,i64
1,2,3
2,2,3
3,4,4
5,5,6
6,7,7
8,8,9


In [17]:
# in Python multiplying tuple causes repeating
aa = (1,2,3)
aa*2             # (1, 2, 3, 1, 2, 3)

(1, 2, 3, 1, 2, 3)

In [18]:
# duplicate all columns 
# (x*2 converts a tuple of 3 numbers to 6 numbers):

df.apply(lambda x: x*2)

column_0,column_1,column_2,column_3,column_4,column_5
i64,i64,i64,i64,i64,i64
1,2,3,1,2,3
4,5,6,4,5,6
7,8,9,7,8,9
10,11,12,10,11,12
13,14,15,13,14,15
16,17,18,16,17,18


In [19]:
# calculate totals by column:

df.sum()

a,b,c
i64,i64,i64
51,57,63


In [20]:
# add a row with totals at the bottom of the DataFrame

pl.concat([df, df.sum()])

a,b,c
i64,i64,i64
1,2,3
4,5,6
7,8,9
10,11,12
13,14,15
16,17,18
51,57,63


In [21]:
# sum by row

df.sum(axis=1)

a
i64
6
15
24
33
42
51


In [22]:
# add the sum by row as a new column:

df.select(
    [
        pl.col('*'),
        df.select(pl.col('*')).sum(axis=1).alias('sum')
    ]
)

a,b,c,sum
i64,i64,i64,i64
1,2,3,6
4,5,6,15
7,8,9,24
10,11,12,33
13,14,15,42
16,17,18,51


### Polars: change a value in a dataframe if a condition is met in another column

https://stackoverflow.com/questions/75984983/polars-change-a-value-in-a-dataframe-if-a-condition-is-met-in-another-column 

This is similar to the "UPDATE" operation in SQL,
<br>or using "mask" in Pandas 

In [23]:
# change value in column "a" based on value in column "b"
# here we use the **kwargs input of with_columns 
# to allow the column named to be on the left of an equal sign
# as though it were a parameter to a function

df2 = (df 
    .with_columns(
        a = pl.when(pl.col('b') == 5)
                .then(555)
                .otherwise(pl.col('a')))
)

display(df2)

a,b,c
i64,i64,i64
1,2,3
555,5,6
7,8,9
10,11,12
13,14,15
16,17,18


In [24]:
# You can also use alias()
# Note: we can check values in more than one column
# Note: use parenthesis around (pl.when/then/otherwise) to avoid unexpected results. 

df2 = (df 
    .with_columns(
        (pl.when( (pl.col('b') >= 5) & (pl.col('c') <= 9) )
                .then(555)
                .otherwise(pl.col('a'))
        ).alias('a')
                )
      )

display(df2)

a,b,c
i64,i64,i64
1,2,3
555,5,6
555,8,9
10,11,12
13,14,15
16,17,18


### Cheatsheet for Pandas to Polars

https://www.rhosignal.com/posts/polars-pandas-cheatsheet/

In [25]:
df = pl.DataFrame({'grp': [1, 2, 1, 2, 1, 2],
                   'x': list(range(6, 0, -1)),
                   'y': list(range(4, 10)),
                   'z': [3, 4, 5, 6, 7, None],
                   "index" : list('abcdef')})
print(df.columns)
print(len(df))
display(df)

['grp', 'x', 'y', 'z', 'index']
6


grp,x,y,z,index
i64,i64,i64,i64,str
1,6,4,3.0,"""a"""
2,5,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""
1,2,8,7.0,"""e"""
2,1,9,,"""f"""


In [26]:
# get/set value in single cell 
print(df[1, 1]) # 5
df[1, 1] = 55
print(df[1, 1]) # 5
print(df)

5
55
shape: (6, 5)
┌─────┬─────┬─────┬──────┬───────┐
│ grp ┆ x   ┆ y   ┆ z    ┆ index │
│ --- ┆ --- ┆ --- ┆ ---  ┆ ---   │
│ i64 ┆ i64 ┆ i64 ┆ i64  ┆ str   │
╞═════╪═════╪═════╪══════╪═══════╡
│ 1   ┆ 6   ┆ 4   ┆ 3    ┆ a     │
│ 2   ┆ 55  ┆ 5   ┆ 4    ┆ b     │
│ 1   ┆ 4   ┆ 6   ┆ 5    ┆ c     │
│ 2   ┆ 3   ┆ 7   ┆ 6    ┆ d     │
│ 1   ┆ 2   ┆ 8   ┆ 7    ┆ e     │
│ 2   ┆ 1   ┆ 9   ┆ null ┆ f     │
└─────┴─────┴─────┴──────┴───────┘


In [27]:
# Row slicing by location
df[1:3]

grp,x,y,z,index
i64,i64,i64,i64,str
2,55,5,4,"""b"""
1,4,6,5,"""c"""


In [28]:
# Column slicing by location (removed "grp" column)
df[:, 1:]

x,y,z,index
i64,i64,i64,str
6,4,3.0,"""a"""
55,5,4.0,"""b"""
4,6,5.0,"""c"""
3,7,6.0,"""d"""
2,8,7.0,"""e"""
1,9,,"""f"""


In [29]:
# Row indexing by label
df.filter(pl.col("index") == "c")

grp,x,y,z,index
i64,i64,i64,i64,str
1,4,6,5,"""c"""


In [30]:
# Column indexing by label
print(df[:, "x"])
print(df.select("x"))

shape: (6, 1)
┌─────┐
│ x   │
│ --- │
│ i64 │
╞═════╡
│ 6   │
│ 55  │
│ 4   │
│ 3   │
│ 2   │
│ 1   │
└─────┘
shape: (6, 1)
┌─────┐
│ x   │
│ --- │
│ i64 │
╞═════╡
│ 6   │
│ 55  │
│ 4   │
│ 3   │
│ 2   │
│ 1   │
└─────┘


In [31]:
# Column indexing by labels
print( df[:, ['x', 'z']] )
print( df.select(['x', 'z']) )

shape: (6, 2)
┌─────┬──────┐
│ x   ┆ z    │
│ --- ┆ ---  │
│ i64 ┆ i64  │
╞═════╪══════╡
│ 6   ┆ 3    │
│ 55  ┆ 4    │
│ 4   ┆ 5    │
│ 3   ┆ 6    │
│ 2   ┆ 7    │
│ 1   ┆ null │
└─────┴──────┘
shape: (6, 2)
┌─────┬──────┐
│ x   ┆ z    │
│ --- ┆ ---  │
│ i64 ┆ i64  │
╞═════╪══════╡
│ 6   ┆ 3    │
│ 55  ┆ 4    │
│ 4   ┆ 5    │
│ 3   ┆ 6    │
│ 2   ┆ 7    │
│ 1   ┆ null │
└─────┴──────┘


In [32]:
# Column slicing by label
df[:, "x":"z"]

x,y,z
i64,i64,i64
6,4,3.0
55,5,4.0
4,6,5.0
3,7,6.0
2,8,7.0
1,9,


In [33]:
# Mixed indexing
df.filter(pl.col("index") == "c")[0, 1]

4

In [34]:
# Reduce multiple values (mean, sum, ...)
df['z'].mean()

5.0

In [35]:
df.select(pl.col("z").mean())

z
f64
5.0


In [36]:
# Add new column
df.with_columns((pl.col("z") + 1).alias("z1"))

grp,x,y,z,index,z1
i64,i64,i64,i64,str,i64
1,6,4,3.0,"""a""",4.0
2,55,5,4.0,"""b""",5.0
1,4,6,5.0,"""c""",6.0
2,3,7,6.0,"""d""",7.0
1,2,8,7.0,"""e""",8.0
2,1,9,,"""f""",


In [37]:
# Rename columns
df.rename({"x": "x_new"})

grp,x_new,y,z,index
i64,i64,i64,i64,str
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""
1,2,8,7.0,"""e"""
2,1,9,,"""f"""


In [38]:
# Drop columns
df.drop(['x','y'])

grp,z,index
i64,i64,str
1,3.0,"""a"""
2,4.0,"""b"""
1,5.0,"""c"""
2,6.0,"""d"""
1,7.0,"""e"""
2,,"""f"""


In [39]:
# Sort rows by a column
df.sort("x")

grp,x,y,z,index
i64,i64,i64,i64,str
2,1,9,,"""f"""
1,2,8,7.0,"""e"""
2,3,7,6.0,"""d"""
1,4,6,5.0,"""c"""
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""


In [40]:
# Drop rows with missing values
df.drop_nulls()

grp,x,y,z,index
i64,i64,i64,i64,str
1,6,4,3,"""a"""
2,55,5,4,"""b"""
1,4,6,5,"""c"""
2,3,7,6,"""d"""
1,2,8,7,"""e"""


In [41]:
# Select unique rows
df.unique()

grp,x,y,z,index
i64,i64,i64,i64,str
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""
1,2,8,7.0,"""e"""
2,1,9,,"""f"""


### concat, extend, or vstack

https://www.rhosignal.com/posts/polars-extend-vstack/    

- pl.concat([df1,df2]) copies all the data to a single new location
- df1.vstack(df2) does not copy data, it just links the new DataFrame to the existing two locations in memory
- df1.extend(df2) copies the data from df2 and appends it to the data for df1

In [42]:
pl.concat([df,df]) 

grp,x,y,z,index
i64,i64,i64,i64,str
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""
1,2,8,7.0,"""e"""
2,1,9,,"""f"""
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""


In [43]:
# you can not "extend()" a DataFrame with itself.
# so first we make a copy using the "clone()" command
df2 = df.clone()
df2.extend(df)

grp,x,y,z,index
i64,i64,i64,i64,str
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""
1,2,8,7.0,"""e"""
2,1,9,,"""f"""
1,6,4,3.0,"""a"""
2,55,5,4.0,"""b"""
1,4,6,5.0,"""c"""
2,3,7,6.0,"""d"""


In [44]:
# counting number of nulls int he whole DataFrame

# Create a sample DataFrame
data = {
    "A": [1, 2, None, 4, 5],
    "B": [None, 2, 3, 4, None],
    "C": [1, 2, 3, None, 5],
}

df = pl.DataFrame(data)

# print number of nulls in each column:
dfn = df.null_count()
display(dfn)
print("-----------------")
# calculate total number of nulls in DataFrame
df_nulls = 0
for col in dfn.columns:
    df_nulls += dfn[col][0]
print(f"Total number of nulls: {df_nulls}")

A,B,C
u32,u32,u32
1,2,1


-----------------
Total number of nulls: 4


In [45]:
# There is no transpose function built-in
# but you can write your own.
# You can first convert DataFrame into a dict of columns using df.to_dict()
# Or you can go through columns using df[col].to_list()
# Or you can jsut take values one by one from the DataFrame like that:

def T(df):
    t_data = {}
    for i in range(df.height):        
        row_vals = []
        for c in range(df.width):
            row_vals.append(df[i,c])
        t_data[str(i)] = row_vals
    return pl.DataFrame(t_data)

# Create a sample DataFrame
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
    "C": [7, 8, 9],
}

df = pl.DataFrame(data)

# Transpose the DataFrame
transposed_df = T(df)
print(transposed_df)

shape: (3, 3)
┌─────┬─────┬─────┐
│ 0   ┆ 1   ┆ 2   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 2   ┆ 3   │
│ 4   ┆ 5   ┆ 6   │
│ 7   ┆ 8   ┆ 9   │
└─────┴─────┴─────┘


In [46]:
# get column values
df['A'].to_list()

[1, 2, 3]

In [47]:
# get column as a series
s = df['A']
display(s)
print("--------------------------")
print("convert Polars column/Series to numpy using s.to_numpy()")
num_arr = df['A'].to_numpy()
print(type(num_arr),num_arr)
print("convert Polars column/Series to list")
mylist = df['A'].to_list()
print(type(mylist),mylist)
print("--------------------------")
print("convert whole Polars DataFrame to Numpy using df.to_numpy()")
print(df.to_numpy())
print("--------------------------")
print("convert whole Polars DataFrame to list of lists")
list_of_lists = list([list(row) for row in df])
print(list_of_lists)
print("convert whole Polars DataFrame to list of tuples")
list_of_tuples = list([tuple(row) for row in df])
print(list_of_tuples)

A
i64
1
2
3


--------------------------
convert Polars column/Series to numpy using s.to_numpy()
<class 'numpy.ndarray'> [1 2 3]
convert Polars column/Series to list
<class 'list'> [1, 2, 3]
--------------------------
convert whole Polars DataFrame to Numpy using df.to_numpy()
[[1 4 7]
 [2 5 8]
 [3 6 9]]
--------------------------
convert whole Polars DataFrame to list of lists
[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
convert whole Polars DataFrame to list of tuples
[(1, 2, 3), (4, 5, 6), (7, 8, 9)]


In [48]:
# converting between Pandas and Polars
# needs "pip install pyarrow"

import polars as pl
import pandas as pd

pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
print(pd_df)
print("----------------------------")
df_pl = pl.from_pandas(pd_df)
print(df_pl)
print("----------------------------")
pd_df2 = df_pl.to_pandas()
print(pd_df2)

   a  b  c
0  1  2  3
1  4  5  6
----------------------------
shape: (2, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 2   ┆ 3   │
│ 4   ┆ 5   ┆ 6   │
└─────┴─────┴─────┘
----------------------------
   a  b  c
0  1  2  3
1  4  5  6


In [49]:
# joining dataframes in Polars is similar 
# to how things work with join/merge in Pandas
# "left_on" and "right_on" parameters can use different columns 
# "how" parameter can have values "left", "right", "inner", "outer"

d1 = {"id":         [1, 2, 3, 4], "name": ["Alice", "Bob", "Charlie", "David"]}
d2 = {"account_id": [1, 2, 3, 4], "age": [25, 30, 35, 28]}

df1 = pl.DataFrame(d1)
df2 = pl.DataFrame(d2)

joined_df = df1.join(df2, left_on="id", right_on="account_id")
print(joined_df)

shape: (4, 3)
┌─────┬─────────┬─────┐
│ id  ┆ name    ┆ age │
│ --- ┆ ---     ┆ --- │
│ i64 ┆ str     ┆ i64 │
╞═════╪═════════╪═════╡
│ 1   ┆ Alice   ┆ 25  │
│ 2   ┆ Bob     ┆ 30  │
│ 3   ┆ Charlie ┆ 35  │
│ 4   ┆ David   ┆ 28  │
└─────┴─────────┴─────┘
