# Pandas vs. Polars

In [1]:
# Pandas
import pandas as pd

# Polars
import polars as pl

## Create DataFrame

In [2]:
# Pandas

# Build the frame from a dict that maps each column name to its list of values
df_pd = pd.DataFrame(
    data={
        'A': [1, 2, 2],
        'B': [4, 5, 6],
        'C': ['blue', 'red', 'blue'],
    },
)

In [3]:
# Polars

# Build the frame from a dict that maps each column name to its list of values
df_pl = pl.DataFrame(
    data={
        'A': [1, 2, 2],
        'B': [4, 5, 6],
        'C': ['blue', 'red', 'blue'],
    },
)

## Read a CSV File

In [4]:
# Pandas

# df_pd = pd.read_csv('file.csv')

In [5]:
# Polars

# df_pl = pl.read_csv('file.csv')

## Head and Tail

In [6]:
# Pandas

df_pd.head(5)
# df_pd.tail(5)

Unnamed: 0,A,B,C
0,1,4,blue
1,2,5,red
2,2,6,blue


In [7]:
# Polars

df_pl.head(5)
# df_pl.tail(5)

A,B,C
i64,i64,str
1,4,"""blue"""
2,5,"""red"""
2,6,"""blue"""


## Get DataFrame Shape

In [8]:
# Pandas

df_pd.shape

(3, 3)

In [9]:
# Polars

df_pl.shape

(3, 3)

## Basic Info

In [10]:
# Pandas

df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      int64 
 1   B       3 non-null      int64 
 2   C       3 non-null      object
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes


In [11]:
# Pandas

df_pd.describe()

Unnamed: 0,A,B
count,3.0,3.0
mean,1.666667,5.0
std,0.57735,1.0
min,1.0,4.0
25%,1.5,4.5
50%,2.0,5.0
75%,2.0,5.5
max,2.0,6.0


In [12]:
# Polars

# Polars combines info & describe
df_pl.describe()

statistic,A,B,C
str,f64,f64,str
"""count""",3.0,3.0,"""3"""
"""null_count""",0.0,0.0,"""0"""
"""mean""",1.666667,5.0,
"""std""",0.57735,1.0,
"""min""",1.0,4.0,"""blue"""
"""25%""",2.0,5.0,
"""50%""",2.0,5.0,
"""75%""",2.0,6.0,
"""max""",2.0,6.0,"""red"""


## Select Columns

In [13]:
# Pandas

# Single column
df_pd['A']

0    1
1    2
2    2
Name: A, dtype: int64

In [14]:
# Pandas

# Multiple columns
df_pd[['A', 'B']]

Unnamed: 0,A,B
0,1,4
1,2,5
2,2,6


In [15]:
# Polars

# Single column
df_pl['A']

A
i64
1
2
2


In [16]:
# Polars

# Multiple columns
df_pl.select(['A', 'B'])

A,B
i64,i64
1,4
2,5
2,6


## Rename Columns

In [17]:
# Pandas

df_pd.rename(columns={'A': 'X', 'B': 'Y'})

Unnamed: 0,X,Y,C
0,1,4,blue
1,2,5,red
2,2,6,blue


In [18]:
# Polars

df_pl.rename(mapping={'A': 'X', 'B': 'Y'})

X,Y,C
i64,i64,str
1,4,"""blue"""
2,5,"""red"""
2,6,"""blue"""


## Add a New Column

In [19]:
# Pandas

df_pd['D'] = df_pd['A'] + df_pd['B']

df_pd

Unnamed: 0,A,B,C,D
0,1,4,blue,5
1,2,5,red,7
2,2,6,blue,8


In [20]:
# Polars

# Derive D as the row-wise sum of A and B; the keyword names the new column
df_pl = df_pl.with_columns(D=pl.col('A') + pl.col('B'))

df_pl

A,B,C,D
i64,i64,str,i64
1,4,"""blue""",5
2,5,"""red""",7
2,6,"""blue""",8


## Drop Columns

In [21]:
# Pandas

df_pd.drop(columns=['A', 'B'])

Unnamed: 0,C,D
0,blue,5
1,red,7
2,blue,8


In [22]:
# Polars

df_pl.drop(['A', 'B'])

C,D
str,i64
"""blue""",5
"""red""",7
"""blue""",8


## Sort Values

In [23]:
# Pandas

df_pd.sort_values(by=['A', 'B'], ascending=[True, False])

Unnamed: 0,A,B,C,D
0,1,4,blue,5
2,2,6,blue,8
1,2,5,red,7


In [24]:
# Polars

df_pl.sort(by=['A', 'B'], descending=[False, True])

A,B,C,D
i64,i64,str,i64
1,4,"""blue""",5
2,6,"""blue""",8
2,5,"""red""",7


## Filter Rows

In [25]:
# Pandas

df_pd[df_pd['A'] > 1]

Unnamed: 0,A,B,C,D
1,2,5,red,7
2,2,6,blue,8


In [26]:
# Pandas

df_pd[df_pd['C'] == 'red']

Unnamed: 0,A,B,C,D
1,2,5,red,7


In [27]:
# Pandas

df_pd[df_pd['C'].str.contains('red')]

Unnamed: 0,A,B,C,D
1,2,5,red,7


In [28]:
# Polars

df_pl.filter(
    pl.col('A') > 1
)

A,B,C,D
i64,i64,str,i64
2,5,"""red""",7
2,6,"""blue""",8


In [29]:
# Polars

df_pl.filter(
    pl.col('C') == 'red'
)

A,B,C,D
i64,i64,str,i64
2,5,"""red""",7


In [30]:
# Polars

df_pl.filter(
    pl.col('C').str.contains('red')
)

A,B,C,D
i64,i64,str,i64
2,5,"""red""",7


## Unique Values

In [31]:
# Pandas

df_pd['A'].unique()

array([1, 2])

In [32]:
# Polars

# Polars does not guarantee the order of unique values by default;
# maintain_order=True returns them in first-seen order, so the result is
# reproducible and lines up with the order pandas' unique() reports.
df_pl['A'].unique(maintain_order=True)

A
i64
1
2


## Missing Values

In [33]:
# Pandas

df_pd.isnull().sum() # Count nulls

A    0
B    0
C    0
D    0
dtype: int64

In [34]:
# Pandas

df_pd.fillna(0) # Fill nulls with zero

Unnamed: 0,A,B,C,D
0,1,4,blue,5
1,2,5,red,7
2,2,6,blue,8


In [35]:
# Polars

df_pl.null_count() # Count nulls

A,B,C,D
u32,u32,u32,u32
0,0,0,0


In [36]:
# Polars

df_pl.fill_null(0) # Fill nulls with zero

A,B,C,D
i64,i64,str,i64
1,4,"""blue""",5
2,5,"""red""",7
2,6,"""blue""",8


## Group By and Aggregate

In [37]:
# Pandas

df_pd.groupby('A').agg({'B': 'sum'})

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
2,11


In [38]:
# Polars

# group_by does not guarantee the order of the resulting groups by default;
# maintain_order=True keeps groups in first-seen order so the displayed rows
# are reproducible from run to run.
df_pl.group_by('A', maintain_order=True).agg(pl.col('B').sum())

A,B
i64,i64
1,4
2,11


## Apply Functions

In [39]:
# Pandas

df_pd['D'] = df_pd['A'].apply(lambda x: x * 2)

df_pd

Unnamed: 0,A,B,C,D
0,1,4,blue,2
1,2,5,red,4
2,2,6,blue,4


In [40]:
# Pandas

df_pd['D'] = df_pd['C'].apply(lambda x: x.upper())

df_pd

Unnamed: 0,A,B,C,D
0,1,4,blue,BLUE
1,2,5,red,RED
2,2,6,blue,BLUE


In [41]:
# Polars

df_pl = df_pl.with_columns(
    (pl.col('A') * 2).alias('D')
)

df_pl

A,B,C,D
i64,i64,str,i64
1,4,"""blue""",2
2,5,"""red""",4
2,6,"""blue""",4


In [42]:
# Polars

df_pl = df_pl.with_columns(
    (pl.col('C').str.to_uppercase()).alias('D')
)

df_pl

A,B,C,D
i64,i64,str,str
1,4,"""blue""","""BLUE"""
2,5,"""red""","""RED"""
2,6,"""blue""","""BLUE"""


## Combine DataFrames

In [43]:
# Pandas

df_a, df_b = df_pd, df_pd

pd.concat([df_a, df_b])

Unnamed: 0,A,B,C,D
0,1,4,blue,BLUE
1,2,5,red,RED
2,2,6,blue,BLUE
0,1,4,blue,BLUE
1,2,5,red,RED
2,2,6,blue,BLUE


In [44]:
# Polars

df_a, df_b = df_pl, df_pl

pl.concat([df_a, df_b])

A,B,C,D
i64,i64,str,str
1,4,"""blue""","""BLUE"""
2,5,"""red""","""RED"""
2,6,"""blue""","""BLUE"""
1,4,"""blue""","""BLUE"""
2,5,"""red""","""RED"""
2,6,"""blue""","""BLUE"""


## Merge/Join

In [45]:
# Pandas

# Self-join: both sides are the same frame, so the non-key columns that
# collide come back with _x / _y suffixes
df_a = df_b = df_pd

df_a.merge(df_b, on='A', how='inner')

Unnamed: 0,A,B_x,C_x,D_x,B_y,C_y,D_y
0,1,4,blue,BLUE,4,blue,BLUE
1,2,5,red,RED,5,red,RED
2,2,5,red,RED,6,blue,BLUE
3,2,6,blue,BLUE,5,red,RED
4,2,6,blue,BLUE,6,blue,BLUE


In [46]:
# Polars

df_a, df_b = df_pl, df_pl

df_a.join(df_b, on='A', how='inner')

A,B,C,D,B_right,C_right,D_right
i64,i64,str,str,i64,str,str
1,4,"""blue""","""BLUE""",4,"""blue""","""BLUE"""
2,5,"""red""","""RED""",5,"""red""","""RED"""
2,6,"""blue""","""BLUE""",5,"""red""","""RED"""
2,5,"""red""","""RED""",6,"""blue""","""BLUE"""
2,6,"""blue""","""BLUE""",6,"""blue""","""BLUE"""


## Pivot Table

In [47]:
# Pandas

# D holds strings at this point, and each (A, B) pair occurs once, so take the
# single value with 'first'. 'sum' only produced this table by concatenating
# strings, which is not a meaningful aggregation for text.
df_pd.pivot_table(index='A', columns='B', values='D', aggfunc='first')

B,4,5,6
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,BLUE,,
2,,RED,BLUE


In [48]:
# Polars

df_pl.pivot(index='A', on='B', values='D', aggregate_function='sum')

A,4,5,6
i64,str,str,str
1,,,
2,,,


## Export to CSV

In [49]:
# Pandas

# df_pd.to_csv('file.csv', index=False)

In [50]:
# Polars

# df_pl.write_csv('file.csv')

## Convert to NumPy or List

In [51]:
# Pandas

df_pd.to_numpy()

array([[1, 4, 'blue', 'BLUE'],
       [2, 5, 'red', 'RED'],
       [2, 6, 'blue', 'BLUE']], dtype=object)

In [52]:
# Pandas

df_pd['A'].tolist()

[1, 2, 2]

In [53]:
# Polars

df_pl.to_numpy()

array([[1, 4, 'blue', 'BLUE'],
       [2, 5, 'red', 'RED'],
       [2, 6, 'blue', 'BLUE']], dtype=object)

In [54]:
# Polars

df_pl['A'].to_list()

[1, 2, 2]

## Lazy Execution

In [55]:
# Pandas

# Pandas does not support lazy execution

In [56]:
# Polars

lazy_df = df_pl.lazy() # Enables lazy execution

In [57]:
# Polars

# Add a derived column to the lazy frame; this only extends the pending query,
# the work itself happens when .collect() is called
lazy_df = lazy_df.with_columns(
    (pl.col('A') * 100).alias('lazy')
)

# Displaying the lazy frame does not show the data rows
lazy_df

In [58]:
lazy_df.collect() # Executes the operations

A,B,C,D,lazy
i64,i64,str,str,i64
1,4,"""blue""","""BLUE""",100
2,5,"""red""","""RED""",200
2,6,"""blue""","""BLUE""",200
