In [509]:
import polars as pl
import numpy as np
import pandas as pd

# Create Dataframe

In [510]:
df = pl.DataFrame({'teacher':['John', 'Lucy', 'John', 'Tom', 'Helen'],
                   'price': np.random.randint(20,45,5),
                   'lessons': np.random.randint(10,25,5)
                   })
df.head(5)

teacher,price,lessons
str,i32,i32
"""John""",37,19
"""Lucy""",27,19
"""John""",20,16
"""Tom""",23,23
"""Helen""",22,17


In [511]:
df_pd = pd.DataFrame({'teacher':['John', 'Lucy', 'John', 'Tom', 'Helen'],
                   'price': np.random.randint(20,45,5),
                   'lessons': np.random.randint(10,25,5)
                   })
df_pd.head(5)

Unnamed: 0,teacher,price,lessons
0,John,40,18
1,Lucy,38,20
2,John,32,24
3,Tom,41,19
4,Helen,24,18


# Convert Pandas->Polars and Polars->Pandas

In [512]:
polars_df = pl.from_pandas(df_pd)
polars_df

teacher,price,lessons
str,i32,i32
"""John""",40,18
"""Lucy""",38,20
"""John""",32,24
"""Tom""",41,19
"""Helen""",24,18


In [513]:
pandas_df = df.to_pandas()
pandas_df

Unnamed: 0,teacher,price,lessons
0,John,37,19
1,Lucy,27,19
2,John,20,16
3,Tom,23,23
4,Helen,22,17


# Filter

In [514]:
df_filtered = df.filter(
                        (pl.col('teacher')=='John') & (pl.col('price')>10)
                        )

df_filtered

teacher,price,lessons
str,i32,i32
"""John""",37,19
"""John""",20,16


In [515]:
df_pd_filtered = df_pd[(df_pd['teacher']=='John') & (df_pd['price']>10)]

df_pd_filtered

Unnamed: 0,teacher,price,lessons
0,John,40,18
2,John,32,24


# GREATER or NOT GREATER (~)

In [516]:
import polars as pl

# Sample DataFrame
df = pl.DataFrame({"A": [1, -2, 0], 
                   "B": [4, 5, -6], 
                   "C": [-7, 8, 9]
                   })

In [517]:
# Filter rows where all values are not greater than 0 (<= 0)
filtered_df = df.filter(~pl.col('A').ge(0))  # Logical negation with `~`
print(filtered_df)

shape: (1, 3)
┌─────┬─────┬─────┐
│ A   ┆ B   ┆ C   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ -2  ┆ 5   ┆ 8   │
└─────┴─────┴─────┘


In [518]:
# Filter rows where all values are not greater than 0 (<= 0)
filtered_df = df.filter(~pl.col('A').gt(0))  # Logical negation with `~`
print(filtered_df)

shape: (2, 3)
┌─────┬─────┬─────┐
│ A   ┆ B   ┆ C   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ -2  ┆ 5   ┆ 8   │
│ 0   ┆ -6  ┆ 9   │
└─────┴─────┴─────┘


# Operations with columns

**Divide-Polars-way1**

In [519]:
import polars as pl

# Sample DataFrame
df = pl.DataFrame({"price": [10, 20, 35], 
                   "lessons": [4, 5, 6]
                   })

In [520]:
df = df.with_columns(
                    ((pl.col('price') / pl.col('lessons'))*100).alias('profit')
                    )

df.head(5)

price,lessons,profit
i64,i64,f64
10,4,250.0
20,5,400.0
35,6,583.333333


**Divide-Polars-way2**

In [521]:
df = df.with_columns(
                    ((df['price'] / df['lessons'])*100).alias('profit')
                    )

df.head(5)

price,lessons,profit
i64,i64,f64
10,4,250.0
20,5,400.0
35,6,583.333333


**SQRT**

In [522]:
df = df.with_columns(
                        pl.col('profit').sqrt().alias('sqrt')
                    )

df.head(5)

price,lessons,profit,sqrt
i64,i64,f64,f64
10,4,250.0,15.811388
20,5,400.0,20.0
35,6,583.333333,24.152295


# Bins and value counts

In [523]:
import polars as pl

# Sample DataFrame
df = pl.DataFrame({"age": [85, 15 , 18, 33, 32, 44, 45, 46, 82, 65],
                   "freq": [3,1 , 2, 1, 3, 1, 2, 1, 1, 100]
                   })

**Without duplicates**

In [524]:
df_rfm = df.with_columns(df['age'].qcut(quantiles=5, 
                                        labels=["Low", "Medium Low", "Medium", "Medium High", "High"])
                                        .alias('categories-age'))

df_rfm = df_rfm.sort('age', descending=False)
df_rfm

age,freq,categories-age
i64,i64,cat
15,1,"""Low"""
18,2,"""Low"""
32,3,"""Medium Low"""
33,1,"""Medium Low"""
44,1,"""Medium"""
45,2,"""Medium"""
46,1,"""Medium High"""
65,100,"""Medium High"""
82,1,"""High"""
85,3,"""High"""


**With duplicates**

In [525]:
df_rfm = df.with_columns(df['freq'].qcut(quantiles=5, 
                                         labels=["Low", "Medium Low", "Medium", "Medium High", "High"], 
                                         allow_duplicates=True)
                                         .alias('categories-freq'))

df_rfm = df_rfm.sort('freq', descending=False)
df_rfm

age,freq,categories-freq
i64,i64,cat
15,1,"""Low"""
33,1,"""Low"""
44,1,"""Low"""
46,1,"""Low"""
82,1,"""Low"""
18,2,"""Medium"""
45,2,"""Medium"""
85,3,"""Medium High"""
32,3,"""Medium High"""
65,100,"""High"""


In [526]:
number = df_rfm.select(pl.col('categories-freq').value_counts(sort=True))
number.head(20)

categories-freq
struct[2]
"{""Low"",5}"
"{""Medium"",2}"
"{""Medium High"",2}"
"{""High"",1}"


# Conditions

**Condition with new column created**

In [527]:
df = pl.DataFrame({'teacher':['John', 'Lucy', 'John', 'Tom', 'Helen'],
                   'price': np.random.randint(20,45,5),
                   'lessons': np.random.randint(10,25,5)
                   })
df.head(5)

teacher,price,lessons
str,i32,i32
"""John""",42,14
"""Lucy""",22,18
"""John""",30,16
"""Tom""",37,22
"""Helen""",39,23


In [528]:
""" # Pandas version
df_pd['new column'] = np.where(df_pd['teacher']=='John', 1, 0)
df_pd.head(5)  """ 

df_con = df.with_columns(
                        pl.when(pl.col('teacher')=='John').then(1).otherwise(0).alias('new column')
                        )
df_con.head(5)

teacher,price,lessons,new column
str,i32,i32,i32
"""John""",42,14,1
"""Lucy""",22,18,0
"""John""",30,16,1
"""Tom""",37,22,0
"""Helen""",39,23,0


**Condition - with modifing an existing column - using lit**

In [529]:
df = df.with_columns(
                    pl.when(pl.col('teacher')=='John')
                    .then(pl.lit(0))
                    .otherwise('price')
                    .name.keep()
                    )                     
df

teacher,price,lessons
str,i32,i32
"""John""",0,14
"""Lucy""",22,18
"""John""",0,16
"""Tom""",37,22
"""Helen""",39,23


# NULL and NaN

In [530]:
import polars as pl

df = pl.DataFrame({
        "a": [1, 2, None, 1, None],
        "b": [1.0, float("nan"), 2.0, 1.0, 5.0],
    })

df


a,b
i64,f64
1.0,1.0
2.0,
,2.0
1.0,1.0
,5.0


In [531]:
print(
    df.select(pl.col('a').null_count())
    ) 

shape: (1, 1)
┌─────┐
│ a   │
│ --- │
│ u32 │
╞═════╡
│ 2   │
└─────┘


In [532]:
df =  df.with_columns(pl.col('a').fill_null(pl.col('b')))
df

a,b
f64,f64
1.0,1.0
2.0,
2.0,2.0
1.0,1.0
5.0,5.0


In [533]:
df = df.with_columns(pl.col('b').fill_nan('vole'))
df

a,b
f64,str
1.0,"""1.0"""
2.0,"""vole"""
2.0,"""2.0"""
1.0,"""1.0"""
5.0,"""5.0"""


# Date

**Period range**

In [534]:
from dateutil.relativedelta import relativedelta

start = pd.to_datetime('2024-01-01') - relativedelta(months=1)
end = start + relativedelta(months=5)

In [535]:
# start and end can be timestamp or datime object

df_new = pl.DataFrame(
                        pl.datetime_range(
                        start,
                        end,
                        interval='2mo',
                        eager=True,
                        )
                        .alias('date')
                    )
df_new

date
datetime[μs]
2023-12-01 00:00:00
2024-02-01 00:00:00
2024-04-01 00:00:00


In [536]:
type(start)

pandas._libs.tslibs.timestamps.Timestamp

In [537]:
date_pd = pd.DataFrame({'date': pd.period_range('2024-01-01', periods=12, freq='M')
                        })
date_pd

Unnamed: 0,date
0,2024-01
1,2024-02
2,2024-03
3,2024-04
4,2024-05
5,2024-06
6,2024-07
7,2024-08
8,2024-09
9,2024-10


In [538]:
type(date_pd['date'])

pandas.core.series.Series

**Datetime object, timestamp, duration**

In [539]:
from datetime import datetime

start = datetime.strptime('2024-01-01', '%Y-%m-%d')
start

datetime.datetime(2024, 1, 1, 0, 0)

In [540]:
end = start + relativedelta(days=14)
end

datetime.datetime(2024, 1, 15, 0, 0)

In [541]:
pl_df = pl.DataFrame(pl.datetime_range(
                                    start=start,
                                    end=end,
                                    interval='2d',
                                    eager=True).alias('date'))

pl_df

date
datetime[μs]
2024-01-01 00:00:00
2024-01-03 00:00:00
2024-01-05 00:00:00
2024-01-07 00:00:00
2024-01-09 00:00:00
2024-01-11 00:00:00
2024-01-13 00:00:00
2024-01-15 00:00:00


In [542]:
today = datetime.strptime('2024-04-01', '%Y-%m-%d')

In [543]:
# Crerate new column delta
pl_df = pl_df.with_columns((today-pl.col('date')).alias('delta'))

# Create new column duration as integer
pl_df = pl_df.with_columns(pl.col('delta').dt.total_days().alias('delta-int'))

pl_df


date,delta,delta-int
datetime[μs],duration[μs],i64
2024-01-01 00:00:00,91d,91
2024-01-03 00:00:00,89d,89
2024-01-05 00:00:00,87d,87
2024-01-07 00:00:00,85d,85
2024-01-09 00:00:00,83d,83
2024-01-11 00:00:00,81d,81
2024-01-13 00:00:00,79d,79
2024-01-15 00:00:00,77d,77


# Group by + agg

In [544]:
import polars as pl

# Sample data (replace this with your actual data)
data = {
    'name': ['Alice', 'Bob', 'Alice', 'Bob', 'Alice'],
    'money': [100, 150, 200, 75, 300],
    'date': ['2022-01-01', '2022-02-15', '2022-03-10', '2022-04-20', '2022-05-05']
}

# Create a DataFrame
df = pl.DataFrame(data)

In [545]:
# Add dtypes
df = df.with_columns([
  pl.col('date').str.to_datetime("%Y-%m-%d")
])

df

name,money,date
str,i64,datetime[μs]
"""Alice""",100,2022-01-01 00:00:00
"""Bob""",150,2022-02-15 00:00:00
"""Alice""",200,2022-03-10 00:00:00
"""Bob""",75,2022-04-20 00:00:00
"""Alice""",300,2022-05-05 00:00:00


In [546]:
grouped_data = (df.group_by("name")
                .agg(pl.col("money").sum(), 
                     pl.col("date").max())
                )
grouped_data

name,money,date
str,i64,datetime[μs]
"""Bob""",225,2022-04-20 00:00:00
"""Alice""",600,2022-05-05 00:00:00


**Count**

In [547]:
grouped_data2 = (df.group_by("name")
                .agg(pl.col("money").sum(), 
                     pl.col("date").count().alias('count'))
                )
grouped_data2

name,money,count
str,i64,u32
"""Bob""",225,2
"""Alice""",600,3


**Polars - Iloc**

In [548]:
# Sample data (replace this with your actual data)
data = {
    'name': ['Alice', 'Bob', 'Alice', 'Bob', 'Alice'],
    'money': [100, 150, 200, 75, 300],
    'date': ['2022-01-01', '2022-02-15', '2022-03-10', '2022-04-20', '2022-05-05']
}

# Create a DataFrame
df = pl.DataFrame(data)
df

name,money,date
str,i64,str
"""Alice""",100,"""2022-01-01"""
"""Bob""",150,"""2022-02-15"""
"""Alice""",200,"""2022-03-10"""
"""Bob""",75,"""2022-04-20"""
"""Alice""",300,"""2022-05-05"""


In [549]:
df[0,-1]

'2022-01-01'

# Join

In [550]:
import polars as pl

dic = { 1: 'a', 
       2: 'b', 
       3: 'c' 
       }


mapper = pl.DataFrame({
    "keys": list(dic.keys()),
    "values": list(dic.values())
})


pl.Series([1, 2, 3, 4]).to_frame("keys").join(mapper, on="keys", how="left")





""" # Strip off column from df and convert into a frame
new_column = df_rfm_eval['cluster'].to_frame("keys")


# Add new column
df_rfm_eval = df_rfm_eval.with_columns(pl.lit(new_column.join(mapper, on="keys", how="left").to_series(1)).alias('description'))
df_rfm_eval """

' # Strip off column from df and convert into a frame\nnew_column = df_rfm_eval[\'cluster\'].to_frame("keys")\n\n\n# Add new column\ndf_rfm_eval = df_rfm_eval.with_columns(pl.lit(new_column.join(mapper, on="keys", how="left").to_series(1)).alias(\'description\'))\ndf_rfm_eval '

# Cast

**Map items**

In [551]:
import polars as pl

# Assuming rfm is a Polars DataFrame
rfm = pl.DataFrame({
    'R': [1, 2, 3],
    'F': [4, 5, 6],
    'M': [7, 8, 9]
})

rfm

R,F,M
i64,i64,i64
1,4,7
2,5,8
3,6,9


In [552]:
rfm = rfm.with_columns((rfm['R'].cast(str) + rfm['F'].cast(str) + rfm['M'].cast(str)).alias('segment'))

rfm

R,F,M,segment
i64,i64,i64,str
1,4,7,"""147"""
2,5,8,"""258"""
3,6,9,"""369"""


# Replace and str.replace (REGEX)

**Replace from dictionary map**

In [553]:
import polars as pl

df = pl.DataFrame({'name':['John' , 'Alice', 'David', 'Kate']})
df

name
str
"""John"""
"""Alice"""
"""David"""
"""Kate"""


In [554]:
seg_dict = {'John' : 'idiot',
            'Alice' : 'hooker',
            'David' : 'gay'
            }

In [555]:
df = df.with_columns((pl.col("name").replace(seg_dict, default='not found')).alias('new'))
df

name,new
str,str
"""John""","""idiot"""
"""Alice""","""hooker"""
"""David""","""gay"""
"""Kate""","""not found"""


**Replace string using Regex**

In [556]:
import polars as pl
df = pl.DataFrame({
                   'name' : ['Martin', 'John', 'Evita', 'Alice', 'David', 'Anna'],
                   'age' : [35, 13, 6, 15, 40, 85]
                   
                   })
df

name,age
str,i64
"""Martin""",35
"""John""",13
"""Evita""",6
"""Alice""",15
"""David""",40
"""Anna""",85


In [557]:
'''
^ = Matches "pattern" only if it appears at the beginning of a line or string.
$ = Matches "pattern" only if it appears at the END of a line or string.
'''

dict_map = {  '^[1-9]$':'kid', 
              '[1][1-9]':'teen',
              '[2-5][0-9]' :'adult',
              '[5-9][0-9]' :'elderly'
       }

In [558]:
# Convert dictionary KEYS onto a list
keys = list(dict_map)
keys

['^[1-9]$', '[1][1-9]', '[2-5][0-9]', '[5-9][0-9]']

In [559]:
# Convert dictionary VALUES into a list
values = list(dict_map.values())
values

['kid', 'teen', 'adult', 'elderly']

In [560]:
keys[0]

'^[1-9]$'

In [561]:
df = df.with_columns((pl.col("age")
                      .str.replace(keys[0], values[0])
                      .str.replace(keys[1], values[1])
                      .str.replace(keys[2], values[2])
                      .str.replace(keys[3], values[3])
                      ).alias('age group')
                      )
df

name,age,age group
str,i64,str
"""Martin""",35,"""adult"""
"""John""",13,"""teen"""
"""Evita""",6,"""kid"""
"""Alice""",15,"""teen"""
"""David""",40,"""adult"""
"""Anna""",85,"""elderly"""


# Add column - 2 ways

**First way - using alias**

In [562]:
import polars as pl

df = pl.DataFrame({'age':[15, 20 , 36, 45],
                   'salary':[450, 350 , 600, 700],
                   })
df

age,salary
i64,i64
15,450
20,350
36,600
45,700


In [563]:
# first way
df = df.with_columns((pl.col('age')/pl.col('salary')).alias('divided'))
df

age,salary,divided
i64,i64,f64
15,450,0.033333
20,350,0.057143
36,600,0.06
45,700,0.064286


**Second way - define name of column as variable**

In [564]:
# second way
df = df.with_columns(multiplied=(pl.col('age')*pl.col('salary')))
df

age,salary,divided,multiplied
i64,i64,f64,i64
15,450,0.033333,6750
20,350,0.057143,7000
36,600,0.06,21600
45,700,0.064286,31500
