# Polars try

This notebook is an example of using Polars, a DataFrame package for Python, and compares how long it takes to read data and run common DataFrame operations against pandas.

In [1]:
import time
import pandas as pd
import polars as pl
from datetime import datetime

In [2]:
# Time the construction of a small Polars DataFrame from Python literals.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2025, 1, 1),
            datetime(2025, 1, 2),
            datetime(2025, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)
end_time = time.perf_counter()
print(df)
print("Calculate time:", end_time - start_time, ' [s]')

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘
Calculate time: 0.0014829635620117188  [s]


In [3]:
# Load the benchmark CSV with Polars (the load itself is not timed here).
df_polars = pl.read_csv('./df_O_OK.csv')

# Time adding one derived column, xx_yy = 2 * xx.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_polars = df_polars.with_columns((pl.col("xx") * 2).alias("xx_yy"))
end_time = time.perf_counter()

# The measured step adds a column; it is not a concatenation.
print(f"Column-add time: {end_time - start_time}")
df_polars

FileNotFoundError: No such file or directory (os error 2): ./df_O_OK.csv

In [None]:
# Load the benchmark CSV with pandas (the load itself is not timed here).
df_pandas = pd.read_csv('./df_O_OK.csv')

# Time adding one derived column. The Polars cell computes 2 * xx, so the same
# arithmetic is used here to keep the two timings comparable.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_pandas['new_column'] = df_pandas['xx'] * 2
end_time = time.perf_counter()

# The measured step adds a column; it is not a concatenation.
print(f"Column-add time: {end_time - start_time}")


Concating time: 0.002630949020385742


In [None]:
# Preview the first five rows, including the derived xx_yy column.
df_polars.head(n=5)

Unnamed: 0_level_0,xx,xy,xz,yx,yy,yz,zx,zy,zz,Unnamed: 0,xx_yy
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,-3.85354,-0.02747,-0.46924,-0.17947,-0.96837,-0.34293,-0.76518,-0.35146,-3.00874,0.0,-7.70708
1,-3.91122,0.16417,0.02368,0.11499,-0.9787,-0.08939,0.21029,0.03088,-2.82456,1.0,-7.82244
2,-4.43528,-0.09935,0.1269,-0.14693,-1.06729,0.0671,-0.04337,-0.06472,-2.26729,2.0,-8.87056
3,-3.0288,-0.14833,0.15565,-0.19327,-0.97022,-0.10448,-0.21451,-0.01428,-3.78778,3.0,-6.0576
4,-4.5201,-0.01229,-0.08943,0.02149,-1.03491,-0.03804,-0.182,-0.02524,-2.23512,4.0,-9.0402


In [None]:
# Keep only the xx and yy columns and preview the first five rows.
df_polars.select(["xx", "yy"]).head(n=5)

xx,yy
f64,f64
-3.85354,-0.96837
-3.91122,-0.9787
-4.43528,-1.06729
-3.0288,-0.97022
-4.5201,-1.03491


In [None]:
import os

# Report the on-disk size of the benchmark CSV (bytes divided by 1024 * 1024).
file_path = './df_O_OK.csv'
size_in_bytes = os.stat(file_path).st_size
file_size = size_in_bytes / (1024 * 1024)

print(f"The size of the file is {file_size} mega bytes")

The size of the file is 27.43991756439209 mega bytes


## Pandas concat

In [None]:
# Horizontally concat: place the pandas frame next to itself (axis=1).
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_pandas_new = pd.concat([df_pandas, df_pandas], axis=1)
end_time = time.perf_counter()

print(f"Concating time: {end_time - start_time}")
df_pandas_new.shape

Concating time: 0.006927013397216797


(324521, 24)

In [None]:
# Vertically concat: stack the pandas frame on top of itself (axis=0).
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_pandas_new = pd.concat([df_pandas, df_pandas], axis=0)
end_time = time.perf_counter()

print(f"Concating time: {end_time - start_time}")
df_pandas_new.shape

Concating time: 0.008468866348266602


(649042, 12)

## Polars concat

In [None]:
# Vertically concat: stack the Polars frame on top of itself.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_polars_new = pl.concat([df_polars, df_polars])
end_time = time.perf_counter()

print(f"Concating time: {end_time - start_time}")
# Only the last expression of a cell is displayed, so just the shape is shown.
df_polars_new.shape

Concating time: 0.007108211517333984


(649042, 12)

In [None]:
# Horizontally concat: place a renamed copy of the Polars frame next to it.
# Every column of the copy gets a "_copy" suffix so the two frames share no
# column names (presumably required by the horizontal concat — confirm against
# the Polars docs). Deriving the mapping from the frame's own columns keeps
# this cell working when columns are added or renamed upstream.
df_polars_copy = df_polars.clone().rename(
    {name: f"{name}_copy" for name in df_polars.columns}
)

# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
df_polars_new = pl.concat([df_polars, df_polars_copy], how='horizontal')
end_time = time.perf_counter()

print(f"Concating time: {end_time - start_time}")
# Only the last expression of a cell is displayed, so just the shape is shown.
df_polars_new.shape

Concating time: 0.008697748184204102


(324521, 24)

## Pandas select

In [None]:
# Time selecting five columns from the pandas frame.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
pd_df_selected = df_pandas[['xx', 'xy', 'xz', 'yy', 'zz']]
end_time = time.perf_counter()

print(f"Select time: {end_time - start_time}")
pd_df_selected.head()

Select time: 0.009365081787109375


Unnamed: 0,xx,xy,xz,yy,zz
0,-3.85354,-0.02747,-0.46924,-0.96837,-3.00874
1,-3.91122,0.16417,0.02368,-0.9787,-2.82456
2,-4.43528,-0.09935,0.1269,-1.06729,-2.26729
3,-3.0288,-0.14833,0.15565,-0.97022,-3.78778
4,-4.5201,-0.01229,-0.08943,-1.03491,-2.23512


## Polars select

In [None]:
# Time selecting the same five columns from the Polars frame.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
start_time = time.perf_counter()
pl_df_selected = df_polars[['xx', 'xy', 'xz', 'yy', 'zz']]
end_time = time.perf_counter()

print(f"Select time: {end_time - start_time}")

Select time: 6.890296936035156e-05


# Polars try (take 2)

## Create new data with Polars, then write it to CSV

In [None]:
import numpy as np
import polars as pl
import pandas as pd

# create random DataFrame with 10 million rows and 2 columns
pl_df = pl.DataFrame({
    'a': np.random.rand(10000000),
    'b': np.random.rand(10000000)})

%timeit pl_df.write_csv('df.csv', separator=',')


# pd_df = pl_df.to_pandas()

# %timeit pd_df.to_csv('df_new.csv', sep=',')


343 ms ± 5.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandas sum

In [None]:
import pandas as pd
import numpy as np


# Load 'df.csv' into a pandas DataFrame (the file is read, not regenerated).
pd_df = pd.read_csv('df.csv')


# Time the per-column sum with pandas.
%timeit pd_df.sum()

11.9 ms ± 833 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Polars sum

In [None]:
import polars as pl
import numpy as np


# Load 'df.csv' into a Polars DataFrame (the file is read, not regenerated).
pl_df = pl.read_csv('df.csv')


# Time the per-column sum with Polars.
%timeit pl_df.sum()

KeyboardInterrupt: 

In [None]:
from math import exp as e


def sig(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)) of a scalar ``z``.

    The value is computed in a numerically stable way: ``exp`` is only ever
    called with a non-positive argument, so very negative inputs return a
    value near 0 instead of raising ``OverflowError``.

    Args:
        z: A real scalar.

    Returns:
        A float in the closed interval [0, 1].
    """
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow.
        return 1 / (1 + e(-z))
    # For negative z use the equivalent form exp(z) / (1 + exp(z)),
    # where exp(z) < 1 and therefore cannot overflow either.
    exp_z = e(z)
    return exp_z / (1 + exp_z)

## Pandas apply

In [None]:
%timeit pd_df.apply(lambda a: sig(a[0]+a[1]/100), axis=1)

39.2 s ± 133 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars apply

In [None]:
%timeit pl_df.apply(lambda a: sig(a[0]+a[1]/100))



4.5 s ± 43.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# create random DataFrame with 10 million rows and 2 columns
pl_df_new = pl.DataFrame({
    'c': np.random.rand(10000000),
    'd': np.random.rand(10000000)})

%timeit pl_df_new.write_csv('df_add.csv', separator=',')

341 ms ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandas concat (horizontal)

In [None]:
# Load both CSVs into pandas; the loads are outside the timed statement.
pd_df = pd.read_csv('df.csv')

pd_df_add = pd.read_csv('df_add.csv')

# Time placing the two frames side by side (axis=1).
%timeit pd.concat([pd_df, pd_df_add], axis=1)

53.7 ms ± 8.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import os

# Report the on-disk size of 'df_add.csv' (bytes divided by 1024 ** 3).
file_path = './df_add.csv'
size_in_bytes = os.stat(file_path).st_size
file_size = size_in_bytes / 1024**3

print(f"The size of the file is {file_size} giga bytes")

The size of the file is 0.35892702359706163 giga bytes


## Pandas concat (vertical)

In [None]:
%timeit pd.concat([pd_df, pd_df_add], axis=0)

213 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Polars concat (vertical)

In [None]:
pl_df = pl.read_csv('df.csv')

# create random DataFrame with 10 million rows and 2 columns
pl_df_add = pl.read_csv('df_add.csv')
pl_df_add_new = pl_df_add.clone().rename({
    'c': 'a', 'd': 'b'
    })

%timeit pl.concat([pl_df, pl_df_add_new])

58.2 ms ± 2.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Polars concat (horizontal)

In [None]:
print(pl_df_add.columns)
pl_df_add_new = pl_df_add.clone().rename({
    'c': 'c_copy', 'd': 'd_copy'
    })

%timeit pl.concat([pl_df, pl_df_add_new], how='horizontal')


['c', 'd']
53.8 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
