A port of dplyr and other related R packages to Python, using pipda.
Documentation | Reference Maps | Notebook Examples | API | Blog
Unlike other similar packages in python that just mimic the piping syntax, datar
follows the API designs from the original packages as much as possible, and is tested thoroughly with the cases from the original packages, so that minimal effort is needed for those who are familiar with those R packages to transition to python.
pip install -U datar
# to make sure dependencies are up-to-date
# pip install -U varname pipda datar
datar requires python 3.7.1+ and is backed by pandas (1.2+).
from datar import f
from datar.dplyr import mutate, filter, if_else
from datar.tibble import tibble
# or
# from datar.all import f, mutate, filter, if_else, tibble
df = tibble(
x=range(4),
y=['zero', 'one', 'two', 'three']
)
df >> mutate(z=f.x)
"""# output
x y z
<int64> <object> <int64>
0 0 zero 0
1 1 one 1
2 2 two 2
3 3 three 3
"""
df >> mutate(z=if_else(f.x>1, 1, 0))
"""# output:
x y z
<int64> <object> <int64>
0 0 zero 0
1 1 one 0
2 2 two 1
3 3 three 1
"""
df >> filter(f.x>1)
"""# output:
x y
<int64> <object>
0 2 two
1 3 three
"""
df >> mutate(z=if_else(f.x>1, 1, 0)) >> filter(f.z==1)
"""# output:
x y z
<int64> <object> <int64>
0 2 two 1
1 3 three 1
"""
# works with plotnine
# example grabbed from https://github.com/has2k1/plydata
import numpy
from datar.base import sin, pi
from plotnine import ggplot, aes, geom_line, theme_classic
df = tibble(x=numpy.linspace(0, 2*pi, 500))
(df >>
mutate(y=sin(f.x), sign=if_else(f.y>=0, "positive", "negative")) >>
ggplot(aes(x='x', y='y')) +
theme_classic() +
geom_line(aes(color='sign'), size=1.2))
# easy to integrate with other libraries
# for example: klib
import klib
from pipda import register_verb
from datar.datasets import iris
from datar.dplyr import pull
dist_plot = register_verb(func=klib.dist_plot)
iris >> pull(f.Sepal_Length) >> dist_plot()
See datar-cli
Example:
❯ datar import table2 | datar head
country year type count
<object> <int64> <object> <int64>
0 Afghanistan 1999 cases 745
1 Afghanistan 1999 population 19987071
2 Afghanistan 2000 cases 2666
3 Afghanistan 2000 population 20595360
4 Brazil 1999 cases 37737
5 Brazil 1999 population 172006362
❯ datar import table2 | \
datar mutate --count "if_else(f.year==1999, f.count*2, f.count)"
country year type count
<object> <int64> <object> <int64>
0 Afghanistan 1999 cases 1490
1 Afghanistan 1999 population 39974142
2 Afghanistan 2000 cases 2666
3 Afghanistan 2000 population 20595360
4 Brazil 1999 cases 75474
5 Brazil 1999 population 344012724
6 Brazil 2000 cases 80488
7 Brazil 2000 population 174504898
8 China 1999 cases 424516
9 China 1999 population 2545830544
10 China 2000 cases 213766
11 China 2000 population 1280428583