In [1]:
from siuba.siu import _, Lam
from siuba import meta_hook
from siuba.tidy import (
    mutate, group_by, ungroup, filter,
    summarize, transmute, select, arrange,
    if_else, case_when, count, add_count,
    nest, unnest, Pipeable
)

import pandas as pd

from pandas import DataFrame, Series

In [2]:
# Small demo frame of repository records used by the examples in this
# notebook. The trailing None in "x" gives the frame one missing value.
df = DataFrame(
    {
        "repo": ["pandas", "dplyr", "ggplot2", "plotnine"],
        "owner": ["pandas-dev", "tidyverse", "tidyverse", "has2k1"],
        "language": ["python", "R", "R", "python"],
        "stars": [17800, 2800, 3500, 1450],
        "x": [1, 2, 3, None],
    }
)

## mutate

In [3]:
from pandas.core.groupby import DataFrameGroupBy

In [4]:
# Group the demo frame on two keys at once.
gdf = group_by(df, "language", "owner")

# Hand-written counterpart of the grouped mutate below, expressed as an
# apply over the groups; it also adds a constant column "two".
# `weird` is not used again in this cell.
weird = gdf.apply(
    lambda d: DataFrame({
        "rel_stars": d.stars - d.stars.min(),
        "two": 2,
    })
)

# On grouped data, _.stars.min() is evaluated per group: in the rendered
# output, only ggplot2 (which shares a group with dplyr) is non-zero.
out = mutate(gdf, rel_stars1=_.stars - _.stars.min())

# Chain a second mutate onto the result of the first (not displayed).
out2 = mutate(out, rel_stars2=_.stars + _.stars)

# Display the first result after removing its grouping.
ungroup(out)


Unnamed: 0,repo,owner,language,stars,x,rel_stars1
0,dplyr,tidyverse,R,2800,2.0,0
1,ggplot2,tidyverse,R,3500,3.0,700
2,plotnine,has2k1,python,1450,,0
3,pandas,pandas-dev,python,17800,1.0,0


## filter

In [5]:
# TODO: change name filter to query?
# (the `filter` imported from siuba.tidy replaces Python's builtin here)

# Plain frame: each condition is passed as its own argument.
# This is not the cell's last expression, so its result is not rendered.
filter(
    df,
    _.stars > 3000,
    _.stars < 15000,
)

# Grouped frame: _.stars.min() is taken within each language group, so
# the lowest-starred repo of every language is dropped (see output).
gdf = group_by(df, "language")

ungroup(
    filter(gdf, _.stars != _.stars.min())
)

Unnamed: 0,repo,owner,language,stars,x
0,ggplot2,tidyverse,R,3500,3.0
1,pandas,pandas-dev,python,17800,1.0


## summarize

In [6]:
# Whole-frame summary: a single aggregate over all rows. The result is
# not rendered, because only the cell's final expression is shown.
summarize(df, min_stars=_.stars.min())

# Per-language summary: regroup, then compute the total and the minimum
# star count for each group.
gdf = group_by(df, "language")

summarize(
    gdf,
    ttl_stars=_.stars.sum(),
    wat=_.stars.min(),
)

Unnamed: 0,language,ttl_stars,wat
0,R,6300,2800
0,python,19250,1450


## transmute

In [7]:
# Ungrouped: stars relative to the minimum of the whole column.
# (Result not rendered; only the last expression of a cell is shown.)
transmute(df, "language", rel_stars1=_.stars - _.stars.min())

# Grouped: `gdf` is the language grouping assigned in an earlier cell, so
# the minimum is per language. The output keeps only the named column
# and the newly computed one.
ungroup(
    transmute(gdf, "language", rel_stars1=_.stars - _.stars.min())
)

Unnamed: 0,language,rel_stars1
0,R,0
1,R,700
2,python,16350
3,python,0


## select

In [8]:
# Design notes on expressing column selection with siu expressions,
# as in select(df, _.y == _.x, -_.language):
#  + can use dynamic values, e.g. colname == .x
#  + if select implements some name class, then nothing magic is happening:
#    _.y == _.x is equivalent to lambda cols: cols.y == cols.x
#  - long-winded (==, and _.y seems harder to read than "y")

# In the rendered result, column x appears under the name y and the
# language column is gone.
select(
    df,
    _.y == _.x,
    -_.language,
)

# Considered alternative with strings, e.g.
# select(df, "y = x", "language")
# select(df, dict(y = "x"), "language")

Unnamed: 0,y,stars,owner,repo
0,1.0,17800,pandas-dev,pandas
1,2.0,2800,tidyverse,dplyr
2,3.0,3500,tidyverse,ggplot2
3,,1450,has2k1,plotnine


## arrange

In [9]:
# Sort on two keys. The unary minus on owner presumably requests
# descending order -- TODO confirm against siuba's arrange.
# Result not rendered (not the cell's last expression).
arrange(
    df,
    -_.owner,
    _.repo,
)

# Sort by a computed key: the length of the owner string. In the output
# the shortest owner name comes first.
arrange(
    df,
    _.owner.str.len(),
)

Unnamed: 0,repo,owner,language,stars,x
3,plotnine,has2k1,python,1450,
1,dplyr,tidyverse,R,2800,2.0
2,ggplot2,tidyverse,R,3500,3.0
0,pandas,pandas-dev,python,17800,1.0


## helpers (if_else, case_when)

In [10]:
# Element-wise choice: "yeah" where repo equals "dplyr", otherwise "no".
# The condition here is an ordinary pandas comparison on df, not a siu
# expression; the rendered result is a NumPy array of strings.
if_else(df.repo == "dplyr", "yeah", "no")

array(['no', 'yeah', 'no', 'no'], dtype='<U4')

In [11]:
# Label every row of df. Dict keys are conditions (siu expressions, or a
# literal True acting as the catch-all); values are the labels to emit.
# Judging by the output, the first matching entry in dict order wins:
# 17800 stars yields "incredible!" although it also exceeds 1000 and 100.
case_when(
    df,
    {
        _.stars > 10000: "incredible!",
        _.stars > 1000: "pretty good!",
        _.stars > 100: "keep going!",
        True: "I don't know",
    },
)

array(['incredible!', 'pretty good!', 'pretty good!', 'pretty good!'],
      dtype='<U12')

## nest and unnest 

In [12]:
# data column is an array of DataFrames
# -_.language selects every column except language for nesting, leaving
# one row per language value in the output; `key` names the nested column.
nest(df, -_.language, key = "data")

Unnamed: 0,language,data
0,R,owner stars x repo 0 tidyverse...
1,python,owner stars x repo 0 pandas-...


In [13]:
# Round trip: unnesting the "data" column restores one row per repo.
# Column order and row order differ from the original df (see output).
unnest(nest(df, -_.language, key = "data"), "data")

Unnamed: 0,owner,stars,x,repo,language
0,tidyverse,2800,2.0,dplyr,R
1,tidyverse,3500,3.0,ggplot2,R
2,pandas-dev,17800,1.0,pandas,python
3,has2k1,1450,,plotnine,python


## count and add_count

In [14]:
# Number of rows for each (language, owner) combination; the tally is
# returned in a column named n, one row per combination.
count(df, "language", "owner")

Unnamed: 0,language,owner,n
0,R,tidyverse,2
1,python,has2k1,1
2,python,pandas-dev,1


In [15]:
# Tally rows per (language, owner) combination, but keep every original
# row: the count is appended as column n instead of collapsing the frame.
add_count(df, "language", "owner")

Unnamed: 0,repo,owner,language,stars,x,n
0,pandas,pandas-dev,python,17800,1.0,1
1,dplyr,tidyverse,R,2800,2.0,2
2,ggplot2,tidyverse,R,3500,3.0,2
3,plotnine,has2k1,python,1450,,1


## Piping

In [16]:
# Compose two Pipeable steps with >>. Calling the composite runs them
# left to right: add one first, then format the result.
f = (
    Pipeable(f=lambda x: x + 1)
    >> Pipeable(f=lambda x: "x is: {}".format(x))
)

# 2 -> 3 -> 'x is: 3'
f(2)

'x is: 3'

In [17]:
# For now, pass _ as first argument to pipe...
# Each verb is called with _ in the data position and then applied, left
# to right, to the frame piped in with >>. Here: add a column, then keep
# only rows with more than 5000 stars.
(
    df
    >> mutate(_, new_repo=_.repo + " waattt")
    >> filter(_, _.stars > 5000)
)

Unnamed: 0,repo,owner,language,stars,x,new_repo
0,pandas,pandas-dev,python,17800,1.0,pandas waattt
