# Setup

In [1]:
import pandas as pd

from piper import piper
from piper.verbs import *
from piper.factory import *

# Examples

### info()

In [2]:
# Pipe the sample data through info() for a per-column summary (dtype, unique and null counts), then show its first rows.
%piper get_sample_data() >> info() >> head()

Dataframe with (rows, cols) (367, 7) consumes 0.08 Mb
7 rows, 6 columns


Unnamed: 0,columns,data_type,unique,isna,isnull,total_count
0,dates,datetime64[ns],367,0,0,367
1,order_dates,datetime64[ns],367,0,0,367
2,countries,object,8,0,0,367
3,regions,object,4,0,0,367


### %piper (assign to dataframe variable)

In [3]:
# `<-` stores the result of the expression on its right in the variable `df`, which later cells reuse.
%piper df <- get_sample_data()

### columns()

In [4]:
# List df's column names; with astype='text' they come back as one string (see the output below).
%piper df >> columns(astype='text')

"['dates', 'order_dates', 'countries', 'regions', 'ids', 'values_1', 'values_2']"

### contains()

In [5]:
%%piper 
df 
>> contains('value') 
>> head()

367 rows, 2 columns


Unnamed: 0,values_1,values_2
0,311,26
1,150,375
2,396,88
3,319,233


### where()

In [6]:
# Keep only the rows where countries equals 'Italy', then show the first rows of the result.
%piper get_sample_data() >> where("countries == 'Italy' ") >> head()

47 rows, 7 columns


Unnamed: 0,dates,order_dates,countries,regions,ids,values_1,values_2
0,2020-01-01,2020-01-07,Italy,East,A,311,26
3,2020-01-04,2020-01-10,Italy,East,B,319,233
4,2020-01-05,2020-01-11,Italy,East,D,261,187
7,2020-01-08,2020-01-14,Italy,South,E,273,56


### count()

In [7]:
# Tally rows per country; the result has one row per country with the tally in column `n`.
%piper get_sample_data() >> count('countries') >> head()

8 rows, 2 columns


Unnamed: 0,countries,n
0,France,56
1,Spain,47
2,Italy,47
3,Norway,46


## Example 1

In [8]:
# Conventional pandas: the same steps the piper pipeline below expresses as one chain.
df = get_sample_data()

# Filter to Italy. The explicit .copy() makes the result an independent frame, so
# the column assignments below do not trigger SettingWithCopyWarning.
df = df[df['countries'] == 'Italy'].copy()

df['new_column'] = df['countries'] + ' ' + df['regions']
df['new_calculated_field'] = df['values_1'] + df['values_2']

# Move new_column to position 4, i.e. directly after 'regions'.
df.insert(4, 'new_column', df.pop('new_column'))

# Reassign instead of mutating in place (inplace=True).
df = df.drop(columns='dates')
df.head()

Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,new_calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


### Alternative piper solution

In [9]:
%%piper
get_sample_data() 
>> where("countries == 'Italy'") 
>> assign(new_column = lambda x: x.countries + ' ' + x.regions, 
          calculated_field = lambda x: x.values_1 + x.values_2)
>> relocate('new_column', loc='after', ref_column='regions') 
>> drop(columns='dates')
>> head(5)

47 rows, 8 columns


Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


In [10]:
# Same pipeline as the %%piper cell above, written with pandas' own
# DataFrame.pipe(): each piper verb is passed as the callable, followed by
# the arguments it would receive inside the magic.
(get_sample_data()
.pipe(where, "countries == 'Italy'")
.pipe(assign, new_column = lambda x: x.countries + ' ' + x.regions,
      calculated_field = lambda x: x.values_1 + x.values_2)
.pipe(relocate, 'new_column', loc='after', ref_column='regions')
.pipe(drop, columns='dates')
.pipe(head, 5))

47 rows, 8 columns


Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


## Example 2

In [11]:
%%piper 
pd.read_csv('inputs/Belgium - Customers.csv') 
>> clean_columns() 
>> select(['customer', 'bill_type', 'customer_name']) 
>> where("customer > 1200") 
>> to_csv('outputs/test.csv')

In [12]:
%%piper
df <- get_sample_data() 
>> select('-dates') 
>> where("countries.isin(['Italy'])")  
>> assign(column_test=lambda x: x.countries + ' ABC') 

In [13]:
# Inspect the frame assigned by the %%piper cell above; head() also reports the frame's overall size.
head(df)

47 rows, 7 columns


Unnamed: 0,order_dates,countries,regions,ids,values_1,values_2,column_test
0,2020-01-07,Italy,East,A,311,26,Italy ABC
3,2020-01-10,Italy,East,B,319,233,Italy ABC
4,2020-01-11,Italy,East,D,261,187,Italy ABC
7,2020-01-14,Italy,South,E,273,56,Italy ABC


## Example 3

In [14]:
%%piper 
df <- get_sample_data() 
>> select(['-countries', '-regions']) 
>> where(" ids.isin(['A', 'D']) & values_1 > 300 ") 
>> assign(new_field = lambda x: x.ids + ' ' + x.dates.astype(str),
>>        values_mult=lambda x: x.values_1 * x.values_2) 
>> relocate(['values_mult', 'new_field'], loc=2) 
# >> relocate('values_mult', loc=5) 
>> order_by(['ids', 'values_1'], ascending=[False, True]) 

In [15]:
# Inspect the frame built by the pipeline above.
# NOTE(review): the recorded output below has 5 columns and no new_field/values_mult - re-run this cell to confirm.
head(df)

40 rows, 5 columns


Unnamed: 0,dates,order_dates,ids,values_1,values_2
38,2020-02-08,2020-02-14,D,305,384
281,2020-10-08,2020-10-14,D,306,39
177,2020-06-26,2020-07-02,D,313,64
191,2020-07-10,2020-07-16,D,316,289


## Example 4

In [16]:
%%piper

get_sample_data() 

>> select('-dates')  
>> where("countries.isin(['Italy', 'Spain'])")  

>> assign(column_test=lambda x: x.countries + ' ABC') 

>> group_by(['countries', 'regions']) 
>> summarise(total_values_1=pd.NamedAgg('values_1', 'sum'),
             new_field=pd.NamedAgg('values_2', 'sum')) 

>> head()

8 rows, 2 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,total_values_1,new_field
countries,regions,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,East,3023,3763
Italy,North,1868,1926
Italy,South,2520,2790
Italy,West,2489,1917


## Example 5: Using a user-defined function in a pipeline

In [17]:
def an_arbitary_function(df, a='default a:', b='default b:'):
    ''' Pass-through step showing that a user-defined function can take
    part in a piper pipeline: it accepts a dataframe and returns it.

    Prints ``a`` repeated four times and ``b`` repeated five times, each on
    its own line, then returns ``df`` untouched.
    '''
    for text, repeats in ((a, 4), (b, 5)):
        print(text * repeats)

    return df

In [18]:
%%piper 
get_sample_data() 
>> an_arbitary_function(a='override a:, ') 
>> select(['-dates']) 
>> where(" ~countries.isin(['Italy', 'Spain']) & ids.isin(['A','D'])") 
>> select('-order_dates') 
>> group_by(['countries']) 
>> summarise(total_values_1 = pd.NamedAgg('values_1', 'sum'), 
             total_values_2 = pd.NamedAgg('values_2', 'sum')) 

override a:, override a:, override a:, override a:, 
default b:default b:default b:default b:default b:


Unnamed: 0_level_0,total_values_1,total_values_2
countries,Unnamed: 1_level_1,Unnamed: 2_level_1
France,4912,5887
Germany,2459,2607
Norway,3992,4269
Portugal,3907,4470
Sweden,4372,4700
Switzerland,3739,3489


## Example 6a: Quarterly resample pivot with a group percentage

In [19]:
# Sample data shared by the pandas and piper versions of this example.
df = get_sample_data()

# One resample frequency, applied to both date columns.
rule = 'Q'
grouper, grouper2 = (pd.Grouper(key=date_column, freq=rule)
                     for date_column in ('dates', 'order_dates'))

# Pivot index and the names given to its levels afterwards.
index = [grouper, grouper2, 'regions', 'ids']
index_names = ['period', 'order_month', 'region', 'code']

# Index levels within which the percentage is calculated.
group_percent_index = ['period', 'order_month', 'region']


def percent_x(x):
    """Return each value as a percentage of the series total, rounded to 2 places."""
    return (x * 100 / x.sum()).round(2)


def piper_group_percent_example(x):
    """Return values_2 as a percentage of its total within group_percent_index."""
    return x.groupby(group_percent_index)['values_2'].transform(percent_x)

### Conventional pandas

In [20]:
# Build the pivot and name its index levels in one expression.
p2 = (resample_pivot(df, index=index, grouper=[grouper, grouper2], rule=rule)
      .rename_axis(index_names, axis='rows'))

# values_2 as a percentage of its (period, order_month, region) group total.
group_shares = p2.groupby(group_percent_index)['values_2'].transform(percent_x)
p2['%reg_totval2'] = group_shares
p2.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,values_1,values_2,%reg_totval2
period,order_month,region,code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020 Mar,2020 Mar,East,A,1823,1474,34.01
2020 Mar,2020 Mar,East,B,1016,840,19.38
2020 Mar,2020 Mar,East,C,126,367,8.47
2020 Mar,2020 Mar,East,D,377,777,17.93
2020 Mar,2020 Mar,East,E,776,876,20.21
2020 Mar,2020 Mar,North,A,879,864,26.43


### Alternative piper solution

In [21]:
%%piper 
p2 <- 
resample_pivot(df, index=index, grouper=[grouper, grouper2], rule=rule)
>> rename_axis(index_names, axis='rows')
>> assign(reg_totval2_percent = piper_group_percent_example) 

In [22]:
# Show the first 6 rows of the piper-built p2, for comparison with the conventional pandas result above.
head(p2, 6)

101 rows, 3 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,values_1,values_2,reg_totval2_percent
period,order_month,region,code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020 Mar,2020 Mar,East,A,1823,1474,34.01
2020 Mar,2020 Mar,East,B,1016,840,19.38
2020 Mar,2020 Mar,East,C,126,367,8.47
2020 Mar,2020 Mar,East,D,377,777,17.93
2020 Mar,2020 Mar,East,E,776,876,20.21
2020 Mar,2020 Mar,North,A,879,864,26.43


## Example 6b: Calculating a group percentage

In [23]:
# Group both date columns at the same frequency.
rule = 'A'
grouper = pd.Grouper(freq=rule, key='dates')
grouper2 = pd.Grouper(freq=rule, key='order_dates')
index = [grouper, grouper2, 'ids']

# Index levels within which the percentages are calculated.
group_percent_index = ['dates', 'order_dates']


def func(x):
    """Return each value as a percentage of the series total, rounded to 2 places."""
    return (x * 100 / x.sum()).round(2)

### Conventional pandas

In [24]:
df = get_sample_data()

# Aggregate the totals, then express each total as a percentage of its
# (dates, order_dates) group, all in one chain.
gb = (
    df.groupby(index)
    .agg(totval1=('values_1', 'sum'),
         totval2=('values_2', 'sum'))
    .assign(
        percent_val1=lambda totals: totals.groupby(group_percent_index)['totval1'].transform(func),
        percent_val2=lambda totals: totals.groupby(group_percent_index)['totval2'].transform(func),
    )
)
gb.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,ids,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,2020-12-31,A,17589,19248,24.31,26.1
2020-12-31,2020-12-31,B,13807,15241,19.08,20.67
2020-12-31,2020-12-31,C,13342,12770,18.44,17.32
2020-12-31,2020-12-31,D,13363,12461,18.47,16.9
2020-12-31,2020-12-31,E,14263,14031,19.71,19.02


### Alternative piper solution

In [25]:
def piper_func1(x):
    """Return totval1 as a percentage of its group total (groups: group_percent_index)."""
    return x.groupby(group_percent_index)['totval1'].transform(func)


def piper_func2(x):
    """Return totval2 as a percentage of its group total (groups: group_percent_index)."""
    return x.groupby(group_percent_index)['totval2'].transform(func)

In [26]:
%%piper 
p2 <- get_sample_data()  
>> group_by([grouper, grouper2, 'ids']) 
>> summarise(totval1=(pd.NamedAgg('values_1', 'sum')),
             totval2=(pd.NamedAgg('values_2', 'sum'))) 
>> assign(percent_val1 = piper_func1,
          percent_val2 = piper_func2) 

In [27]:
# Inspect the piper result; the values shown match the conventional pandas output above.
head(p2)

10 rows, 4 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,ids,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,2020-12-31,A,17589,19248,24.31,26.1
2020-12-31,2020-12-31,B,13807,15241,19.08,20.67
2020-12-31,2020-12-31,C,13342,12770,18.44,17.32
2020-12-31,2020-12-31,D,13363,12461,18.47,16.9


## Example 7: Resample with group percentage

In [28]:
# One resample frequency, applied to both date columns.
rule = 'A'
grouper, grouper2 = (pd.Grouper(key=date_column, freq=rule)
                     for date_column in ('dates', 'order_dates'))
index = [grouper, grouper2, 'regions']

In [29]:
# Index levels within which the percentages are calculated.
group_percent_index = ['dates', 'order_dates']
# Converts each value to a percentage of the series total, rounded to 2 decimal places.
func = lambda x: (x * 100 / x.sum()).round(2)

In [30]:
# Pivot the sample data, then add each value column's percentage share within
# its (dates, order_dates) group.
p2 = resample_pivot(get_sample_data(), index=index, grouper=grouper, rule=rule).assign(
    percent_val1=lambda pivot: pivot.groupby(group_percent_index)['values_1'].transform(func),
    percent_val2=lambda pivot: pivot.groupby(group_percent_index)['values_2'].transform(func),
)
p2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,values_1,values_2,percent_val1,percent_val2
dates,order_dates,regions,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020,2020-12-31,East,19201,21960,26.53,29.78
2020,2020-12-31,North,16614,17032,22.96,23.09
2020,2020-12-31,South,17752,17775,24.53,24.1
2020,2020-12-31,West,18797,16984,25.98,23.03
2020,2021-12-31,East,237,389,27.3,29.72
2020,2021-12-31,North,591,826,68.09,63.1
2020,2021-12-31,South,40,94,4.61,7.18
2021,2021-12-31,North,372,103,100.0,100.0


### Alternative piper solution

In [31]:
# Rebuild the groupers and index with rule 'M' for the piper cells below.
# NOTE(review): the conventional pandas cell above ran with rule='A', so its
# output is not like-for-like with the outputs below - confirm this is intended.
rule = 'M'
grouper = pd.Grouper(key='dates', freq=rule)
grouper2 = pd.Grouper(key='order_dates', freq=rule)
index = [grouper, grouper2, 'regions']

# Same grouping levels and percentage helper as defined earlier in this example.
group_percent_index = ['dates', 'order_dates']
func = lambda x: (x * 100 / x.sum()).round(2)

In [32]:
# Callables for assign(): each returns a total column as a percentage of its
# group total, with groups taken from group_percent_index.
# These repeat the piper_func1/piper_func2 definitions from the group-percentage example above.
piper_func1 = lambda x: x.groupby(group_percent_index)['totval1'].transform(func)
piper_func2 = lambda x: x.groupby(group_percent_index)['totval2'].transform(func)

In [33]:
%%piper
resample_pivot(get_sample_data(), index=index, grouper=grouper, rule=rule)
>> rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
>> assign(percent_val1 = piper_func1,
          percent_val2 = piper_func2) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,regions,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020 Jan,2020-01-31,East,2081,1643,39.87,36.34
2020 Jan,2020-01-31,North,1324,1055,25.37,23.34
2020 Jan,2020-01-31,South,971,939,18.61,20.77
2020 Jan,2020-01-31,West,843,884,16.15,19.55
2020 Jan,2020-02-29,East,171,393,15.86,41.76
...,...,...,...,...,...,...
2020 Dec,2020-12-31,West,1047,981,21.82,18.24
2020 Dec,2021-01-31,East,237,389,27.30,29.72
2020 Dec,2021-01-31,North,591,826,68.09,63.10
2020 Dec,2021-01-31,South,40,94,4.61,7.18


In [34]:
%%piper
resample_pivot(get_sample_data(), index=index, grouper=grouper, rule=rule)
>> rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
>> add_group_calc(column='percent_val1', value='totval1', index=group_percent_index)
>> add_group_calc(column='percent_val2', value='totval2', index=group_percent_index)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,regions,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020 Jan,2020-01-31,East,2081,1643,39.87,36.34
2020 Jan,2020-01-31,North,1324,1055,25.37,23.34
2020 Jan,2020-01-31,South,971,939,18.61,20.77
2020 Jan,2020-01-31,West,843,884,16.15,19.55
2020 Jan,2020-02-29,East,171,393,15.86,41.76
...,...,...,...,...,...,...
2020 Dec,2020-12-31,West,1047,981,21.82,18.24
2020 Dec,2021-01-31,East,237,389,27.30,29.72
2020 Dec,2021-01-31,North,591,826,68.09,63.10
2020 Dec,2021-01-31,South,40,94,4.61,7.18
