# Setup

In [1]:
from piper.defaults import *

piper v0.1.0: Monday, 29 March 2021 19:05:03


# Examples

### info()

In [2]:
%piper sample_data() >> info() >> head()

Dataframe consumes 0.08 Mb


7 rows, 6 columns


Unnamed: 0,columns,type,n,isna,isnull,unique
0,dates,datetime64[ns],367,0,0,367
1,order_dates,datetime64[ns],367,0,0,367
2,countries,object,367,0,0,8
3,regions,object,367,0,0,4


### %piper (assign to dataframe variable)

In [3]:
%piper df <- sample_data()

### columns()

In [4]:
%piper df >> columns(astype='text')

"['dates', 'order_dates', 'countries', 'regions', 'ids', 'values_1', 'values_2']"

### select() with str.contains()

In [5]:
%%piper 
df 
>> select(df.columns.str.contains('value')) 
>> head()

Use %piper/%%piper --info to see rendered pandas pipe statement


'NoneType' object has no attribute 'pipe'


### where()

In [6]:
%piper sample_data() >> where("countries == 'Italy' ") >> head()

47 rows, 7 columns


Unnamed: 0,dates,order_dates,countries,regions,ids,values_1,values_2
0,2020-01-01,2020-01-07,Italy,East,A,311,26
3,2020-01-04,2020-01-10,Italy,East,B,319,233
4,2020-01-05,2020-01-11,Italy,East,D,261,187
7,2020-01-08,2020-01-14,Italy,South,E,273,56


### count()

In [7]:
%piper sample_data() >> count('countries') >> head()

8 rows, 3 columns


Unnamed: 0_level_0,n,%,cum %
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,56,15.26,15.26
Italy,47,12.81,28.07
Spain,47,12.81,40.87
Norway,46,12.53,53.41


## Example 1

In [8]:
# Plain-pandas version of Example 1 (compare with the piper version below).
df = sample_data()

# Take an explicit copy of the filtered rows so the column assignments below
# work on an independent frame rather than on a slice of the original
# (avoids pandas' SettingWithCopyWarning).
df = df[df['countries'] == 'Italy'].copy()

df['new_column'] = df['countries'] + ' ' + df['regions']
df['new_calculated_field'] = df['values_1'] + df['values_2']

# Move 'new_column' to position 4 (pop removes it, insert puts it back there).
df.insert(4, 'new_column', df.pop('new_column'))

# Reassign instead of inplace=True so re-running the cell stays predictable.
df = df.drop(columns='dates')
df.head()

Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,new_calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


### Alternative piper solution

In [9]:
%%piper
sample_data() 
>> where("countries == 'Italy'") 
>> assign(new_column = lambda x: x.countries + ' ' + x.regions, 
          calculated_field = lambda x: x.values_1 + x.values_2)
>> relocate('new_column', loc='after', ref_column='regions') 
>> drop(columns='dates')
>> head(5)

47 rows, 8 columns


Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


In [10]:
# The same pipeline as the %%piper cell above, written out as explicit
# .pipe() calls instead of the `>>` magic syntax.
(
    sample_data()
    .pipe(where, "countries == 'Italy'")
    .pipe(
        assign,
        new_column=lambda frame: frame.countries + ' ' + frame.regions,
        calculated_field=lambda frame: frame.values_1 + frame.values_2,
    )
    .pipe(relocate, 'new_column', loc='after', ref_column='regions')
    .pipe(drop, columns='dates')
    .pipe(head, 5)
)

47 rows, 8 columns


Unnamed: 0,order_dates,countries,regions,new_column,ids,values_1,values_2,calculated_field
0,2020-01-07,Italy,East,Italy East,A,311,26,337
3,2020-01-10,Italy,East,Italy East,B,319,233,552
4,2020-01-11,Italy,East,Italy East,D,261,187,448
7,2020-01-14,Italy,South,Italy South,E,273,56,329
8,2020-01-15,Italy,East,Italy East,B,385,92,477


## Example 2

In [11]:
%%piper 
pd.read_csv('inputs/Belgium - Customers.csv') 
>> clean_columns() 
>> select(['customer', 'bill_type', 'customer_name']) 
>> where("customer > 1200") 
>> to_csv('outputs/test.csv')

In [12]:
%%piper
df <- sample_data() 
>> select('-dates') 
>> where("countries.isin(['Italy'])")  
>> assign(column_test=lambda x: x.countries + ' ABC') 

In [13]:
head(df)

47 rows, 7 columns


Unnamed: 0,order_dates,countries,regions,ids,values_1,values_2,column_test
0,2020-01-07,Italy,East,A,311,26,Italy ABC
3,2020-01-10,Italy,East,B,319,233,Italy ABC
4,2020-01-11,Italy,East,D,261,187,Italy ABC
7,2020-01-14,Italy,South,E,273,56,Italy ABC


## Example 3

In [14]:
%%piper 
df <- sample_data() 
>> select(['-countries', '-regions']) 
>> where(" ids.isin(['A', 'D']) & values_1 > 300 ") 
>> assign(new_field = lambda x: x.ids + ' ' + x.dates.astype(str),
>>        values_mult=lambda x: x.values_1 * x.values_2) 
>> relocate(['values_mult', 'new_field'], loc=2) 
# >> relocate('values_mult', loc=5) 
>> order_by(['ids', 'values_1'], ascending=[False, True]) 

In [15]:
head(df)

40 rows, 5 columns


Unnamed: 0,dates,order_dates,ids,values_1,values_2
38,2020-02-08,2020-02-14,D,305,384
281,2020-10-08,2020-10-14,D,306,39
177,2020-06-26,2020-07-02,D,313,64
191,2020-07-10,2020-07-16,D,316,289


## Example 4

In [16]:
%%piper

sample_data() 

>> select('-dates')  
>> where("countries.isin(['Italy', 'Spain'])")  

>> assign(column_test=lambda x: x.countries + ' ABC') 

>> group_by(['countries', 'regions']) 
>> summarise(total_values_1=pd.NamedAgg('values_1', 'sum'),
             new_field=pd.NamedAgg('values_2', 'sum')) 

>> head()

8 rows, 2 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,total_values_1,new_field
countries,regions,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,East,3023,3763
Italy,North,1868,1926
Italy,South,2520,2790
Italy,West,2489,1917


## Example 5: User-defined function in a pipeline

In [17]:
def an_arbitary_function(df, a='default a:', b='default b:'):
    ''' Example of a user-defined step that can be used with the piper tool.

    The function accepts a dataframe object and returns that same object,
    which is what allows it to sit in the middle of a pipeline.

    As a visible side effect it prints two lines: ``a`` repeated 4 times,
    then ``b`` repeated 5 times.

    Parameters
    ----------
    df : object received from the previous step; returned unchanged
    a : value printed 4 times on the first line
    b : value printed 5 times on the second line

    Returns
    -------
    The ``df`` object that was passed in.
    '''
    for text, repeats in ((a, 4), (b, 5)):
        print(text * repeats)

    return df

In [18]:
%%piper 
sample_data() 
>> an_arbitary_function(a='override a:, ') 
>> select(['-dates']) 
>> where(" ~countries.isin(['Italy', 'Spain']) & ids.isin(['A','D'])") 
>> select('-order_dates') 
>> group_by(['countries']) 
>> summarise(total_values_1 = pd.NamedAgg('values_1', 'sum'), 
             total_values_2 = pd.NamedAgg('values_2', 'sum')) 

override a:, override a:, override a:, override a:, 
default b:default b:default b:default b:default b:


Unnamed: 0_level_0,total_values_1,total_values_2
countries,Unnamed: 1_level_1,Unnamed: 2_level_1
France,4912,5887
Germany,2459,2607
Norway,3992,4269
Portugal,3907,4470
Sweden,4372,4700
Switzerland,3739,3489


## Example 6: Group % with pivot_table

In [19]:
df = sample_data()

rule = 'Q'
index = ['dates', 'order_dates', 'regions', 'ids']
rename_cols = ['period', 'order_month', 'region', 'code']

group_percent_index = ['period', 'order_month', 'region']

### pandas

In [20]:
def percent_x(x):
    """Return each element of ``x`` as a percentage of ``x.sum()``, rounded to 2 decimals."""
    return (x * 100 / x.sum()).round(2)


def piper_group_percent_example(x):
    """Apply ``percent_x`` to 'values_2' within each ``group_percent_index`` group of ``x``."""
    return x.groupby(group_percent_index)['values_2'].transform(percent_x)


# Pivot the sample data, then rename the index levels to ``rename_cols``.
p2 = (
    pivot_table(df, index=index, freq=rule, format_date=True)
    .rename_axis(rename_cols, axis='rows')
)

# Share of 'values_2' that each row contributes to its group total.
p2['%reg_totval2'] = piper_group_percent_example(p2)
p2.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,values_1,values_2,%reg_totval2
period,order_month,region,code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mar 2020,Mar 2020,East,A,227.875,184.25,15.39
Mar 2020,Mar 2020,East,B,203.2,168.0,14.03
Mar 2020,Mar 2020,East,C,126.0,367.0,30.65
Mar 2020,Mar 2020,East,D,125.666667,259.0,21.63
Mar 2020,Mar 2020,East,E,194.0,219.0,18.29
Mar 2020,Mar 2020,North,A,219.75,216.0,22.82


### piper

In [21]:
%%piper 

sample_data() 
>> pivot_table(index=index, freq='Q', format_date=True)
>> rename_axis(rename_cols, axis='rows')
>> transform(group_percent_index, totval2_percent=('values_2', 'percent'))
>> head(6)

101 rows, 3 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,values_1,values_2,totval2_percent
period,order_month,region,code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mar 2020,Mar 2020,East,A,227.875,184.25,15.39
Mar 2020,Mar 2020,East,B,203.2,168.0,14.03
Mar 2020,Mar 2020,East,C,126.0,367.0,30.65
Mar 2020,Mar 2020,East,D,125.666667,259.0,21.63
Mar 2020,Mar 2020,East,E,194.0,219.0,18.29
Mar 2020,Mar 2020,North,A,219.75,216.0,22.82


## Example 6 (continued): Group % with groupby

### pandas

In [22]:
rule = 'A'
grouper = pd.Grouper(key='dates', freq=rule)
grouper2 = pd.Grouper(key='order_dates', freq=rule)
index = [grouper, grouper2, 'ids']

group_percent_index = ['dates', 'order_dates']
func = lambda x: (x * 100 / x.sum()).round(2)

In [23]:
df = sample_data()

# Sum both value columns per ``index`` group, then add each total's
# percentage share within its ``group_percent_index`` group.
gb = (
    df.groupby(index)
    .agg(totval1=('values_1', 'sum'), totval2=('values_2', 'sum'))
    .assign(
        percent_val1=lambda totals: totals.groupby(group_percent_index)['totval1'].transform(func),
        percent_val2=lambda totals: totals.groupby(group_percent_index)['totval2'].transform(func),
    )
)
gb

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,ids,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,2020-12-31,A,17589,19248,24.31,26.1
2020-12-31,2020-12-31,B,13807,15241,19.08,20.67
2020-12-31,2020-12-31,C,13342,12770,18.44,17.32
2020-12-31,2020-12-31,D,13363,12461,18.47,16.9
2020-12-31,2020-12-31,E,14263,14031,19.71,19.02
2020-12-31,2021-12-31,A,215,369,24.77,28.19
2020-12-31,2021-12-31,B,82,218,9.45,16.65
2020-12-31,2021-12-31,C,391,329,45.05,25.13
2020-12-31,2021-12-31,E,180,393,20.74,30.02
2021-12-31,2021-12-31,B,372,103,100.0,100.0


### Alternative piper solution - groupby

In [24]:
index = ['dates', 'order_dates', 'ids']
sub_index = ['dates', 'order_dates']

In [25]:
%%piper 
sample_data()  
>> group_by(index, freq='A') 
>> summarise(totval1=('values_1', 'sum'), totval2=('values_2', 'sum')) 
>> transform(sub_index, percent_val1=('totval1', 'percent')) 
>> transform(sub_index, percent_val2=('totval2', 'percent')) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,ids,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-31,2020-12-31,A,17589,19248,24.31,26.1
2020-12-31,2020-12-31,B,13807,15241,19.08,20.67
2020-12-31,2020-12-31,C,13342,12770,18.44,17.32
2020-12-31,2020-12-31,D,13363,12461,18.47,16.9
2020-12-31,2020-12-31,E,14263,14031,19.71,19.02
2020-12-31,2021-12-31,A,215,369,24.77,28.19
2020-12-31,2021-12-31,B,82,218,9.45,16.65
2020-12-31,2021-12-31,C,391,329,45.05,25.13
2020-12-31,2021-12-31,E,180,393,20.74,30.02
2021-12-31,2021-12-31,B,372,103,100.0,100.0


## Example 7:  Group % with assign

In [26]:
rule = 'M'
grouper = pd.Grouper(key='dates', freq=rule)
grouper2 = pd.Grouper(key='order_dates', freq=rule)

index = [grouper, grouper2, 'regions']
group_percent_index = ['dates', 'order_dates']

In [27]:
def func(x):
    """Return each element of ``x`` as a percentage of ``x.sum()``, rounded to 2 decimals."""
    return (x * 100 / x.sum()).round(2)


def _group_percent(column):
    """Build a callable that applies ``func`` to ``column`` per ``group_percent_index`` group."""
    def _apply(x):
        # group_percent_index and func are looked up when the callable runs,
        # so it follows whatever those names are bound to at that point.
        return x.groupby(group_percent_index)[column].transform(func)
    return _apply


piper_func1 = _group_percent('totval1')
piper_func2 = _group_percent('totval2')

In [28]:
%%piper
pd.pivot_table(sample_data(), index=index)
>> rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
>> assign(percent_val1 = piper_func1, percent_val2 = piper_func2) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,totval1,totval2,percent_val1,percent_val2
dates,order_dates,regions,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-31,2020-01-31,East,231.222222,182.555556,28.04,24.81
2020-01-31,2020-01-31,North,220.666667,175.833333,26.76,23.89
2020-01-31,2020-01-31,South,161.833333,156.500000,19.63,21.27
2020-01-31,2020-01-31,West,210.750000,221.000000,25.56,30.03
2020-01-31,2020-02-29,East,171.000000,393.000000,28.56,55.64
...,...,...,...,...,...,...
2020-12-31,2020-12-31,West,174.500000,163.500000,22.66,19.22
2020-12-31,2021-01-31,East,118.500000,194.500000,33.33,34.50
2020-12-31,2021-01-31,North,197.000000,275.333333,55.41,48.83
2020-12-31,2021-01-31,South,40.000000,94.000000,11.25,16.67


In [29]:
index = [grouper, grouper2, 'regions']

In [30]:
%%piper

sample_data()
>> pd.pivot_table(get_sample_data(), index=index)
>> rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
>> transform(group_percent_index, val1_perc=('totval1', 'percent'))
>> transform(group_percent_index, val2_perc=('totval2', 'percent'))

Use %piper/%%piper --info to see rendered pandas pipe statement


name 'get_sample_data' is not defined
