# Setup

In [1]:
from piper.defaults import *

piper v0.1.0: Monday, 29 March 2021 19:05:07


# Import data

In [2]:
# Sample sales workbook served from the datagy/pivot_table_pandas GitHub repository.
url = 'https://github.com/datagy/pivot_table_pandas/raw/master/sample_pivot.xlsx'

# Read the workbook into df, parsing the Date column as datetimes.
# NOTE(review): `<-` looks like piper's assignment syntax for the magic — confirm in the piper docs.
%piper df <- pd.read_excel(url, parse_dates=['Date'])
# Display a per-column summary of df (type, row count, missing values, unique values).
%piper df >> info()

Dataframe consumes 0.15 Mb


Unnamed: 0,columns,type,n,isna,isnull,unique
0,Date,datetime64[ns],1000,0,0,347
1,Region,object,1000,0,0,4
2,Type,object,1000,0,0,3
3,Units,float64,1000,89,89,33
4,Sales,int64,1000,0,0,329


## Region (ordered categorical)

In [3]:
%%piper
df 
>> count('Region')
>> list()
# Listing the count result yields its column labels, here n, % and cum %

['n', '%', 'cum %']

In [4]:
# Make Region an ordered categorical so grouped output follows this explicit
# North, South, East, West order instead of alphabetical order.
region_categories = pd.CategoricalDtype(
    categories=['North', 'South', 'East', 'West'],
    ordered=True,
)
df['Region'] = df['Region'].astype(region_categories)

### Region/Type => count() >> adorn() - ignore_row_index=False

In [5]:
%%piper 
df >> count(['Region', 'Type'], sort_values=None, totals=True)
# Frequency of each Region and Type pair; with sort_values None the rows stay in group order rather than being sorted by n.

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%,cum %
Region,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
North,Children's Clothing,85.0,8.5,8.5
North,Men's Clothing,89.0,8.9,17.4
North,Women's Clothing,142.0,14.2,31.6
South,Children's Clothing,45.0,4.5,36.1
South,Men's Clothing,39.0,3.9,40.0
South,Women's Clothing,53.0,5.3,45.3
East,Children's Clothing,113.0,11.3,56.6
East,Men's Clothing,122.0,12.2,68.8
East,Women's Clothing,176.0,17.6,86.4
West,Children's Clothing,42.0,4.2,90.6


### Region/Type => count() >> adorn() - ignore_row_index=True

In [6]:
%%piper 
df  >> count(['Region', 'Type'], sort_values=None, totals=True)
# NOTE review - this is the same statement as the previous count cell apart from spacing, so it yields the same table.

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%,cum %
Region,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
North,Children's Clothing,85.0,8.5,8.5
North,Men's Clothing,89.0,8.9,17.4
North,Women's Clothing,142.0,14.2,31.6
South,Children's Clothing,45.0,4.5,36.1
South,Men's Clothing,39.0,3.9,40.0
South,Women's Clothing,53.0,5.3,45.3
East,Children's Clothing,113.0,11.3,56.6
East,Men's Clothing,122.0,12.2,68.8
East,Women's Clothing,176.0,17.6,86.4
West,Children's Clothing,42.0,4.2,90.6


### Region => count() >> adorn() - ignore_row_index=True

In [7]:
%%piper 
df >> count('Region', totals=True)
# Region frequencies; the output ends with a Total row, presumably added by the totals option.

Unnamed: 0,n,%,cum %
East,411,41.1,41.1
North,316,31.6,72.7
South,137,13.7,86.4
West,136,13.6,100.0
Total,1000,100.0,


## Units (missing data)

In [8]:
# Frequency table of Units values, limited to the first five rows.
%piper count(df, 'Units') >> head(5)

34 rows, 3 columns


Unnamed: 0_level_0,n,%,cum %
Units,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,89.0,8.9,8.9
18.0,42.0,4.2,13.1
26.0,37.0,3.7,16.8
13.0,36.0,3.6,20.4
34.0,34.0,3.4,23.8


In [9]:
# Preview the first four rows whose Units value is missing.
%piper df.query('Units.isna()') >> head(4)

89 rows, 5 columns


Unnamed: 0,Date,Region,Type,Units,Sales
11,2020-08-09,North,Men's Clothing,,270
19,2020-03-08,North,Men's Clothing,,644
25,2020-05-23,North,Men's Clothing,,240
37,2020-04-24,North,Men's Clothing,,900


### Units --> fillna to ensure pandas aggregation functions work correctly

In [10]:
# Replace missing Units with 0. The result is assigned back to the column rather
# than calling fillna with inplace=True on df.Units: that chained in-place call
# operates on an intermediate Series, emits a chained-assignment warning on
# recent pandas, and leaves df unchanged once Copy-on-Write is enabled.
df['Units'] = df['Units'].fillna(0)

## Region (summary/groupby) with totals

### Region => groupby() >> adorn() - ignore_row_index=True

In [11]:
%%piper 
df.groupby(['Region']).agg(TotalSales=('Sales', 'sum'))
>> pd.DataFrame.reset_index()
>> adorn(ignore_row_index=True)

Use %piper/%%piper --info to see rendered pandas pipe statement


adorn() got an unexpected keyword argument 'ignore_row_index'


### Region/Type => groupby() >> adorn() - ignore_row_index=False

In [12]:
# Total sales per Region/Type pair, with row and column totals added by adorn,
# cast to integers and then displayed in full.
g1 = (
    df.groupby(['Region', 'Type'])
      .agg(TotalSales=('Sales', 'sum'))
      .pipe(adorn, axis='both')
      .astype(int)
)
head(g1, len(g1))

13 rows, 2 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,TotalSales,All
Region,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
North,Children's Clothing,37306,37306
North,Men's Clothing,39975,39975
North,Women's Clothing,61419,61419
South,Children's Clothing,18570,18570
South,Men's Clothing,18542,18542
South,Women's Clothing,22203,22203
East,Children's Clothing,45849,45849
East,Men's Clothing,51685,51685
East,Women's Clothing,70229,70229
West,Children's Clothing,20182,20182


In [13]:
%%piper 
df.groupby(['Type', 'Region']).agg(TotalSales=('Sales', 'sum')).unstack()
>> adorn(axis='both', ignore_row_index=False)
>> flatten_cols(remove_prefix='TotalSales')
>> adorn(axis='column', col_row_name='total_north_south', 
         columns=['North', 'South'], ignore_row_index=False)
>> adorn(axis='column', col_row_name='total_east_west', 
         columns=['East', 'West'], ignore_row_index=False)
>> adorn(axis='column', col_row_name='Total', 
         columns=['total_north_south', 'total_east_west'], ignore_row_index=False)
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


adorn() got an unexpected keyword argument 'ignore_row_index'


### Region/Type => groupby() >> adorn() >> flatten_cols()

In [14]:
%%piper 
df.groupby(['Type', 'Region']).agg(TotalUnits=('Units', 'count')).unstack()
>> adorn(axis='both', ignore_row_index=False).astype(int)
>> flatten_cols(remove_prefix='TotalUnits')
>> head(4)

Use %piper/%%piper --info to see rendered pandas pipe statement


adorn() got an unexpected keyword argument 'ignore_row_index'


In [15]:
%%piper 
df.groupby(['Type', 'Region']).agg(TotalUnits=('Units', 'count'))
>> transform(index='Type', rank=('TotalUnits', 'rank'))
>> assign(rank=lambda x: x['rank'].astype(int))
>> order_by(['Type', 'rank'])
>> adorn(axis='row', columns='TotalUnits', ignore_row_index=False)

Use %piper/%%piper --info to see rendered pandas pipe statement


adorn() got an unexpected keyword argument 'ignore_row_index'


In [16]:
%%piper 
df.groupby(['Type', 'Region']).agg(TotalUnits=('Units', 'count'))
>> where("Region.isin(['East', 'North'])")
>> assign(calc_field=lambda x: x.TotalUnits * 100)
>> transform(index='Type', rank=('TotalUnits', 'rank'))
>> assign(rank=lambda x: x['rank'].astype(int))
>> order_by(['Type', 'rank'], ascending=[True, True])
>> adorn(axis='row', columns=['TotalUnits', 'calc_field'], ignore_row_index=False)
>> reset_index()

Use %piper/%%piper --info to see rendered pandas pipe statement


adorn() got an unexpected keyword argument 'ignore_row_index'


### Region/Type => pivot_table()

In [17]:
%%piper

df.pivot_table(index='Type',
               columns='Region',
               values='Date', 
               margins=True,
               aggfunc='count')
# Row counts per Type and Region, counted on the Date column.
# The margins option already supplies the All row and column, so no adorn step is piped after the pivot.

Region,North,South,East,West,All
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Children's Clothing,85,45,113,42,285
Men's Clothing,89,39,122,41,291
Women's Clothing,142,53,176,53,424
All,316,137,411,136,1000
