# Setup

In [1]:
from piper.defaults import *
from piper.jde import *

piper v0.1.0: Monday, 29 March 2021 19:08:11


# Importing data into a dataframe

In [2]:
%piper df <- sample_data()
%piper df >> head()

367 rows, 7 columns


Unnamed: 0,dates,order_dates,countries,regions,ids,values_1,values_2
0,2020-01-01,2020-01-07,Italy,East,A,311,26
1,2020-01-02,2020-01-08,Portugal,South,D,150,375
2,2020-01-03,2020-01-09,Spain,East,A,396,88
3,2020-01-04,2020-01-10,Italy,East,B,319,233


## count() / counts()

In [3]:
%%piper
df 
>> count('countries', totals=True)
>> head()

9 rows, 3 columns


Unnamed: 0,n,%,cum %
France,56,15.26,15.26
Italy,47,12.81,28.07
Spain,47,12.81,40.87
Norway,46,12.53,53.41


In [4]:
%%piper
df 
>> count('countries')
>> head()

8 rows, 3 columns


Unnamed: 0_level_0,n,%,cum %
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,56,15.26,15.26
Italy,47,12.81,28.07
Spain,47,12.81,40.87
Norway,46,12.53,53.41


## where()

In [5]:
query = "regions == 'East' and countries.isin(['Italy'])"

In [6]:
%%piper 
df 
>> where(query) 
>> count(columns=['regions', 'countries'],
          percent=True,
          cum_percent=True)
# .loc[('East', 'Italy'), 'cum %']

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%,cum %
regions,countries,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,Italy,17,100.0,100.0


### standard pandas

In [7]:
df = df.query(query)
df = count(df, ['regions', 'countries'], percent=True, cum_percent=True)
df.loc[('East', 'Italy'), 'cum %']

100.0

## summarise()

In [8]:
%%piper 
sample_data() 
>> select('-dates')
# >> where("regions == 'East' and values_1 == 29 ")
# >> group_by(['regions', 'countries'])
>> summarise({'values_1': 'sum', 'values_2': 'sum'})

values_1    73604
values_2    75163
dtype: int64

## add_jde_batch()

In [9]:
%%piper
sample_data()
# >> memory() 
# >> select(slice('regions', 'values_2'))
# >> assign(new_col = lambda x: x.values_2 * 4)
# >> group_by('regions')
# >> summarise({'new_col': 'mean'})
# >> rename(columns={'new_col': 'mean_value'}).reset_index()
>> sample(10)
>> add_jde_batch(col_prefix='aa_')
>> head().reset_index(drop=True)

367 rows, 7 columns


10 rows, 11 columns


Unnamed: 0,aa_us,aa_bt,aa_tn,aa_ln,dates,order_dates,countries,regions,ids,values_1,values_2
0,userid,ABC_20210329,1,,2020-03-21,2020-03-27,France,South,A,368,235
1,userid,ABC_20210329,1,,2020-07-17,2020-07-23,Switzerland,West,E,98,220
2,userid,ABC_20210329,1,,2020-03-01,2020-03-07,France,West,A,214,296
3,userid,ABC_20210329,1,,2020-05-24,2020-05-30,Sweden,East,A,368,376


## Categorical examples

In [10]:
%piper df <- sample_data()
%piper df >> head()

367 rows, 7 columns


Unnamed: 0,dates,order_dates,countries,regions,ids,values_1,values_2
0,2020-01-01,2020-01-07,Italy,East,A,311,26
1,2020-01-02,2020-01-08,Portugal,South,D,150,375
2,2020-01-03,2020-01-09,Spain,East,A,396,88
3,2020-01-04,2020-01-10,Italy,East,B,319,233


### countries

In [11]:
%piper df >> count('countries') >> list()

categories = ['France', 'Spain', 'Italy', 'Portugal',
 'Norway', 'Sweden', 'Germany', 'Switzerland']

country_cat_dtype = pd.CategoricalDtype(categories, ordered=True)
df.countries = df.countries.astype(country_cat_dtype)
logger.info(df.countries.dtype)

category


### regions

In [12]:
categories = ['North', 'East', 'West', 'South']
region_cat_type = pd.CategoricalDtype(categories, ordered=True)
df.regions = df.regions.astype(region_cat_type)

In [13]:
%piper df >> count('regions') >> pd.DataFrame.transpose()

regions,East,West,South,North
n,103.0,94.0,87.0,83.0
%,28.07,25.61,23.71,22.62
cum %,28.07,53.68,77.38,100.0


In [14]:
df.regions.dtype

CategoricalDtype(categories=['North', 'East', 'West', 'South'], ordered=True)

## groupby - agg using dictionary of aggregate functions

In [15]:
g1 = (df.groupby(['countries', 'regions'])
        .agg({'values_1': np.sum, 'values_2': np.sum})
        .rename(columns={'values_1': 'totval1', 'values_2': 'totval2'})
        .unstack()
     )
head(g1)

8 rows, 8 columns


Unnamed: 0_level_0,totval1,totval1,totval1,totval1,totval2,totval2,totval2,totval2
regions,North,East,West,South,North,East,West,South
countries,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
France,2275,2170,4861,2118,2926,3029,4065,1726
Spain,2447,2450,1752,3322,2457,2302,1447,2665
Italy,1868,3023,2489,2520,1926,3763,1917,2790
Portugal,1857,2469,1699,2476,1861,2837,2015,3456


## groupby - agg using tuples

In [16]:
g1 = (df.groupby(['countries', 'regions'])
        .agg(totval1=('values_1', 'sum'), totval2=('values_2', 'sum')) 
        .unstack()
     )
head(g1)

8 rows, 8 columns


Unnamed: 0_level_0,totval1,totval1,totval1,totval1,totval2,totval2,totval2,totval2
regions,North,East,West,South,North,East,West,South
countries,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
France,2275,2170,4861,2118,2926,3029,4065,1726
Spain,2447,2450,1752,3322,2457,2302,1447,2665
Italy,1868,3023,2489,2520,1926,3763,1917,2790
Portugal,1857,2469,1699,2476,1861,2837,2015,3456


## groupby - agg using named tuples

In [17]:
g1 = (df.groupby(['countries', 'regions'])
        .agg(totval1=pd.NamedAgg('values_1', 'sum'),
             totval2=pd.NamedAgg('values_2', 'sum'))  
        .unstack()
     )
head(g1)

8 rows, 8 columns


Unnamed: 0_level_0,totval1,totval1,totval1,totval1,totval2,totval2,totval2,totval2
regions,North,East,West,South,North,East,West,South
countries,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
France,2275,2170,4861,2118,2926,3029,4065,1726
Spain,2447,2450,1752,3322,2457,2302,1447,2665
Italy,1868,3023,2489,2520,1926,3763,1917,2790
Portugal,1857,2469,1699,2476,1861,2837,2015,3456


## pivot_table - agg using dictionary of aggregate functions

In [18]:
p1 = (pd.pivot_table(df, columns='regions', index='countries',
                     values=['values_1', 'values_2'], aggfunc=np.sum)
        .rename(columns={'values_1': 'totval1', 'values_2': 'totval2'}))
head(p1)

8 rows, 8 columns


Unnamed: 0_level_0,totval1,totval1,totval1,totval1,totval2,totval2,totval2,totval2
regions,North,East,West,South,North,East,West,South
countries,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
France,2275,2170,4861,2118,2926,3029,4065,1726
Spain,2447,2450,1752,3322,2457,2302,1447,2665
Italy,1868,3023,2489,2520,1926,3763,1917,2790
Portugal,1857,2469,1699,2476,1861,2837,2015,3456


## crosstab equivalent 

In [19]:
xt1 = pd.crosstab(index=df.countries, columns=df.regions,
            values=df.values_1, aggfunc=np.sum)
xt2 = pd.crosstab(index=df.countries, columns=df.regions,
            values=df.values_2, aggfunc=np.sum)
head(pd.concat([xt1, xt2], axis=1))

8 rows, 8 columns


regions,North,East,West,South,North,East,West,South
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
France,2275,2170,4861,2118,2926,3029,4065,1726
Spain,2447,2450,1752,3322,2457,2302,1447,2665
Italy,1868,3023,2489,2520,1926,3763,1917,2790
Portugal,1857,2469,1699,2476,1861,2837,2015,3456


## reordered categoricals

In [20]:
reordered_countries = ['Germany', 'Switzerland', 'France', 'Spain',
                       'Italy', 'Portugal', 'Norway', 'Sweden']

df.countries = df.countries.cat.reorder_categories(reordered_countries)

categories = ['North', 'South', 'East', 'West']
df.regions = df.regions.cat.reorder_categories(categories)

In [21]:
xt1 = pd.crosstab(index=df.countries, columns=df.regions,
            values=df.values_1, aggfunc=np.sum)
xt2 = pd.crosstab(index=df.countries, columns=df.regions,
            values=df.values_2, aggfunc=np.sum)
head(pd.concat([xt1, xt2], axis=1), 10)

8 rows, 8 columns


regions,North,South,East,West,North,South,East,West
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Germany,2239,1753,1764,1575,1736,2028,1362,1321
Switzerland,2723,2498,1626,1920,2356,2362,1770,2307
France,2275,2118,2170,4861,2926,1726,3029,4065
Spain,2447,3322,2450,1752,2457,2665,2302,1447
Italy,1868,2520,3023,2489,1926,2790,3763,1917
Portugal,1857,2476,2469,1699,1861,3456,2837,2015
Norway,2633,1670,3741,1234,3282,1237,4352,857
Sweden,1535,1435,2195,3267,1417,1605,2934,3055
