In [1]:
import os
import re
import pandas as pd
import numpy as np

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel(r'../data/data.xlsx',converters={'id':str})  # Normally for numeric-like ID field, read in as string
print('id data type:', df.id.dtypes)
df

id data type: object


Unnamed: 0,id,color,quality,sales_dt,unit,price
0,1,blue,good,2018-09-01,10.0,30.0
1,2,green,good,2018-01-27,5.0,20.0
2,3,red,,NaT,,
3,4,red,bad,2018-01-27,2.0,5.0
4,5,red,good,2018-09-01,10.0,20.0
5,6,blue,bad,2018-05-06,6.0,10.0
6,7,green,good,2018-05-07,8.0,30.0
7,8,blue,,NaT,,
8,9,blue,bad,2018-01-27,4.0,10.0
9,10,green,good,2018-09-01,10.0,20.0


### Before aggregation
- check if your data has NAs and understand how the aggregation functions treat NAs

In [3]:
df.quality.count()   # Default pd.Series.count() will NOT count NA rows

8

In [4]:
df.price.nunique()   # Default pd.Series.nunique() will NOT count NA rows

4

In [5]:
print(df.price.sum())        # Default pd.Series.sum() will treat NAs as 0s
print(np.sum(df.price))      # same for np.sum()
print(np.nansum(df.price))   # I prefer np.nansum to explicitly ignore NAs

145.0
145.0
145.0


### Aggregation

In [23]:
summary = df.groupby('color',as_index=False).agg({
    'id':'count',                                                          # how many records 
    'quality':'count',                                                     # how many records where quality is not null
    'sales_dt':['min','max'],                                              # first and last sales date
    
    'unit' :[np.nansum,                                                     # Total Unit - treat NA as 0
             lambda x: df.loc[df.quality=='good','unit'][x.index].sum()],   # Units where quality=='good'
    
    'price':[pd.Series.nunique,         # unique prices ignoring NAs
             lambda x: (x>=15).sum()]   # count of prices >= 15
}).rename(columns={'sales_dt':'hehe'})

summary

Unnamed: 0_level_0,color,id,quality,hehe,hehe,unit,unit,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,count,count,min,max,nansum,<lambda>,nunique,<lambda>
0,blue,4,3,2018-01-27,2018-09-01,20.0,10.0,2.0,1.0
1,green,3,3,2018-01-27,2018-09-01,23.0,23.0,2.0,3.0
2,red,3,2,2018-01-27,2018-09-01,12.0,10.0,2.0,1.0


### How to rename the columns

In [21]:
summary.columns.values

array([('color', ''), ('id', 'count'), ('quality', 'count'),
       ('sales_dt', 'min'), ('sales_dt', 'max'), ('unit', 'nansum'),
       ('unit', '<lambda>'), ('price', 'nunique'), ('price', '<lambda>')],
      dtype=object)

In [24]:
summary.columns = [level1+'_'+level2 for (level1, level2) in summary.columns.values]
summary

Unnamed: 0,color_,id_count,quality_count,hehe_min,hehe_max,unit_nansum,unit_<lambda>,price_nunique,price_<lambda>
0,blue,4,3,2018-01-27,2018-09-01,20.0,10.0,2.0,1.0
1,green,3,3,2018-01-27,2018-09-01,23.0,23.0,2.0,3.0
2,red,3,2,2018-01-27,2018-09-01,12.0,10.0,2.0,1.0


### Rename in aggregation

### Mapping

In [30]:
df.groupby('color').map(df.groupby('color')['unit'].sum())

AttributeError: 'DataFrameGroupBy' object has no attribute 'map'