In [1]:
import pandas as pd

In [2]:
# Read the 'transactions' sheet of the grocery workbook and preview the first rows
transactions = pd.read_excel(
    io='grocery_database.xlsx',
    sheet_name='transactions',
)
transactions.head()

Unnamed: 0,customer_id,transaction_date,transaction_id,product_area_id,num_items,sales_cost
0,1,2020-04-10,435657533999,3,7,19.16
1,1,2020-04-10,435657533999,2,5,7.71
2,1,2020-06-02,436189770685,4,4,26.97
3,1,2020-06-02,436189770685,1,2,38.52
4,1,2020-06-10,436265380298,4,4,22.13


In [3]:
# Read the 'product_areas' lookup sheet from the same workbook and preview it
product_areas = pd.read_excel(
    io='grocery_database.xlsx',
    sheet_name='product_areas',
)
product_areas.head()

Unnamed: 0,product_area_id,product_area_name,profit_margin
0,1,Non-Food,0.25
1,2,Vegetables,0.18
2,3,Fruit,0.14
3,4,Dairy,0.19
4,5,Meat,0.11


In [4]:
# Grand total of sales_cost across all transaction rows, rounded to 2 decimals
(
    transactions['sales_cost']
    .sum()
    .round(2)
)

1549727.82

In [5]:
# Attach the product-area attributes (name, profit margin) to every transaction
# so the group-by examples below can group on 'product_area_name'.
#
# This cell reassigns `transactions` from itself, so it must be safe to re-run:
# any lookup columns left behind by a previous run are dropped first. Without
# that, a second run would make pandas suffix the overlapping columns
# (product_area_name_x / product_area_name_y, ...) and the later cells that
# reference 'product_area_name' would fail with a KeyError.
# On the first run the drop is a no-op (errors='ignore'), so the result is the
# same as a plain inner merge on 'product_area_id'.
transactions = pd.merge(
    transactions.drop(
        columns=product_areas.columns.difference(['product_area_id']),
        errors='ignore',
    ),
    product_areas,
    how='inner',
    on='product_area_id',
)
transactions.head()

Unnamed: 0,customer_id,transaction_date,transaction_id,product_area_id,num_items,sales_cost,product_area_name,profit_margin
0,1,2020-04-10,435657533999,3,7,19.16,Fruit,0.14
1,1,2020-07-20,436667313620,3,8,19.99,Fruit,0.14
2,1,2020-07-22,436686191960,3,20,61.69,Fruit,0.14
3,1,2020-08-12,436899350757,3,16,52.28,Fruit,0.14
4,1,2020-08-13,436908803653,3,14,34.58,Fruit,0.14


In [6]:
# Number of rows per product area taken directly from the column (no groupby);
# value_counts() lists the most frequent area first
(
    transactions['product_area_name']
    .value_counts()
)

Fruit         8699
Vegetables    8473
Non-Food      7784
Dairy         7360
Meat          6190
Name: product_area_name, dtype: int64

In [7]:
# Same per-area counts obtained through groupby: count() tallies the non-null
# sales_cost values in each group, and the result is ordered by group key
(
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .count()
)

product_area_name
Dairy         7360
Fruit         8699
Meat          6190
Non-Food      7784
Vegetables    8473
Name: sales_cost, dtype: int64

In [8]:
# Total sales_cost for each product area
(
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .sum()
)

product_area_name
Dairy         175792.77
Fruit         252033.53
Meat          240892.82
Non-Food      747129.45
Vegetables    133879.25
Name: sales_cost, dtype: float64

In [9]:
# Lower quartile, median and upper quartile of sales_cost within each product area
(
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .quantile(q=[0.25, 0.5, 0.75])
)

product_area_name      
Dairy              0.25      9.5300
                   0.50     17.9150
                   0.75     30.8425
Fruit              0.25     11.1400
                   0.50     21.7100
                   0.75     38.5500
Meat               0.25     15.4500
                   0.50     28.0000
                   0.75     49.8950
Non-Food           0.25     31.9575
                   0.50     63.1550
                   0.75    126.4575
Vegetables         0.25      4.8100
                   0.50     11.6100
                   0.75     21.8500
Name: sales_cost, dtype: float64

In [10]:
# Aggregating a single column after groupby yields a Series indexed by the
# group key; reset_index() moves that key back into a regular column so the
# result is a DataFrame
sales_summary = (
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .sum()
    .reset_index()
)
sales_summary

Unnamed: 0,product_area_name,sales_cost
0,Dairy,175792.77
1,Fruit,252033.53
2,Meat,240892.82
3,Non-Food,747129.45
4,Vegetables,133879.25


In [11]:
# Group on more than one key by passing a list of column names:
# total sales_cost per (transaction_date, product_area_name) pair
sales_summary = (
    transactions
    .groupby(by=['transaction_date', 'product_area_name'])['sales_cost']
    .sum()
    .reset_index()
)
sales_summary

Unnamed: 0,transaction_date,product_area_name,sales_cost
0,2020-04-01,Dairy,1043.14
1,2020-04-01,Fruit,1137.81
2,2020-04-01,Meat,1448.21
3,2020-04-01,Non-Food,4667.76
4,2020-04-01,Vegetables,1306.45
...,...,...,...
910,2020-09-30,Dairy,1064.09
911,2020-09-30,Fruit,1479.63
912,2020-09-30,Meat,2050.98
913,2020-09-30,Non-Food,3825.52


In [12]:
# Group on multiple keys and aggregate multiple value columns at once:
# select the value columns with a list, then apply the same sum to each
sales_summary = (
    transactions
    .groupby(by=['transaction_date', 'product_area_name'])[['num_items', 'sales_cost']]
    .sum()
    .reset_index()
)
sales_summary

Unnamed: 0,transaction_date,product_area_name,num_items,sales_cost
0,2020-04-01,Dairy,203,1043.14
1,2020-04-01,Fruit,361,1137.81
2,2020-04-01,Meat,138,1448.21
3,2020-04-01,Non-Food,232,4667.76
4,2020-04-01,Vegetables,663,1306.45
...,...,...,...,...
910,2020-09-30,Dairy,205,1064.09
911,2020-09-30,Fruit,470,1479.63
912,2020-09-30,Meat,190,2050.98
913,2020-09-30,Non-Food,177,3825.52


In [13]:
# agg() with a list of function names computes several statistics for one
# column; each function becomes its own output column ('sum', 'mean')
sales_summary = (
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .agg(['sum', 'mean'])
    .reset_index()
)
sales_summary

Unnamed: 0,product_area_name,sum,mean
0,Dairy,175792.77,23.884887
1,Fruit,252033.53,28.972701
2,Meat,240892.82,38.916449
3,Non-Food,747129.45,95.982715
4,Vegetables,133879.25,15.80069


In [14]:
# The same list of aggregations applied to several value columns: the result
# has two-level column labels (value column on top, aggregation underneath)
sales_summary = (
    transactions
    .groupby(by=['transaction_date', 'product_area_name'])[['num_items', 'sales_cost']]
    .agg(['sum', 'mean'])
    .reset_index()
)
sales_summary

Unnamed: 0_level_0,transaction_date,product_area_name,num_items,num_items,sales_cost,sales_cost
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,sum,mean
0,2020-04-01,Dairy,203,4.413043,1043.14,22.676957
1,2020-04-01,Fruit,361,8.395349,1137.81,26.460698
2,2020-04-01,Meat,138,4.181818,1448.21,43.885152
3,2020-04-01,Non-Food,232,5.948718,4667.76,119.686154
4,2020-04-01,Vegetables,663,15.785714,1306.45,31.105952
...,...,...,...,...,...,...
910,2020-09-30,Dairy,205,5.000000,1064.09,25.953415
911,2020-09-30,Fruit,470,9.215686,1479.63,29.012353
912,2020-09-30,Meat,190,3.877551,2050.98,41.856735
913,2020-09-30,Non-Food,177,4.425000,3825.52,95.638000


In [15]:
# Pass agg() a dict of {column: aggregation or list of aggregations} to choose
# a different set of statistics for each column
sales_summary = (
    transactions
    .groupby(by='product_area_name')
    .agg({'num_items': 'mean', 'sales_cost': ['sum', 'mean']})
    .reset_index()
)
sales_summary

Unnamed: 0_level_0,product_area_name,num_items,sales_cost,sales_cost
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,mean
0,Dairy,4.606386,175792.77,23.884887
1,Fruit,9.366134,252033.53,28.972701
2,Meat,3.739095,240892.82,38.916449
3,Non-Food,4.418808,747129.45,95.982715
4,Vegetables,7.725481,133879.25,15.80069


In [16]:
# describe() on a grouped column returns the standard summary statistics
# (count, mean, std, min, quartiles, max) for every product area in one table
sales_summary = (
    transactions
    .groupby(by='product_area_name')['sales_cost']
    .describe()
    .reset_index()
)
sales_summary

Unnamed: 0,product_area_name,count,mean,std,min,25%,50%,75%,max
0,Dairy,7360.0,23.884887,20.788729,2.0,9.53,17.915,30.8425,309.25
1,Fruit,8699.0,28.972701,25.327057,1.01,11.14,21.71,38.55,288.35
2,Meat,6190.0,38.916449,34.507572,5.0,15.45,28.0,49.895,392.62
3,Non-Food,7784.0,95.982715,90.828659,10.02,31.9575,63.155,126.4575,669.34
4,Vegetables,8473.0,15.80069,15.773775,0.0,4.81,11.61,21.85,557.73
