## Groupping and Aggregation

In [1]:
# import
import pandas as pd

In [2]:
# dataset
walt = pd.read_csv('../datasets/WALMART_SALES_DATA.csv')

In [3]:
walt

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


#### Aggregation functions

In [4]:
walt['Weekly_Sales'].describe()

count    6.435000e+03
mean     1.046965e+06
std      5.643666e+05
min      2.099862e+05
25%      5.533501e+05
50%      9.607460e+05
75%      1.420159e+06
max      3.818686e+06
Name: Weekly_Sales, dtype: float64

In [5]:
# aggregate weekly sales
# sum
print(walt['Weekly_Sales'].sum())

# max
print(walt['Weekly_Sales'].max())

# mean 
print(walt['Weekly_Sales'].mean())

6737218987.11
3818686.45
1046964.8775617732


In [6]:
# holiday_flag - count holiday
print(walt['Holiday_Flag'].sum())

450


In [7]:
# holiday_flag - count
print(walt['Date'].count())
print(walt['Date'].nunique())

6435
143


#### Groupping

In [8]:
# base case: how to do groupping
sum_byWk = walt.groupby('Date').Weekly_Sales.sum().reset_index()
sum_byWk.head()

Unnamed: 0,Date,Weekly_Sales
0,01-04-2011,43458991.19
1,01-06-2012,48281649.72
2,01-07-2011,47578519.5
3,01-10-2010,42239875.87
4,02-03-2012,46861034.97


In [9]:
# base case: how to do groupping
sum_byWk = walt.groupby('Date').Weekly_Sales.mean().reset_index()
sum_byWk.head()

Unnamed: 0,Date,Weekly_Sales
0,01-04-2011,965755.4
1,01-06-2012,1072926.0
2,01-07-2011,1057300.0
3,01-10-2010,938663.9
4,02-03-2012,1041356.0


In [10]:
import numpy as np
# two aggregation functions
sum_byWk = walt.groupby('Date').Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk.head()

Unnamed: 0,Date,sum,mean,max,min
0,01-04-2011,43458991.19,965755.4,1927993.09,232769.09
1,01-06-2012,48281649.72,1072926.0,2179360.94,261131.09
2,01-07-2011,47578519.5,1057300.0,2074668.19,226702.36
3,01-10-2010,42239875.87,938663.9,1933719.21,224294.39
4,02-03-2012,46861034.97,1041356.0,2206319.9,248051.53


In [11]:
# base case: how to do groupping
sum_byWk = walt.groupby(['Holiday_Flag', 'Date']).Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk

Unnamed: 0,Holiday_Flag,Date,sum,mean,max,min
0,0,01-04-2011,43458991.19,9.657554e+05,1927993.09,232769.09
1,0,01-06-2012,48281649.72,1.072926e+06,2179360.94,261131.09
2,0,01-07-2011,47578519.50,1.057300e+06,2074668.19,226702.36
3,0,01-10-2010,42239875.87,9.386639e+05,1933719.21,224294.39
4,0,02-03-2012,46861034.97,1.041356e+06,2206319.90,248051.53
...,...,...,...,...,...,...
138,1,12-02-2010,48336677.63,1.074148e+06,2188307.39,286857.13
139,1,25-11-2011,66593605.26,1.479858e+06,3004702.33,255996.47
140,1,26-11-2010,65821003.24,1.462689e+06,2939946.38,240044.57
141,1,30-12-2011,46042461.04,1.023166e+06,2043245.00,215359.21


In [12]:
# base case: how to do groupping
sum_byWk = walt.groupby('Holiday_Flag').Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk

Unnamed: 0,Holiday_Flag,sum,mean,max,min
0,0,6231919000.0,1041256.0,3818686.45,209986.25
1,1,505299600.0,1122888.0,3004702.33,215359.21


In [13]:
# which store has the highest total sales
sum_byWk = walt.groupby('Store').Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk.sort_values(by='sum').head()

Unnamed: 0,Store,sum,mean,max,min
32,33,37160221.96,259861.692028,331173.51,209986.25
43,44,43293087.84,302748.866014,376233.89,241937.11
4,5,45475688.9,318011.81049,507900.07,260636.71
35,36,53412214.97,373511.992797,489372.02,270677.98
37,38,55159626.42,385731.653287,499267.66,303908.81


In [14]:
# which store has the highest average sales
sum_byWk = walt.groupby('Store').Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk.sort_values(by='max', ascending=False).head()

Unnamed: 0,Store,sum,mean,max,min
13,14,288999900.0,2020978.0,3818686.45,1479514.66
19,20,301397800.0,2107677.0,3766687.43,1761016.51
9,10,271617700.0,1899425.0,3749057.69,1627707.31
3,4,299544000.0,2094713.0,3676388.98,1762539.3
12,13,286517700.0,2003620.0,3595903.2,1633663.12


In [15]:
# which store has the highest average sales
sum_byWk = walt.groupby('Store').Weekly_Sales.agg([sum, np.mean, max, min]).reset_index()
sum_byWk.sort_values(by='max', ascending=False).head()

Unnamed: 0,Store,sum,mean,max,min
13,14,288999900.0,2020978.0,3818686.45,1479514.66
19,20,301397800.0,2107677.0,3766687.43,1761016.51
9,10,271617700.0,1899425.0,3749057.69,1627707.31
3,4,299544000.0,2094713.0,3676388.98,1762539.3
12,13,286517700.0,2003620.0,3595903.2,1633663.12


#### Example Usage: Rolling Average

In [16]:
#walt['Weekly_Sales'].rolling(3).mean()
R3M = walt.groupby('Store').rolling(3, min_periods=0).Weekly_Sales.mean().reset_index(drop=True)

In [17]:
walt['R3M'] = R3M
walt['R3M'].head()

0    1.643691e+06
1    1.642824e+06
2    1.632539e+06
3    1.554551e+06
4    1.525501e+06
Name: R3M, dtype: float64

In [18]:
walt[walt['Store']==45].head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,R3M
6292,45,05-02-2010,890689.51,0,27.31,2.784,181.87119,8.992,890689.51
6293,45,12-02-2010,656988.64,1,27.73,2.773,181.982317,8.992,773839.075
6294,45,19-02-2010,841264.04,0,31.27,2.745,182.034782,8.992,796314.063333
6295,45,26-02-2010,741891.65,0,34.89,2.754,182.077469,8.992,746714.776667
6296,45,05-03-2010,777951.22,0,37.13,2.777,182.120157,8.992,787035.636667


In [19]:
walt.isnull().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
R3M             0
dtype: int64

In [20]:
walt['R3M'] = walt['R3M'].fillna(0)
walt.isnull().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
R3M             0
dtype: int64