# DataGrouping

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

%matplotlib inline

## Introduction to grouping

In [2]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


In [3]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88475 entries, 0 to 88474
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Retailer country   88475 non-null  object 
 1   Order method type  88475 non-null  object 
 2   Retailer type      88475 non-null  object 
 3   Product line       88475 non-null  object 
 4   Product type       88475 non-null  object 
 5   Product            88475 non-null  object 
 6   Year               88475 non-null  int64  
 7   Quarter            88475 non-null  object 
 8   Revenue            88475 non-null  float64
 9   Quantity           88475 non-null  int64  
 10  Gross margin       87894 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 7.4+ MB


In [4]:
products.describe()

Unnamed: 0,Year,Revenue,Quantity,Gross margin
count,88475.0,88475.0,88475.0,87894.0
mean,2012.855281,42638.29,780.586166,0.449718
std,0.778342,65784.02,1541.645422,0.123642
min,2012.0,0.0,1.0,-12.853678
25%,2012.0,8184.36,131.0,0.36988
50%,2013.0,21026.28,333.0,0.450634
75%,2013.0,50390.6,816.0,0.52013
max,2014.0,1635688.0,67875.0,0.770476


In [5]:
products['Retailer country'].value_counts()

United States     7482
Canada            5923
France            5779
Germany           5397
Japan             5359
United Kingdom    5102
Netherlands       4199
Switzerland       4103
Italy             4018
Austria           3862
Mexico            3845
Belgium           3710
Australia         3665
China             3652
Spain             3557
Singapore         3443
Finland           3409
Korea             3399
Brazil            3288
Sweden            2925
Denmark           2358
Name: Retailer country, dtype: int64

In [6]:
products['Retailer country'].nunique()

21

In [7]:
countries = products['Retailer country'].unique()
countries

array(['United States', 'Canada', 'Mexico', 'Brazil', 'Japan',
       'Singapore', 'Korea', 'China', 'Australia', 'Netherlands',
       'Sweden', 'Denmark', 'Finland', 'France', 'Germany', 'Switzerland',
       'United Kingdom', 'Belgium', 'Austria', 'Italy', 'Spain'],
      dtype=object)

In [8]:
my_own_groups = dict()
for country in countries:
    my_own_groups[country] = products[products['Retailer country'] == country]

my_own_groups

{'United States':       Retailer country Order method type  Retailer type          Product line  \
 0        United States               Fax  Outdoors Shop     Camping Equipment   
 1        United States               Fax  Outdoors Shop     Camping Equipment   
 2        United States               Fax  Outdoors Shop     Camping Equipment   
 3        United States               Fax  Outdoors Shop     Camping Equipment   
 4        United States               Fax  Outdoors Shop     Camping Equipment   
 ...                ...               ...            ...                   ...   
 84049    United States       Sales visit  Eyewear Store  Personal Accessories   
 84050    United States       Sales visit  Eyewear Store  Personal Accessories   
 84051    United States       Sales visit  Eyewear Store  Personal Accessories   
 84052    United States       Sales visit  Eyewear Store  Personal Accessories   
 84053    United States       Sales visit  Eyewear Store  Personal Accessories   

In [11]:
my_own_groups['Belgium'].head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
7270,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2324.46,57,0.509564
7271,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Deluxe,2012,Q1 2012,5267.96,68,0.496579
7272,Belgium,Web,Golf Shop,Personal Accessories,Watches,Infinity,2012,Q1 2012,22483.0,96,0.458979
7273,Belgium,Web,Golf Shop,Personal Accessories,Watches,TX,2012,Q1 2012,16100.0,87,0.448898
7274,Belgium,Web,Golf Shop,Personal Accessories,Eyewear,Polar Sun,2012,Q1 2012,2682.52,44,0.571075


In [12]:
my_own_groups['Mexico'].describe()

Unnamed: 0,Year,Revenue,Quantity,Gross margin
count,3845.0,3845.0,3845.0,3818.0
mean,2012.847594,36246.971397,662.449155,0.44884
std,0.769929,49260.175492,1149.019516,0.110507
min,2012.0,0.0,1.0,-0.17918
25%,2012.0,8203.86,131.0,0.368511
50%,2013.0,19364.0,327.0,0.44955
75%,2013.0,43050.24,749.0,0.517297
max,2014.0,464070.9,19214.0,0.770476


## The groupby() method
args:
* `by=None`

In [3]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


In [4]:
products.groupby('Retailer country')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020E531CB940>

In [5]:
groups = products.groupby('Retailer country')
type(groups)

pandas.core.groupby.generic.DataFrameGroupBy

In [7]:
products['Retailer country'].nunique(), len(groups), 

(21, 21)

In [29]:
groups.size()  # similar to value_counts()

Retailer country
Australia         3665
Austria           3862
Belgium           3710
Brazil            3288
Canada            5923
China             3652
Denmark           2358
Finland           3409
France            5779
Germany           5397
Italy             4018
Japan             5359
Korea             3399
Mexico            3845
Netherlands       4199
Singapore         3443
Spain             3557
Sweden            2925
Switzerland       4103
United Kingdom    5102
United States     7482
dtype: int64

In [10]:
groups.first()  # get first row from each group

Unnamed: 0_level_0,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia,Telephone,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2161.34,53,0.509564
Austria,Telephone,Golf Shop,Personal Accessories,Watches,Mountain Man Analog,2012,Q1 2012,6562.3,137,0.373695
Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2324.46,57,0.509564
Brazil,Web,Golf Shop,Personal Accessories,Watches,Venue,2012,Q1 2012,15987.0,219,0.419206
Canada,Fax,Outdoors Shop,Personal Accessories,Watches,Venue,2012,Q1 2012,12045.0,165,0.41863
China,Fax,Golf Shop,Golf Equipment,Irons,Hailstorm Steel Irons,2012,Q1 2012,69057.8,214,0.315153
Denmark,Web,Golf Shop,Personal Accessories,Watches,Venue,2012,Q1 2012,2482.0,34,0.41863
Finland,Web,Department Store,Camping Equipment,Cooking Gear,TrailChef Water Bag,2012,Q1 2012,16007.34,2586,0.526656
France,Fax,Golf Shop,Golf Equipment,Irons,Lady Hailstorm Steel Irons,2012,Q1 2012,19530.81,39,0.445356
Germany,Mail,Warehouse Store,Camping Equipment,Cooking Gear,TrailChef Cook Set,2012,Q1 2012,21197.46,402,0.33681


In [12]:
products[products['Retailer country'] == 'Australia'].head(1)

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
3735,Australia,Telephone,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2161.34,53,0.509564


In [13]:
products[products['Retailer country'] == 'Canada'].head(1)

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
864,Canada,Fax,Outdoors Shop,Personal Accessories,Watches,Venue,2012,Q1 2012,12045.0,165,0.41863


In [16]:
groups.last().head()

Unnamed: 0_level_0,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia,Sales visit,Department Store,Golf Equipment,Putters,Blue Steel Putter,2014,Q3 2014,14248.39,169,0.511327
Austria,Web,Sports Store,Golf Equipment,Golf Accessories,Course Pro Gloves,2014,Q3 2014,11759.84,2194,0.524254
Belgium,Web,Sports Store,Personal Accessories,Navigation,Astro Pilot,2014,Q3 2014,33294.0,93,0.350754
Brazil,Web,Sports Store,Golf Equipment,Golf Accessories,Course Pro Gloves,2014,Q3 2014,8066.8,1505,0.524254
Canada,Web,Sports Store,Golf Equipment,Golf Accessories,Course Pro Gloves,2014,Q3 2014,13753.76,2566,0.524254


In [21]:
groups.groups

{'Australia': [3735, 3736, 3737, 3738, 3739, 3740, 3741, 3742, 3743, 3744, 3745, 3746, 3747, 3748, 3749, 3750, 3751, 3752, 3753, 3754, 3755, 3756, 3757, 3758, 3759, 3760, 3761, 3762, 3763, 3764, 3765, 3766, 3767, 3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 3776, 3777, 3778, 3779, 3780, 3781, 3782, 3783, 3784, 3785, 3786, 3787, 3788, 3789, 3790, 3791, 3792, 3793, 3794, 3795, 3796, 3797, 3798, 3799, 3800, 3801, 3802, 3803, 3804, 3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833, 3834, ...], 'Austria': [7588, 7589, 7590, 7591, 7592, 7593, 7594, 7595, 7596, 7597, 7598, 7599, 7600, 7601, 7602, 7603, 7604, 7605, 7606, 7607, 7608, 7609, 7610, 7611, 7612, 7613, 7614, 7615, 7616, 7617, 7618, 7619, 7620, 7621, 7622, 7623, 7624, 7625, 7626, 7627, 7628, 7629, 7630, 7631, 7632, 7633, 7634, 7635, 7636, 7637, 7638, 7639, 7640, 7641, 7642, 7643, 7644, 7645, 7646, 7647, 7648, 7

In [22]:
type(groups.groups)

pandas.io.formats.printing.PrettyDict

In [23]:
groups.groups['Belgium']

Int64Index([ 7270,  7271,  7272,  7273,  7274,  7275,  7276,  7277,  7278,
             7279,
            ...
            87859, 87860, 87861, 87862, 87863, 87864, 87865, 87866, 87867,
            87868],
           dtype='int64', length=3710)

In [24]:
type(groups.groups['Belgium'])

pandas.core.indexes.numeric.Int64Index

In [26]:
products.loc[groups.groups['Belgium']].head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
7270,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2324.46,57,0.509564
7271,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Deluxe,2012,Q1 2012,5267.96,68,0.496579
7272,Belgium,Web,Golf Shop,Personal Accessories,Watches,Infinity,2012,Q1 2012,22483.0,96,0.458979
7273,Belgium,Web,Golf Shop,Personal Accessories,Watches,TX,2012,Q1 2012,16100.0,87,0.448898
7274,Belgium,Web,Golf Shop,Personal Accessories,Eyewear,Polar Sun,2012,Q1 2012,2682.52,44,0.571075


In [27]:
groups.get_group('Belgium').head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
7270,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2324.46,57,0.509564
7271,Belgium,Web,Golf Shop,Personal Accessories,Watches,Mountain Man Deluxe,2012,Q1 2012,5267.96,68,0.496579
7272,Belgium,Web,Golf Shop,Personal Accessories,Watches,Infinity,2012,Q1 2012,22483.0,96,0.458979
7273,Belgium,Web,Golf Shop,Personal Accessories,Watches,TX,2012,Q1 2012,16100.0,87,0.448898
7274,Belgium,Web,Golf Shop,Personal Accessories,Eyewear,Polar Sun,2012,Q1 2012,2682.52,44,0.571075


## Aggregations

In [30]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


### Standard aggregation methods
returns Data Frame with values for each column that values could be counted<br>
* `count()` counts not NaN values in each column
* `min()` and `max()` all columns
* `mean()` counts mean for numerical columns only

In [31]:
groups = products.groupby(by='Retailer country')
groups.mean().head()

Unnamed: 0_level_0,Year,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,2012.859482,29822.638237,545.915689,0.451174
Austria,2012.875971,29354.380023,544.330917,0.450913
Belgium,2012.878976,26134.38542,484.911321,0.44841
Brazil,2012.878954,33170.824647,595.332117,0.448023
Canada,2012.839946,41682.874366,746.098936,0.451011


In [32]:
groups.sum().head()

Unnamed: 0_level_0,Year,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,7377130,109300000.0,2000781,1641.822415
Austria,7773727,113366600.0,2102206,1723.839177
Belgium,7467781,96958570.0,1799021,1653.288385
Brazil,6618346,109065700.0,1957452,1466.380765
Canada,11922051,246887700.0,4419144,2662.767825


In [33]:
groups.count().head()

Unnamed: 0_level_0,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia,3665,3665,3665,3665,3665,3665,3665,3665,3665,3639
Austria,3862,3862,3862,3862,3862,3862,3862,3862,3862,3823
Belgium,3710,3710,3710,3710,3710,3710,3710,3710,3710,3687
Brazil,3288,3288,3288,3288,3288,3288,3288,3288,3288,3273
Canada,5923,5923,5923,5923,5923,5923,5923,5923,5923,5904


In [34]:
groups.min().head()

Unnamed: 0_level_0,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia,E-mail,Department Store,Camping Equipment,Binoculars,Aloe Relief,2012,Q1 2012,0.0,1,-0.389517
Austria,Sales visit,Department Store,Camping Equipment,Binoculars,Aloe Relief,2012,Q1 2012,0.0,4,-0.086616
Belgium,Fax,Department Store,Camping Equipment,Binoculars,Aloe Relief,2012,Q1 2012,0.0,5,-1.476136
Brazil,E-mail,Department Store,Camping Equipment,Binoculars,Aloe Relief,2012,Q1 2012,0.0,4,-2.085496
Canada,E-mail,Department Store,Camping Equipment,Binoculars,Aloe Relief,2012,Q1 2012,0.0,1,-1.373907


In [35]:
groups.max().head()

Unnamed: 0_level_0,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia,Web,Warehouse Store,Personal Accessories,Woods,Zone,2014,Q4 2013,516135.14,20031,0.770476
Austria,Web,Warehouse Store,Personal Accessories,Woods,Zone,2014,Q4 2013,321202.98,9662,0.770476
Belgium,Web,Warehouse Store,Personal Accessories,Woods,Zone,2014,Q4 2013,310499.39,13200,0.770476
Brazil,Web,Warehouse Store,Personal Accessories,Woods,Zone,2014,Q4 2013,352749.39,12157,0.770476
Canada,Web,Warehouse Store,Personal Accessories,Woods,Zone,2014,Q4 2013,648467.6,27268,0.770476


In [41]:
products.loc[
    (products['Retailer country'] == 'Australia') &
    (products['Order method type'] == 'Web') &
    (products['Retailer type'] == 'Warehouse Store') &
    (products['Product line'] == 'Personal Accessories') &
    (products['Product type'] == 'Woods')
].head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin


In [42]:
groups['Revenue'].sum().head()

Retailer country
Australia    1.093000e+08
Austria      1.133666e+08
Belgium      9.695857e+07
Brazil       1.090657e+08
Canada       2.468877e+08
Name: Revenue, dtype: float64

In [43]:
groups[['Revenue', 'Quantity']].mean().head()

Unnamed: 0_level_0,Revenue,Quantity
Retailer country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,29822.638237,545.915689
Austria,29354.380023,544.330917
Belgium,26134.38542,484.911321
Brazil,33170.824647,595.332117
Canada,41682.874366,746.098936


## Grouping and Multiindex

In [44]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


In [46]:
groups = products.groupby(by=['Retailer country', 'Year'])
groups.size()  # how much transaction by a country and a year

Retailer country  Year
Australia         2012    1402
                  2013    1376
                  2014     887
Austria           2012    1436
                  2013    1469
                          ... 
United Kingdom    2013    1845
                  2014    1258
United States     2012    3309
                  2013    2506
                  2014    1667
Length: 63, dtype: int64

In [49]:
# we have 3 times more groups beacause for each country we have 3 rows by each year
groups.sum().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Quantity,Gross margin
Retailer country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,2012,25607043.5,522186,625.835424
Australia,2013,47799737.14,839750,625.65069
Australia,2014,35893188.5,638845,390.336301
Austria,2012,34568089.63,702134,636.115492
Austria,2013,44996737.04,803771,665.086198
Austria,2014,33801788.98,596301,422.637486
Belgium,2012,30396297.49,635694,592.01729
Belgium,2013,38375432.68,662157,667.302813
Belgium,2014,28186839.74,501170,393.968282
Brazil,2012,34234876.3,688643,540.191035


In [54]:
groups.get_group(('Australia', 2012)).head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
3735,Australia,Telephone,Golf Shop,Personal Accessories,Watches,Mountain Man Digital,2012,Q1 2012,2161.34,53,0.509564
3736,Australia,Telephone,Golf Shop,Personal Accessories,Binoculars,Seeker Mini,2012,Q1 2012,5689.6,70,0.507874
3737,Australia,Telephone,Golf Shop,Personal Accessories,Navigation,Glacier GPS,2012,Q1 2012,17337.34,158,0.284152
3738,Australia,Telephone,Golf Shop,Golf Equipment,Irons,Hailstorm Steel Irons,2012,Q1 2012,96118.5,255,0.413692
3739,Australia,Telephone,Golf Shop,Golf Equipment,Irons,Hailstorm Titanium Irons,2012,Q1 2012,40149.72,46,0.437513


## agg() method

In [55]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


In [56]:
groups = products.groupby(by=['Retailer country', 'Year'])
groups['Revenue'].sum().head()

Retailer country  Year
Australia         2012    25607043.50
                  2013    47799737.14
                  2014    35893188.50
Austria           2012    34568089.63
                  2013    44996737.04
Name: Revenue, dtype: float64

In [57]:
groups[['Quantity', 'Revenue']].sum().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Revenue
Retailer country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,2012,522186,25607043.5
Australia,2013,839750,47799737.14
Australia,2014,638845,35893188.5
Austria,2012,702134,34568089.63
Austria,2013,803771,44996737.04


In [58]:
groups['Gross margin'].mean().head()

Retailer country  Year
Australia         2012    0.447985
                  2013    0.457347
                  2014    0.446609
Austria           2012    0.446084
                  2013    0.456790
Name: Gross margin, dtype: float64

In [59]:
groups.Revenue.sum().head()

Retailer country  Year
Australia         2012    25607043.50
                  2013    47799737.14
                  2014    35893188.50
Austria           2012    34568089.63
                  2013    44996737.04
Name: Revenue, dtype: float64

In [60]:
groups.Quantity.sum().head()

Retailer country  Year
Australia         2012    522186
                  2013    839750
                  2014    638845
Austria           2012    702134
                  2013    803771
Name: Quantity, dtype: int64

In [62]:
groups.agg(dict(Revenue=sum)).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
Retailer country,Year,Unnamed: 2_level_1
Australia,2012,25607043.5
Australia,2013,47799737.14
Australia,2014,35893188.5
Austria,2012,34568089.63
Austria,2013,44996737.04


In [65]:
groups.agg({'Gross margin': 'mean'}).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Gross margin
Retailer country,Year,Unnamed: 2_level_1
Australia,2012,0.447985
Australia,2013,0.457347
Australia,2014,0.446609
Austria,2012,0.446084
Austria,2013,0.45679


In [67]:
groups.agg(
    {
        "Revenue": sum,
        'Quantity': 'sum',
        'Gross margin': 'mean'
    }
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Quantity,Gross margin
Retailer country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,2012,25607043.5,522186,0.447985
Australia,2013,47799737.14,839750,0.457347
Australia,2014,35893188.5,638845,0.446609
Austria,2012,34568089.63,702134,0.446084
Austria,2013,44996737.04,803771,0.45679


In [68]:
groups.agg(
    {
        "Revenue": [sum, min, max],
        'Quantity': 'sum',
        'Gross margin': 'mean'
    }
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Revenue,Revenue,Quantity,Gross margin
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,sum,mean
Retailer country,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Australia,2012,25607043.5,0.0,254026.08,522186,0.447985
Australia,2013,47799737.14,0.0,443746.6,839750,0.457347
Australia,2014,35893188.5,0.0,516135.14,638845,0.446609
Austria,2012,34568089.63,0.0,199715.4,702134,0.446084
Austria,2013,44996737.04,0.0,317161.24,803771,0.45679


In [71]:
groups.agg(
    {
        "Revenue": ['sum'],
        'Quantity': 'sum',
        'Gross margin': 'mean'
    }
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Quantity,Gross margin
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,sum,mean
Retailer country,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Australia,2012,25607043.5,522186,0.447985
Australia,2013,47799737.14,839750,0.457347
Australia,2014,35893188.5,638845,0.446609
Austria,2012,34568089.63,702134,0.446084
Austria,2013,44996737.04,803771,0.45679


## Data processing from an group object

In [72]:
products = pd.read_csv('./course-files/course-sources/WA_Sales_Products_2012-14.csv')
products.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
0,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Deluxe Cook Set,2012,Q1 2012,59628.66,489,0.347548
1,United States,Fax,Outdoors Shop,Camping Equipment,Cooking Gear,TrailChef Double Flame,2012,Q1 2012,35950.32,252,0.474274
2,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Dome,2012,Q1 2012,89940.48,147,0.352772
3,United States,Fax,Outdoors Shop,Camping Equipment,Tents,Star Gazer 2,2012,Q1 2012,165883.41,303,0.282938
4,United States,Fax,Outdoors Shop,Camping Equipment,Sleeping Bags,Hibernator Lite,2012,Q1 2012,119822.2,1415,0.29145


In [78]:
groups = products.groupby(by='Retailer country')

### `Items` in Groups

In [117]:
abbc = dict(a=1,b=2,c=3)
print(abbc.keys())
print(abbc.values())
print(abbc.items())

for el in abbc:
    print(f'{el = }\t{abbc[el] = }')

# by 2 variables like in Groovy, it doesn't work :(
# for ke, va in abbc:
#     print(ke, va)

dict_keys(['a', 'b', 'c'])
dict_values([1, 2, 3])
dict_items([('a', 1), ('b', 2), ('c', 3)])
el = 'a'	abbc[el] = 1
el = 'b'	abbc[el] = 2
el = 'c'	abbc[el] = 3


Each element in a group is an element of a dict.<br>The element contains a tuple with 2 value:
* a key
* the value of a key.

#### `Keys` in groups

In [108]:
for country in groups:
    print(country[0], type(country), len(country), sep='\t')

Australia	<class 'tuple'>	2
Austria	<class 'tuple'>	2
Belgium	<class 'tuple'>	2
Brazil	<class 'tuple'>	2
Canada	<class 'tuple'>	2
China	<class 'tuple'>	2
Denmark	<class 'tuple'>	2
Finland	<class 'tuple'>	2
France	<class 'tuple'>	2
Germany	<class 'tuple'>	2
Italy	<class 'tuple'>	2
Japan	<class 'tuple'>	2
Korea	<class 'tuple'>	2
Mexico	<class 'tuple'>	2
Netherlands	<class 'tuple'>	2
Singapore	<class 'tuple'>	2
Spain	<class 'tuple'>	2
Sweden	<class 'tuple'>	2
Switzerland	<class 'tuple'>	2
United Kingdom	<class 'tuple'>	2
United States	<class 'tuple'>	2


In [109]:
country_names = list()
for country in groups:
    country_names.append(country[0])
country_names

['Australia',
 'Austria',
 'Belgium',
 'Brazil',
 'Canada',
 'China',
 'Denmark',
 'Finland',
 'France',
 'Germany',
 'Italy',
 'Japan',
 'Korea',
 'Mexico',
 'Netherlands',
 'Singapore',
 'Spain',
 'Sweden',
 'Switzerland',
 'United Kingdom',
 'United States']

#### `Values` in groups

In [110]:
for country in groups:
    print(country[1])

      Retailer country Order method type     Retailer type  \
3735         Australia         Telephone         Golf Shop   
3736         Australia         Telephone         Golf Shop   
3737         Australia         Telephone         Golf Shop   
3738         Australia         Telephone         Golf Shop   
3739         Australia         Telephone         Golf Shop   
...                ...               ...               ...   
85862        Australia       Sales visit  Department Store   
85863        Australia       Sales visit  Department Store   
85864        Australia       Sales visit  Department Store   
85865        Australia       Sales visit  Department Store   
85866        Australia       Sales visit  Department Store   

               Product line Product type                   Product  Year  \
3735   Personal Accessories      Watches      Mountain Man Digital  2012   
3736   Personal Accessories   Binoculars               Seeker Mini  2012   
3737   Personal Accessories

      Retailer country Order method type  Retailer type          Product line  \
2750         Singapore               Fax  Outdoors Shop     Camping Equipment   
2751         Singapore               Fax  Outdoors Shop     Camping Equipment   
2752         Singapore               Fax  Outdoors Shop     Camping Equipment   
2753         Singapore               Fax  Outdoors Shop     Camping Equipment   
2754         Singapore               Fax  Outdoors Shop     Camping Equipment   
...                ...               ...            ...                   ...   
85255        Singapore               Web  Outdoors Shop  Personal Accessories   
85256        Singapore               Web  Outdoors Shop  Personal Accessories   
85257        Singapore               Web  Outdoors Shop  Personal Accessories   
85258        Singapore               Web  Outdoors Shop  Personal Accessories   
85259        Singapore               Web  Outdoors Shop  Personal Accessories   

        Product type       

In [111]:
for country in groups:
    print(country[0], len(country[1]))

Australia 3665
Austria 3862
Belgium 3710
Brazil 3288
Canada 5923
China 3652
Denmark 2358
Finland 3409
France 5779
Germany 5397
Italy 4018
Japan 5359
Korea 3399
Mexico 3845
Netherlands 4199
Singapore 3443
Spain 3557
Sweden 2925
Switzerland 4103
United Kingdom 5102
United States 7482


In [123]:
for country, country_data in groups:
    print(country, len(country_data))
    print(type(country), type(country_data))

Australia 3665
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Austria 3862
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Belgium 3710
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Brazil 3288
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Canada 5923
<class 'str'> <class 'pandas.core.frame.DataFrame'>
China 3652
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Denmark 2358
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Finland 3409
<class 'str'> <class 'pandas.core.frame.DataFrame'>
France 5779
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Germany 5397
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Italy 4018
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Japan 5359
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Korea 3399
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Mexico 3845
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Netherlands 4199
<class 'str'> <class 'pandas.core.frame.DataFrame'>
Singapore 3443
<class 'str'> <c

In [121]:
for country, country_data in groups:
    print(
        country,
        country_data['Revenue'].max() - country_data.Revenue.min(),
        sep='\t'
    )

Australia	516135.14
Austria	321202.98
Belgium	310499.39
Brazil	352749.39
Canada	648467.6
China	1058301.46
Denmark	313721.1
Finland	770549.32
France	472225.66
Germany	566579.2
Italy	487457.3
Japan	686566.27
Korea	580965.0
Mexico	464070.9
Netherlands	506648.03
Singapore	556619.8
Spain	482286.38
Sweden	295462.2
Switzerland	267702.42
United Kingdom	605546.68
United States	1635687.96


In [122]:
for country, country_data in groups:
    print(
        country,
        country_data['Revenue'].max(),
        country_data['Revenue'].idxmax(),  # do you remember the idxmax() from Series lessons?
        sep='\t'
    )

Australia	516135.14	70573
Austria	321202.98	82615
Belgium	310499.39	49613
Brazil	352749.39	44350
Canada	648467.6	76609
China	1058301.46	70217
Denmark	313721.1	63555
Finland	770549.32	80129
France	472225.66	80530
Germany	566579.2	81240
Italy	487457.3	66622
Japan	686566.27	44802
Korea	580965.0	62036
Mexico	464070.9	68672
Netherlands	506648.03	46489
Singapore	556619.8	61696
Spain	482286.38	75501
Sweden	295462.2	79730
Switzerland	267702.42	48559
United Kingdom	605546.68	15898
United States	1635687.96	67651


In [127]:
for country, country_data in groups:
    print(
        country,
        country_data['Revenue'].max(),
        country_data['Revenue'].idxmax(),
        country_data.loc[country_data['Revenue'].idxmax()],
        '\n'*2,
        sep='\t'
    )

Australia	516135.14	70573	Retailer country                        Australia
Order method type                             Web
Retailer type                           Golf Shop
Product line                       Golf Equipment
Product type                                Woods
Product              Hailstorm Titanium Woods Set
Year                                         2014
Quarter                                   Q1 2014
Revenue                                 516135.14
Quantity                                      439
Gross margin                              0.48694
Name: 70573, dtype: object	


Austria	321202.98	82615	Retailer country                          Austria
Order method type                             Web
Retailer type                           Golf Shop
Product line                       Golf Equipment
Product type                                Woods
Product              Hailstorm Titanium Woods Set
Year                                         2014
Quarter             

### A new data frame and adding to its the biggest rows

In [130]:
the_biggest_revenues = pd.DataFrame()
for country, country_data in groups:
    the_biggest_revenues = the_biggest_revenues.append(  # strange adding: val = val.append(..)
        country_data.loc[country_data['Revenue'].idxmax()]
    )
the_biggest_revenues.head()

Unnamed: 0,Gross margin,Order method type,Product,Product line,Product type,Quantity,Quarter,Retailer country,Retailer type,Revenue,Year
70573,0.48694,Web,Hailstorm Titanium Woods Set,Golf Equipment,Woods,439.0,Q1 2014,Australia,Golf Shop,516135.14,2014.0
82615,0.487383,Web,Hailstorm Titanium Woods Set,Golf Equipment,Woods,266.0,Q2 2014,Austria,Golf Shop,321202.98,2014.0
49613,0.293879,Web,Star Lite,Camping Equipment,Tents,877.0,Q2 2013,Belgium,Sports Store,310499.39,2013.0
44350,0.294116,Web,Star Lite,Camping Equipment,Tents,996.0,Q2 2013,Brazil,Sports Store,352749.39,2013.0
76609,0.290493,Web,Star Gazer 2,Camping Equipment,Tents,1172.0,Q2 2014,Canada,Sports Store,648467.6,2014.0


In [132]:
products.nlargest(1, 'Revenue')  # an upgrade?

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
67651,United States,Web,Sports Store,Golf Equipment,Woods,Hailstorm Titanium Woods Set,2014,Q1 2014,1635687.96,1416,0.486219


In [133]:
# upraded
the_biggest_revenues = pd.DataFrame()
for country, country_data in groups:
    the_biggest_revenues = the_biggest_revenues.append(  # strange adding: val = val.append(..)
        country_data.nlargest(n=1, columns='Revenue')
    )
the_biggest_revenues.head()

Unnamed: 0,Retailer country,Order method type,Retailer type,Product line,Product type,Product,Year,Quarter,Revenue,Quantity,Gross margin
70573,Australia,Web,Golf Shop,Golf Equipment,Woods,Hailstorm Titanium Woods Set,2014,Q1 2014,516135.14,439,0.48694
82615,Austria,Web,Golf Shop,Golf Equipment,Woods,Hailstorm Titanium Woods Set,2014,Q2 2014,321202.98,266,0.487383
49613,Belgium,Web,Sports Store,Camping Equipment,Tents,Star Lite,2013,Q2 2013,310499.39,877,0.293879
44350,Brazil,Web,Sports Store,Camping Equipment,Tents,Star Lite,2013,Q2 2013,352749.39,996,0.294116
76609,Canada,Web,Sports Store,Camping Equipment,Tents,Star Gazer 2,2014,Q2 2014,648467.6,1172,0.290493
