In [1]:
import sys
import pandas as pd
import numpy as np
import IPython
from IPython.display import display

print( f"Python {sys.version}" )
print( f"Pandas {pd.__version__}" )
print( f"NumPy {np.__version__}" )
print( f"IPython {IPython.__version__}" )

Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]
Pandas 2.1.4
NumPy 1.26.4
IPython 7.34.0


In [2]:
# Read the pokemon CSV from GitHub into a DataFrame, then print a summary of its columns
df_pokemon = pd.read_csv(
  'https://raw.githubusercontent.com/ShaileshDhama/'
  'Exploratory-Data-Analysis-On-Pokemon-Dataset/master/'
  'Complete%20Pokemon.csv'
)
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non

# 1. Pandas: Aggregation

- Why aggregate?
  - Efficient data summarization
  - It lets us condense the data (this is very useful when the data is too big to fit in one DataFrame)

- For multi-dimensional data, be careful about which axis the aggregation is applied along.

## 1.1 Numeric aggregation

`mean()`, `sum()`, `min()`, `max()`, `median()`, `mode()`, `std()`, `corr()` => These functions are mainly used with numeric data (a few of them, such as `min()`, `max()`, and `mode()`, also work on non-numeric data).

In [3]:
# Prepare the data to play around
# NOTE: this binds `df` to the same object as `df_pokemon` (no copy is made),
# so any in-place change made through `df` would also be visible through `df_pokemon`.
df = df_pokemon

Aggregation on pandas.Series:

In [4]:
# Count the number of NA values in a specific column
df['type2'].isna().sum()     # When summing bool values, True equals 1 and False equals 0

384

In [5]:
# Across all pokemons, report the mean of 'hp', 'attack', and 'defense' (one line per column)
for stat in ('hp', 'attack', 'defense'):
  print( f"Average {stat}: {df.loc[ :, stat ].mean()}" )

Average hp: 68.95880149812734
Average attack: 77.85767790262172
Average defense: 73.00873907615481


Aggregation on pandas.DataFrame:

In [6]:
# Reminder: df.isna() results in pandas.DataFrame of bool
df.isna()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
797,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
798,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
799,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False


In [7]:
# Count the number of NA values for each column
df.isna().sum()     # This equals df.isna().sum(axis=0)

Unnamed: 0,0
abilities,0
against_bug,0
against_dark,0
against_dragon,0
against_electric,0
against_fairy,0
against_fight,0
against_fire,0
against_flying,0
against_ghost,0


In [8]:
# Count the number of NA values for each row
df.isna().sum(axis=1)

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1
...,...
796,1
797,1
798,1
799,2


In [9]:
# Find a mean value for each column (by default NA values are skipped)
df.loc[ : , ['hp','attack','defense'] ].mean()  # This equals df.loc[ : , ['hp','attack','defense'] ].mean(axis=0)

Unnamed: 0,0
hp,68.958801
attack,77.857678
defense,73.008739


In [10]:
# Find a mean value for each row
# All three selected columns are numeric, so every value in a row takes part in the mean.
# NOTE: since pandas 2.0, numeric_only defaults to False, so non-numeric columns are no longer dropped silently.
df.loc[ : , ['hp','attack','defense'] ].mean(axis=1)

Unnamed: 0,0
0,47.666667
1,61.666667
2,101.000000
3,44.666667
4,60.000000
...,...
796,100.333333
797,123.666667
798,125.666667
799,101.666667


In [11]:
# List every pokemon whose 'hp' is at least the overall average hp
hp_mean = df['hp'].mean()
print( f"Average hp: {hp_mean}" )
df.loc[ df['hp'].ge(hp_mean) , ['pokedex_number','name','hp'] ]

Average hp: 68.95880149812734


Unnamed: 0,pokedex_number,name,hp
2,3,Venusaur,80
5,6,Charizard,78
8,9,Blastoise,79
17,18,Pidgeot,83
19,20,Raticate,75
...,...,...,...
795,796,Xurkitree,83
796,797,Celesteela,97
798,799,Guzzlord,223
799,800,Necrozma,97


In [12]:
# List the pokemon(s) that share the highest 'defense' value
top_defense = df['defense'].max()
print( f"Maximum defense: {top_defense}" )
df.loc[ df['defense'].eq(top_defense) , ['pokedex_number','name','hp','defense'] ]

Maximum defense: 230


Unnamed: 0,pokedex_number,name,hp,defense
207,208,Steelix,75,230
212,213,Shuckle,20,230
305,306,Aggron,70,230


## 1.2 Categorical aggregation

`unique()`, `nunique()`, `count()`, `value_counts()` => These functions can be applied to both numeric and non-numeric data.

For pandas.Series:

In [13]:
# Numeric data: Show unique values
df['hp'].unique()

array([ 45,  60,  80,  39,  58,  78,  44,  59,  79,  50,  40,  65,  63,
        83,  30,  75,  35,  55,  70,  90,  46,  61,  81,  95,  38,  73,
       115, 140,  10,  25,  52, 105,  85, 250,  20, 130,  48, 160,  41,
        91, 106, 100, 125, 190, 255,  28,  68, 150,  31,   1,  64,  84,
       104,  72, 144, 170, 110,  43,  66,  86,  99,  76,  53,  37,  77,
        67,  97, 111,  49,  71, 103,  57, 108, 135,  74,  69, 120, 116,
        62,  54,  36,  51, 114, 165, 109,  89,  92,  56,  88, 123, 101,
        82,  42, 126, 216,  47, 137, 107, 223])

In [14]:
# Numeric data: Show the number of unique values
df['hp'].nunique()

99

In [15]:
# Numeric data: Count non-NA cells
df['hp'].count()

801

In [16]:
# Numeric data: View unique values and their counts
# Result is sorted by the count (frequency) from max to min
df['hp'].value_counts()

Unnamed: 0_level_0,count
hp,Unnamed: 1_level_1
60,68
70,55
50,54
75,44
65,44
...,...
170,1
99,1
37,1
111,1


In [17]:
# Non-numeric data without NA: Show unique values
df['type1'].unique()

array(['grass', 'fire', 'water', 'bug', 'normal', 'poison', 'electric',
       'ground', 'fairy', 'fighting', 'psychic', 'rock', 'ghost', 'ice',
       'dragon', 'dark', 'steel', 'flying'], dtype=object)

In [18]:
# Non-numeric data without NA: Show the number of unique values
df['type1'].nunique()

18

In [19]:
# Non-numeric data without NA: Count non-NA cells
df['type1'].count()

801

In [20]:
# Non-numeric data without NA: View unique values and their counts
df['type1'].value_counts()

Unnamed: 0_level_0,count
type1,Unnamed: 1_level_1
water,114
normal,105
grass,78
bug,72
psychic,53
fire,52
rock,45
electric,39
poison,32
ground,32


In [21]:
# Non-numeric data with NA: Show unique values
df['type2'].unique()

array(['poison', nan, 'flying', 'dark', 'electric', 'ice', 'ground',
       'fairy', 'grass', 'fighting', 'psychic', 'steel', 'fire', 'rock',
       'water', 'dragon', 'ghost', 'bug', 'normal'], dtype=object)

In [22]:
# Non-numeric data with NA: Show the number of unique values
df['type2'].nunique()     # default:dropna=True

18

In [23]:
# Non-numeric data with NA: Show the number of unique values
df['type2'].nunique(dropna=False)     # default:dropna=True

19

In [24]:
# Non-numeric data with NA: Count non-NA cells
df['type2'].count()

417

In [25]:
# Non-numeric data with NA: View unique values and their counts
# Result is sorted by the count (frequency) from max to min
df['type2'].value_counts()    # default:dropna=True

Unnamed: 0_level_0,count
type2,Unnamed: 1_level_1
flying,95
poison,34
ground,34
fairy,29
psychic,29
fighting,25
steel,22
dark,21
grass,20
water,17


In [26]:
# Non-numeric data with NA: View unique values and their counts
# Result is sorted by the count (frequency) from max to min
df['type2'].value_counts(dropna=False)    # default:dropna=True

Unnamed: 0_level_0,count
type2,Unnamed: 1_level_1
,384
flying,95
poison,34
ground,34
psychic,29
fairy,29
fighting,25
steel,22
dark,21
grass,20


For pandas.DataFrame:

In [27]:
# unique() is available for pandas.Series but not for pandas.DataFrame
# The expected AttributeError is caught and printed so that "Restart & Run All" can continue past this cell.
try:
  df.unique()
except AttributeError as err:
  print( f"AttributeError: {err}" )

AttributeError: 'DataFrame' object has no attribute 'unique'

In [28]:
df.nunique(dropna=False)    #default:dropna=True

Unnamed: 0,0
abilities,482
against_bug,5
against_dark,5
against_dragon,4
against_electric,6
against_fairy,5
against_fight,6
against_fire,5
against_flying,5
against_ghost,5


In [29]:
# Using value_counts() on pandas.DataFrame

# It returns a MultiIndex Series
x = df[['type1','type2']].value_counts()

print( type(x), end='\n\n' )
print( x.index, end='\n\n' )
x

<class 'pandas.core.series.Series'>

MultiIndex([('normal',   'flying'),
            ( 'grass',   'poison'),
            (   'bug',   'flying'),
            (   'bug',   'poison'),
            ( 'water',   'ground'),
            ( 'water',   'flying'),
            (  'rock',    'water'),
            ( 'grass',   'flying'),
            (  'rock',   'ground'),
            (  'fire', 'fighting'),
            ...
            (   'ice',    'ghost'),
            ('ground',    'steel'),
            ('ground', 'electric'),
            ( 'grass',   'ground'),
            ( 'grass',    'grass'),
            ( 'grass',    'ghost'),
            ( 'ghost',    'fairy'),
            ( 'ghost',   'dragon'),
            ( 'ghost',     'dark'),
            ( 'water',    'steel')],
           names=['type1', 'type2'], length=148)



Unnamed: 0_level_0,Unnamed: 1_level_0,count
type1,type2,Unnamed: 2_level_1
normal,flying,26
grass,poison,14
bug,flying,13
bug,poison,11
water,ground,9
...,...,...
grass,ghost,1
ghost,fairy,1
ghost,dragon,1
ghost,dark,1


## 1.3 Aggregate using one or more operations over a specified axis

For pandas.Series:

In [30]:
# Numerical aggregation

def my_mean( x ):
  """Return the mean of `x` by calling its `mean()` method."""
  return x.mean()
def my_max( x ):
  """Return the maximum of `x` by calling its `max()` method."""
  return x.max()

# Apply two functions to the Series
# The four commands below compute the same values (as a pd.Series); only one of them is left uncommented

# Style 1: Pass a list of our custom functions
#df['hp'].agg([my_mean, my_max])

# Style 2: Pass a list of lambda functions
#df['hp'].agg([ lambda x: x.mean() , lambda x: x.max() ])

# Style 3: Pass a list of built-in function names (as strings)
df['hp'].agg(['mean', 'max'])

# Style 4: Pass a list of NumPy functions
#df['hp'].agg([np.mean, np.max])

Unnamed: 0,hp
mean,68.958801
max,255.0


In [31]:
# Categorical aggregation
# When a function is passed, the function's input(s) cannot be specified
df['type1'].agg( [pd.Series.nunique, pd.Series.count] )  # Return pd.Series

Unnamed: 0,type1
nunique,18
count,801


In [32]:
# Categorical aggregation with a lambda function
df['type2'].agg( [pd.Series.nunique, lambda x: x.nunique(dropna=False)] )  # Return pd.Series

Unnamed: 0,type2
nunique,18
<lambda>,19


In [33]:
# Categorical aggregation that returns pd.Series with mixed types
x = df['type1'].agg( [pd.Series.nunique, pd.Series.unique] )  # Return pd.Series
x

Unnamed: 0,type1
nunique,18
unique,"[grass, fire, water, bug, normal, poison, elec..."


In [34]:
# Walk through the aggregated Series `x` (from the cell above) and describe each entry
for pos, label in enumerate(x.index):
  entry = x[label]
  print(f'row index {pos}: {label}')
  print(f'|-- type: {type(entry)}')
  # Only array-like entries have a size worth reporting
  if isinstance(entry, np.ndarray):
    print(f'    |-- shape: {entry.shape}')
  elif isinstance(entry, list):
    print(f'    |-- len: {len(entry)}')
  print(f'|-- value: {entry}\n')

row index 0: nunique
|-- type: <class 'int'>
|-- value: 18

row index 1: unique
|-- type: <class 'numpy.ndarray'>
    |-- shape: (18,)
|-- value: ['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']



For pandas.DataFrame:

In [35]:
# Numerical aggregation applied for each column separately

# Find mean, min, and max values regarding each (numeric) column
df[ ['hp','attack','defense'] ].agg( ['mean','min','max'] )   # default: axis=0
#df[ ['hp','attack','defense', 'type1', 'type2'] ].agg( ['mean','min','max'] )   # ValueError: could not convert string to float

Unnamed: 0,hp,attack,defense
mean,68.958801,77.857678,73.008739
min,1.0,5.0,5.0
max,255.0,185.0,230.0


In [36]:
# Numerical aggregation applied for each row separately

# Find mean, min, and max values regarding each row
df[ ['hp','attack','defense'] ].agg( ['mean','min','max'], axis=1 )

Unnamed: 0,mean,min,max
0,47.666667,45.0,49.0
1,61.666667,60.0,63.0
2,101.000000,80.0,123.0
3,44.666667,39.0,52.0
4,60.000000,58.0,64.0
...,...,...,...
796,100.333333,97.0,103.0
797,123.666667,59.0,181.0
798,125.666667,53.0,223.0
799,101.666667,97.0,107.0


In [37]:
# Numerical aggregation applied for each column separately
# Use a different set of functions for each column
df[ ['hp','attack','defense'] ].agg( {'hp':['mean','min','max'],
                                      'attack':['min', 'max'],
                                      'defense':['mean','min'] } )  # default: axis=0

Unnamed: 0,hp,attack,defense
mean,68.958801,,73.008739
min,1.0,5.0,5.0
max,255.0,185.0,


In [38]:
# Categorical aggregation applied for each column separately

sub_df = df[ ['pokedex_number','name','type1','type2'] ]
sub_df.agg( [pd.Series.nunique, pd.Series.count, pd.Series.unique] )   # default: axis=0

Unnamed: 0,pokedex_number,name,type1,type2
nunique,801,801,18,18
count,801,801,801,417
unique,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Bulbasaur, Ivysaur, Venusaur, Charmander, Cha...","[grass, fire, water, bug, normal, poison, elec...","[poison, nan, flying, dark, electric, ice, gro..."


# 2. Pandas: Transform data

This section will focus on `transform()` and `apply()`.

In [39]:
# Prepare the data to play around
df = df_pokemon[ ['pokedex_number','name','type1','type2','hp','attack','defense'] ]
df

Unnamed: 0,pokedex_number,name,type1,type2,hp,attack,defense
0,1,Bulbasaur,grass,poison,45,49,49
1,2,Ivysaur,grass,poison,60,62,63
2,3,Venusaur,grass,poison,80,100,123
3,4,Charmander,fire,,39,52,43
4,5,Charmeleon,fire,,58,64,58
...,...,...,...,...,...,...,...
796,797,Celesteela,steel,flying,97,101,103
797,798,Kartana,grass,steel,59,181,131
798,799,Guzzlord,dark,dragon,223,101,53
799,800,Necrozma,psychic,,97,107,101


For pandas.Series:

In [40]:
# When a transform function IS NOT an aggregate function
plus20 = lambda x: x+20

# Both commands produce the same result (pd.Series), use only one
df['hp'].apply(plus20)
#df['hp'].transform(plus20)

Unnamed: 0,hp
0,65
1,80
2,100
3,59
4,78
...,...
796,117
797,79
798,243
799,117


In [41]:
# When a transform function IS an aggregate function

# transform() cannot produce aggregated results BUT apply() can.
print( "apply('mean') :" , df['hp'].apply('mean') )        # OK

# The expected ValueError is caught and printed so that "Restart & Run All" can continue past this cell.
try:
  print( "transform('mean') :", df['hp'].transform('mean') )
except ValueError as err:
  print( f"ValueError: {err}" )

apply('mean') : 68.95880149812734


ValueError: Function did not transform

For pandas.DataFrame:

In [42]:
# When a transform function IS NOT an aggregate function

# Copy the original values just for easy comparison
df2 = df.loc[ :, ['hp','attack','defense'] ]

# The two commands below produce the same outputs (pandas.DataFrame)
# Be careful that both commands don't alter the original dataframe (df)
df2[['hp_after','attack_after','defense_after']] = df[['hp','attack','defense']].apply(lambda x: x+20)
#df2[['hp_after','attack_after','defense_after']] = df[['hp','attack','defense']].transform(lambda x: x+20)

df2

Unnamed: 0,hp,attack,defense,hp_after,attack_after,defense_after
0,45,49,49,65,69,69
1,60,62,63,80,82,83
2,80,100,123,100,120,143
3,39,52,43,59,72,63
4,58,64,58,78,84,78
...,...,...,...,...,...,...
796,97,101,103,117,121,123
797,59,181,131,79,201,151
798,223,101,53,243,121,73
799,97,107,101,117,127,121


In [43]:
# When a transform function IS an aggregate function
# transform() cannot produce aggregated results BUT apply() can
# The output of transform() has to be pandas.DataFrame that has the same shape as self

display( df[['hp','attack','defense']].apply('mean') )      # OK

# The expected ValueError is caught and printed so that "Restart & Run All" can continue past this cell.
try:
  display( df[['hp','attack','defense']].transform('mean') )
except ValueError as err:
  print( f"ValueError: {err}" )

Unnamed: 0,0
hp,68.958801
attack,77.857678
defense,73.008739


ValueError: Function did not transform

`apply()` vs. `transform()`

- Comparison 1:
  - `apply()` supports both aggregation and non-aggregation functions.
  - `transform()` doesn't support aggregation functions (causing an error).
- Comparison 2:
  - <u>`apply()` sends each column (`axis=0`) or each row (`axis=1`) to the calling function as a pandas.Series</u>; with `axis=1` the function can therefore work with values from multiple columns at a time, and it may return a single value per row.
  - <u>`transform()` must return a result that has the same shape as self</u>, so it can only be used with functions that map each input Series to an output of the same length; a function that reduces a row or a column to a single value causes an error.


In [44]:
# apply() allows working with multiple pandas.Series at a time

df = df_pokemon.loc[ :, ['pokedex_number','name','type1','type2','hp','attack','defense'] ]
# With axis=1 the lambda is called once per row; `x` is that row as a pandas.Series indexed by column name,
# so x['attack'] and x['defense'] are the two values of the same pokemon.
df['att_sub_def'] = df[['hp','attack','defense']].apply(lambda x: x['attack'] - x['defense'], axis=1) # OK
df

Unnamed: 0,pokedex_number,name,type1,type2,hp,attack,defense,att_sub_def
0,1,Bulbasaur,grass,poison,45,49,49,0
1,2,Ivysaur,grass,poison,60,62,63,-1
2,3,Venusaur,grass,poison,80,100,123,-23
3,4,Charmander,fire,,39,52,43,9
4,5,Charmeleon,fire,,58,64,58,6
...,...,...,...,...,...,...,...,...
796,797,Celesteela,steel,flying,97,101,103,-2
797,798,Kartana,grass,steel,59,181,131,50
798,799,Guzzlord,dark,dragon,223,101,53,48
799,800,Necrozma,psychic,,97,107,101,6


In [45]:
# transform() must return a result that has the same shape as its input.
# Here the lambda reduces each row to a single value, so transform() raises ValueError.
# The expected error is caught and printed so that "Restart & Run All" can continue past this cell.

df = df_pokemon.loc[ :, ['pokedex_number','name','type1','type2','hp','attack','defense'] ]
try:
  df['att_sub_def'] = df[['hp','attack','defense']].transform(lambda x: x['attack'] - x['defense'], axis=1)
except ValueError as err:
  print( f"ValueError: {err}" )
df

ValueError: Function did not transform

At this point, it may seem that `apply()` is more flexible and less prone to errors than `transform()`. However, there are situations where `transform()` is preferred over `apply()`, as you will see in the next section.

# 3. Pandas: Groupby

Pandas’ `groupby()` allows us to split data into separate groups to perform computations for better analysis.
- Efficient data summarization
- It lets us condense the data (this is very useful when the data is too big to fit in one Series/DataFrame)



## 3.1 The groupby object is neither pandas.Series nor pandas.DataFrame

Groupby on pandas.Series:

- When `groupby()` is used on Series, our job is to assign a group label to each row in the Series.

In [46]:
# Build a small four-element float Series to experiment with
ser = pd.Series([390, 350, 30, 20], dtype='float64')
ser

Unnamed: 0,0
0,390.0
1,350.0
2,30.0
3,20.0


In [47]:
# Determine a group for each Series' element by a list of group labels

# In the below command, the 0th and 2nd rows are assigned to the "a" group and the other two to the "b" group
grouped = ser.groupby( ["a", "b", "a", "b"] )

# The groupby object is neither Series nor DataFrame, be careful when dealing with it
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7e8ba2fe8d90>

In [48]:
# Inspect the result of the above cell by iterating through the groupby object
for i, (name,group) in enumerate(grouped):
  print( f"===== Group {i} =====" )
  print( f"Group name: {name}" )
  print( f"Type: {type(group)}" )
  print( f"Value:\n{group}\n" )

===== Group 0 =====
Group name: a
Type: <class 'pandas.core.series.Series'>
Value:
0    390.0
2     30.0
dtype: float64

===== Group 1 =====
Group name: b
Type: <class 'pandas.core.series.Series'>
Value:
1    350.0
3     20.0
dtype: float64



In [49]:
# Group the elements of the pandas.Series with a function instead of a list of labels
# The function below receives the label of a row ("i") and returns the group name:
# - "A" when the element at that label is greater than 300
# - "B" otherwise
def _label_by_value(i):
  if ser.loc[i] > 300:
    return "A"
  return "B"

grouped = ser.groupby( _label_by_value )

# Inspect the result by iterating through the groupby object
for i, (name, group) in enumerate(grouped):
  print(
    f"===== Group {i} =====",
    f"Group name: {name}",
    f"Type: {type(group)}",
    f"Value:\n{group}\n",
    sep='\n',
  )

===== Group 0 =====
Group name: A
Type: <class 'pandas.core.series.Series'>
Value:
0    390.0
1    350.0
dtype: float64

===== Group 1 =====
Group name: B
Type: <class 'pandas.core.series.Series'>
Value:
2    30.0
3    20.0
dtype: float64



Groupby on pandas.DataFrame:

In [50]:
# Prepare a pandas.DataFrame to play around
df = df_pokemon.loc[ :, ['pokedex_number','name','type1','type2'] ]

In [51]:
# By default, any group with NA key will be dropped. To include NA-key group(s), set dropna=False.
# BUT having any group with NA key will lead to future errors (e.g., .groups, .get_group()).
# One suggestion is to use fillna() to replace NA keys before doing groupby().

# Compare groupby with 'dropna=True' versus 'dropna=False'
grouped = df.groupby( ['type2'] )     # default: axis=0, dropna=True
grouped_na = df.groupby( ['type2'], dropna=False )

# The groupby object is neither Series nor DataFrame, be careful when dealing with it
display(grouped)
display(grouped_na)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7e8ba2fe9120>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7e8ba30547c0>

In [52]:
# Access the groupby object's attribute

# Get the number of groups
print(f'{grouped.ngroups=}')      # OK
print(f'{grouped_na.ngroups=}')   # OK

grouped.ngroups=18
grouped_na.ngroups=19


In [53]:
# Access the groupby object's attribute

# Get the mapping of group name -> rows that belong to the group
# Each resultant list contains the index labels of the rows in that group
display(grouped.groups)      # OK

# With dropna=False, the NA-key group makes .groups raise ValueError.
# The expected error is caught and printed so that "Restart & Run All" can continue past this cell.
try:
  display(grouped_na.groups)
except ValueError as err:
  print( f"ValueError: {err}" )

{'bug': [346, 347, 450, 750, 751], 'dark': [18, 19, 51, 52, 247, 273, 274, 317, 318, 331, 341, 433, 434, 441, 451, 550, 551, 552, 657, 674, 726], 'dragon': [229, 328, 329, 482, 483, 486, 632, 633, 634, 690, 695, 696, 713, 714, 775, 779, 798], 'electric': [25, 169, 170, 594, 595, 617, 643, 736, 737], 'fairy': [38, 39, 121, 173, 182, 183, 279, 280, 281, 297, 302, 438, 545, 546, 701, 702, 706, 718, 729, 741, 742, 754, 755, 777, 784, 785, 786, 787, 800], 'fighting': [61, 213, 255, 256, 285, 390, 391, 452, 453, 474, 498, 499, 558, 559, 637, 638, 639, 646, 651, 758, 759, 782, 783, 793, 794], 'fire': [104, 227, 228, 493, 554, 606, 607, 608, 635, 636, 642, 756, 757], 'flying': [5, 11, 15, 16, 17, 20, 21, 40, 41, 82, 83, 84, 122, 129, 141, 143, 144, 145, 148, 162, 163, 164, 165, 168, 175, 176, 177, 186, 187, 188, 192, 197, 206, 224, 225, 226, 248, 249, 266, 275, 276, 277, 278, 283, 290, 332, 333, 356, 372, 383, 395, 396, 397, 413, 414, 415, 424, 425, 429, 440, 457, 467, 468, 471, 518, 519, 520,

ValueError: Categorical categories cannot be null

In [54]:
# Access the groupby object's method

# Compute and display group sizes
print( grouped.size(), end='\n\n' ) # OK
print( grouped_na.size() )          # OK

type2
bug          5
dark        21
dragon      17
electric     9
fairy       29
fighting    25
fire        13
flying      95
ghost       14
grass       20
ground      34
ice         15
normal       4
poison      34
psychic     29
rock        14
steel       22
water       17
dtype: int64

type2
bug           5
dark         21
dragon       17
electric      9
fairy        29
fighting     25
fire         13
flying       95
ghost        14
grass        20
ground       34
ice          15
normal        4
poison       34
psychic      29
rock         14
steel        22
water        17
NaN         384
dtype: int64


In [55]:
# Access the groupby object's method

# Preview groups (and the first entry in each group) as a pandas.DataFrame
display( grouped.first() )      # OK
print()
display( grouped_na.first() )   # OK

Unnamed: 0_level_0,pokedex_number,name,type1
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,347,Anorith,rock
dark,19,Rattata,normal
dragon,230,Kingdra,water
electric,26,Raichu,electric
fairy,39,Jigglypuff,normal
fighting,62,Poliwrath,water
fire,105,Marowak,ground
flying,6,Charizard,fire
ghost,292,Shedinja,bug
grass,46,Paras,bug





Unnamed: 0_level_0,pokedex_number,name,type1
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,347,Anorith,rock
dark,19,Rattata,normal
dragon,230,Kingdra,water
electric,26,Raichu,electric
fairy,39,Jigglypuff,normal
fighting,62,Poliwrath,water
fire,105,Marowak,ground
flying,6,Charizard,fire
ghost,292,Shedinja,bug
grass,46,Paras,bug


In [56]:
# Access the groupby object's method

# Preview groups (and the last entry in each group) as a pandas.DataFrame
display( grouped.last() )      # OK
print()
display( grouped_na.last() )   # OK

Unnamed: 0_level_0,pokedex_number,name,type1
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,752,Araquanid,water
dark,727,Incineroar,fire
dragon,799,Guzzlord,dark
electric,738,Vikavolt,bug
fairy,801,Magearna,steel
fighting,795,Pheromosa,bug
fire,758,Salazzle,poison
flying,797,Celesteela,steel
ghost,792,Lunala,psychic
grass,781,Dhelmise,ghost





Unnamed: 0_level_0,pokedex_number,name,type1
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,752,Araquanid,water
dark,727,Incineroar,fire
dragon,799,Guzzlord,dark
electric,738,Vikavolt,bug
fairy,801,Magearna,steel
fighting,795,Pheromosa,bug
fire,758,Salazzle,poison
flying,797,Celesteela,steel
ghost,792,Lunala,psychic
grass,781,Dhelmise,ghost


In [57]:
# Access the groupby object's method

# Return a pandas.DataFrame of a specified group
display( grouped.get_group('dragon') )   # OK
#display( grouped_na.get_group(pd.NA) )  # Error: it doesn't know how to access the NA-key group

Unnamed: 0,pokedex_number,name,type1,type2
229,230,Kingdra,water,dragon
328,329,Vibrava,ground,dragon
329,330,Flygon,ground,dragon
482,483,Dialga,steel,dragon
483,484,Palkia,water,dragon
486,487,Giratina,ghost,dragon
632,633,Deino,dark,dragon
633,634,Zweilous,dark,dragon
634,635,Hydreigon,dark,dragon
690,691,Dragalge,poison,dragon


In [58]:
# Iterate through groups

# The loop below works fine even for NA-key groups
for i, (name, group) in enumerate(grouped):
  if i < 3:     # As a demo, we inspect only the first three groups
    print(
      f"\n===== Group {i} =====",
      f"Group name: type={type(name)} value={name}",
      f"Group: type={type(group)}",
      sep='\n',
    )
    display(group)
  else:
    break


===== Group 0 =====
Group name: type=<class 'tuple'> value=('bug',)
Group: type=<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,pokedex_number,name,type1,type2
346,347,Anorith,rock,bug
347,348,Armaldo,rock,bug
450,451,Skorupi,poison,bug
750,751,Dewpider,water,bug
751,752,Araquanid,water,bug



===== Group 1 =====
Group name: type=<class 'tuple'> value=('dark',)
Group: type=<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,pokedex_number,name,type1,type2
18,19,Rattata,normal,dark
19,20,Raticate,normal,dark
51,52,Meowth,normal,dark
52,53,Persian,normal,dark
247,248,Tyranitar,rock,dark
273,274,Nuzleaf,grass,dark
274,275,Shiftry,grass,dark
317,318,Carvanha,water,dark
318,319,Sharpedo,water,dark
331,332,Cacturne,grass,dark



===== Group 2 =====
Group name: type=<class 'tuple'> value=('dragon',)
Group: type=<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,pokedex_number,name,type1,type2
229,230,Kingdra,water,dragon
328,329,Vibrava,ground,dragon
329,330,Flygon,ground,dragon
482,483,Dialga,steel,dragon
483,484,Palkia,water,dragon
486,487,Giratina,ghost,dragon
632,633,Deino,dark,dragon
633,634,Zweilous,dark,dragon
634,635,Hydreigon,dark,dragon
690,691,Dragalge,poison,dragon


In [59]:
# Group by many columns
group2 = df.groupby( ['type1', 'type2'] )

print(f'{group2.ngroups=}\n')
display(group2.groups)

group2.ngroups=148



{('bug', 'electric'): [594, 595, 736, 737], ('bug', 'fairy'): [741, 742], ('bug', 'fighting'): [213, 793, 794], ('bug', 'fire'): [635, 636], ('bug', 'flying'): [11, 122, 164, 165, 192, 266, 283, 290, 413, 414, 415, 468, 665], ('bug', 'ghost'): [291], ('bug', 'grass'): [45, 46, 412, 539, 540, 541], ('bug', 'ground'): [289], ('bug', 'poison'): [12, 13, 14, 47, 48, 166, 167, 268, 542, 543, 544], ('bug', 'rock'): [212, 556, 557], ('bug', 'steel'): [204, 211, 588, 631, 648], ('bug', 'water'): [282, 766, 767], ('bug', nan): [9, 10, 126, 203, 264, 265, 267, 312, 313, 400, 401, 411, 587, 615, 616, 663, 664, 735], ('dark', 'dragon'): [632, 633, 634, 798], ('dark', 'fighting'): [558, 559], ('dark', 'fire'): [227, 228], ('dark', 'flying'): [197, 429, 628, 629, 716], ('dark', 'ghost'): [301], ('dark', 'ice'): [214, 460], ('dark', 'psychic'): [685, 686], ('dark', 'steel'): [623, 624], ('dark', nan): [196, 260, 261, 358, 490, 508, 509, 569, 570], ('dragon', 'electric'): [643], ('dragon', 'fighting')

In [60]:
# When grouping is done based on many columns, use a tuple to specify the group name
group2.get_group( ('bug','poison') )

Unnamed: 0,pokedex_number,name,type1,type2
12,13,Weedle,bug,poison
13,14,Kakuna,bug,poison
14,15,Beedrill,bug,poison
47,48,Venonat,bug,poison
48,49,Venomoth,bug,poison
166,167,Spinarak,bug,poison
167,168,Ariados,bug,poison
268,269,Dustox,bug,poison
542,543,Venipede,bug,poison
543,544,Whirlipede,bug,poison


## 3.2 The groupby process: split-apply-combine

According to https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html, “group by” refers to a process involving one or more of the following steps:
- Splitting the data into groups based on some criteria
- **Applying a function to each group independently** (e.g., aggregation, transformation, filtration)
- Combining the results into a data structure

### 3.2.1 Split $→$ Aggregate

![picture](https://miro.medium.com/max/826/1*WTlC_t-5RQqeqS0tcbkCKQ.png)

Image from https://towardsdatascience.com/pandas-groupby-aggregate-transform-filter-c95ba3444bbb

In [61]:
# Prepare a groupby object to play around
group = df_pokemon[['pokedex_number','name','type1','type2','hp','attack','defense']].groupby('type1')
group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7e8ba3061330>

In [63]:
# Choose some existing columns from the groupby object

# Although it looks like choosing columns in pandas.DataFrame, the result below remains the groupby object
group[['hp','attack']]            # OK
#group.loc[ :, ['hp','attack'] ]  # Error: Groupby has no attribute 'loc'

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7e8ba2fe8df0>

In [64]:
# Group -> Choose some existing columns -> Apply the aggregation function
# For each group, apply an aggregation function to each column separately

# 'mean' is a numeric aggregation, so only the numeric columns ('hp','attack','defense') are selected first.
# Non-numeric columns (e.g., 'name','type2') are not excluded automatically; see the commented-out line below.
group[['hp','attack','defense']].agg('mean')   # This equals group[['hp','attack','defense']].mean() as well as group[['hp','attack','defense']].agg(np.mean)
#group.agg('mean')     # NotImplementedError due to non-numeric columns included

Unnamed: 0_level_0,hp,attack,defense
type1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,56.722222,70.125,70.847222
dark,72.551724,87.793103,70.517241
dragon,79.851852,106.407407,86.259259
electric,60.512821,70.820513,61.820513
fairy,73.944444,62.111111,68.166667
fighting,71.428571,99.178571,66.392857
fire,68.730769,81.5,67.788462
flying,68.0,66.666667,65.0
ghost,63.37037,72.740741,79.518519
grass,65.358974,73.769231,70.871795


In [65]:
# Group -> pick columns -> apply several aggregation functions in a single call
summary_functions = ['mean','median','min','max','std']
result = group[['hp','attack']].agg(summary_functions)

# The columns of the result form a two-level index: (original column, aggregation name)
result

Unnamed: 0_level_0,hp,hp,hp,hp,hp,attack,attack,attack,attack,attack
Unnamed: 0_level_1,mean,median,min,max,std,mean,median,min,max,std
type1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
bug,56.722222,60.0,1,107,17.652532,70.125,65.0,10,185,37.202884
dark,72.551724,65.0,35,223,36.063026,87.793103,88.0,50,150,25.470402
dragon,79.851852,75.0,41,216,34.962291,106.407407,100.0,50,180,33.864276
electric,60.512821,60.0,20,90,17.519721,70.820513,65.0,30,123,25.50219
fairy,73.944444,76.0,35,126,23.048471,62.111111,58.5,20,131,28.682622
fighting,71.428571,70.0,30,144,26.790585,99.178571,100.0,35,145,27.966983
fire,68.730769,65.0,38,115,19.182422,81.5,79.5,30,160,26.895951
flying,68.0,79.0,40,85,24.433583,66.666667,70.0,30,100,35.118846
ghost,63.37037,59.0,20,150,30.011726,72.740741,66.0,30,165,32.577237
grass,65.358974,65.0,30,123,18.721454,73.769231,68.0,27,181,29.880682


In [66]:
# (Optional) Reset row index: the group labels ('type1') move from the row index into a regular column
# Rebind the frame returned by reset_index() instead of using inplace=True,
# so the object displayed by the previous cell is not mutated behind the reader's back
# Run this cell only once: resetting an already-reset frame would add an extra 'index' column
result = result.reset_index()
result

Unnamed: 0_level_0,type1,hp,hp,hp,hp,hp,attack,attack,attack,attack,attack
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,min,max,std,mean,median,min,max,std
0,bug,56.722222,60.0,1,107,17.652532,70.125,65.0,10,185,37.202884
1,dark,72.551724,65.0,35,223,36.063026,87.793103,88.0,50,150,25.470402
2,dragon,79.851852,75.0,41,216,34.962291,106.407407,100.0,50,180,33.864276
3,electric,60.512821,60.0,20,90,17.519721,70.820513,65.0,30,123,25.50219
4,fairy,73.944444,76.0,35,126,23.048471,62.111111,58.5,20,131,28.682622
5,fighting,71.428571,70.0,30,144,26.790585,99.178571,100.0,35,145,27.966983
6,fire,68.730769,65.0,38,115,19.182422,81.5,79.5,30,160,26.895951
7,flying,68.0,79.0,40,85,24.433583,66.666667,70.0,30,100,35.118846
8,ghost,63.37037,59.0,20,150,30.011726,72.740741,66.0,30,165,32.577237
9,grass,65.358974,65.0,30,123,18.721454,73.769231,68.0,27,181,29.880682


In [67]:
# Access a level-0 column: every level-1 column stored under 'hp' comes back as one DataFrame
result['hp']

Unnamed: 0,mean,median,min,max,std
0,56.722222,60.0,1,107,17.652532
1,72.551724,65.0,35,223,36.063026
2,79.851852,75.0,41,216,34.962291
3,60.512821,60.0,20,90,17.519721
4,73.944444,76.0,35,126,23.048471
5,71.428571,70.0,30,144,26.790585
6,68.730769,65.0,38,115,19.182422
7,68.0,79.0,40,85,24.433583
8,63.37037,59.0,20,150,30.011726
9,65.358974,65.0,30,123,18.721454


In [68]:
# Access a level-1 column: first take the level-0 block, then pick one aggregation inside it
hp_stats = result['hp']
display( hp_stats['mean'] )

Unnamed: 0,mean
0,56.722222
1,72.551724
2,79.851852
3,60.512821
4,73.944444
5,71.428571
6,68.730769
7,68.0
8,63.37037
9,65.358974


### 3.2.2 Split $→$ Transform

**EX1:** Standardization (this function has no aggregation effect)

In [69]:
# Working DataFrame for the standardization example (EX1): identifiers, both types and three stats
ex1_columns = ['pokedex_number','name','type1','type2','hp','attack','defense']
df = df_pokemon[ex1_columns]

In [70]:
# Before standardization: look at the per-group mean and SD

# Split the data by 'type1'
# ('group_keys' is explained in EX2 below)
grouped = df.groupby('type1', group_keys=False)

# Compute both statistics first (numeric columns only), then print them one after the other
group_means = grouped.mean(numeric_only=True)
group_sds = grouped.std(numeric_only=True)

print( f"{'='*10} Group by 'type1' -> Find mean {'='*10}\n" )
print( group_means, end='\n\n' )
print( f"{'='*10} Group by 'type1' -> Find SD {'='*10}\n" )
print( group_sds )


          pokedex_number         hp      attack     defense
type1                                                      
bug           395.916667  56.722222   70.125000   70.847222
dark          490.034483  72.551724   87.793103   70.517241
dragon        516.296296  79.851852  106.407407   86.259259
electric      380.102564  60.512821   70.820513   61.820513
fairy         467.000000  73.944444   62.111111   68.166667
fighting      404.035714  71.428571   99.178571   66.392857
fire          376.403846  68.730769   81.500000   67.788462
flying        690.000000  68.000000   66.666667   65.000000
ghost         498.814815  63.370370   72.740741   79.518519
grass         410.128205  65.358974   73.769231   70.871795
ground        371.000000  73.187500   94.812500   83.906250
ice           426.217391  72.086957   73.304348   71.913043
normal        367.504762  76.723810   75.161905   59.695238
poison        314.375000  65.593750   72.656250   70.031250
psychic       429.735849  72.943396   6

In [71]:
# Standardize 'hp', 'attack' and 'defense' within each 'type1' group

def zscore(values):
    """Return `values` centred on its own mean and scaled by its own standard deviation."""
    return (values - values.mean()) / values.std()

# Group -> Choose columns -> Data transformation
# The two commands below give the same result, use only one
#transformed = grouped[['hp','attack','defense']].apply(zscore)
transformed = grouped[['hp','attack','defense']].transform(zscore)

# Notice that the result is indexed like the original rows; it is no longer grouped by 'type1'
transformed

Unnamed: 0,hp,attack,defense
0,-1.087468,-0.828938,-0.854079
1,-0.286248,-0.393874,-0.307388
2,0.782045,0.877850,2.035572
3,-1.549897,-1.096819,-0.981888
4,-0.559406,-0.650656,-0.387728
...,...,...,...
796,1.699590,0.268133,-0.405693
797,-0.339662,3.588632,2.347967
798,4.171815,0.518519,-0.677994
799,0.763511,1.124626,1.140519


Unlike an aggregated result, a transformed result keeps one row per original row, so it is typically assigned back to the DataFrame as new columns.

In [72]:
# Put the z-scores next to the raw stats so that both can be compared row by row
# (assign() returns a new DataFrame; 'df' itself is left untouched)
df2 = df.assign(
    hp_zscore=transformed['hp'],
    att_zscore=transformed['attack'],
    def_zscore=transformed['defense'],
)

df2

Unnamed: 0,pokedex_number,name,type1,type2,hp,attack,defense,hp_zscore,att_zscore,def_zscore
0,1,Bulbasaur,grass,poison,45,49,49,-1.087468,-0.828938,-0.854079
1,2,Ivysaur,grass,poison,60,62,63,-0.286248,-0.393874,-0.307388
2,3,Venusaur,grass,poison,80,100,123,0.782045,0.877850,2.035572
3,4,Charmander,fire,,39,52,43,-1.549897,-1.096819,-0.981888
4,5,Charmeleon,fire,,58,64,58,-0.559406,-0.650656,-0.387728
...,...,...,...,...,...,...,...,...,...,...
796,797,Celesteela,steel,flying,97,101,103,1.699590,0.268133,-0.405693
797,798,Kartana,grass,steel,59,181,131,-0.339662,3.588632,2.347967
798,799,Guzzlord,dark,dragon,223,101,53,4.171815,0.518519,-0.677994
799,800,Necrozma,psychic,,97,107,101,0.763511,1.124626,1.140519


In [73]:
# After standardization: re-check the per-group mean and SD of the z-score columns

# Each group mean should be about 0.0 (only floating-point rounding error remains) and each SD about 1.0
zscore_columns = ['hp_zscore','att_zscore','def_zscore']
zscores_by_type = df2.groupby('type1')[zscore_columns]

print( f"{'='*5} Group by 'type1' -> Standardize -> Mean {'='*5}\n" )
print( zscores_by_type.mean() )
print( f"\n{'='*5} Group by 'type1' -> Standardize -> SD {'='*5}\n" )
print( zscores_by_type.std() )

===== Group by 'type1' -> Standardize -> Mean =====

             hp_zscore    att_zscore    def_zscore
type1                                             
bug       4.163336e-17 -1.850372e-17 -1.757853e-16
dark      3.062684e-17  1.914178e-16 -1.531342e-16
dragon    1.233581e-16  6.322103e-17  2.384924e-16
electric  8.540177e-17  1.366428e-16  1.437596e-16
fairy     8.018277e-17 -8.326673e-17 -2.837237e-16
fighting -7.930164e-17 -6.344132e-17  2.061843e-16
fire      2.348549e-16 -5.337611e-19  2.049643e-16
flying    0.000000e+00 -1.063964e-16  0.000000e+00
ghost    -1.387779e-16 -1.891491e-16 -1.233581e-17
grass    -3.430304e-16 -1.480297e-16 -1.366428e-16
ground    1.387779e-17  0.000000e+00 -1.040834e-17
ice       2.316987e-16  1.255035e-16 -1.351576e-16
normal    8.260588e-17 -8.934652e-17 -2.537653e-17
poison    7.806256e-18 -3.469447e-18 -8.673617e-18
psychic   1.529175e-16  1.759599e-16 -1.801494e-16
rock      2.072416e-16 -1.282924e-16  1.973730e-17
steel    -2.821817e-16  1.601

**EX2:** Difference between `apply()` and `transform()` when using with groupby

In [74]:
# Working DataFrame for EX2: identifiers, primary type and three stats
ex2_columns = ['pokedex_number','name','type1','hp','attack','defense']
df = df_pokemon[ex2_columns]
df.head()

Unnamed: 0,pokedex_number,name,type1,hp,attack,defense
0,1,Bulbasaur,grass,45,49,49
1,2,Ivysaur,grass,60,62,63
2,3,Venusaur,grass,80,100,123
3,4,Charmander,fire,39,52,43
4,5,Charmeleon,fire,58,64,58


When the function IS NOT an aggregate function:

In [76]:
# What 'group_keys' does in df.groupby(..., group_keys).apply(...)
# - group_keys=True : the group labels are prepended to the index of the resultant DataFrame
# - group_keys=False: the group labels are omitted, the original row index is kept
# The argument only matters when the result of df.groupby().apply() has the same shape as the original 'df'

def to_hundredths(values):
    """Divide every value by 100 (a function with no aggregation effect)."""
    return values / 100

stat_columns = ['hp','attack','defense']

print('************ group_keys=False ************')
df_false = df.groupby('type1', group_keys=False)[stat_columns].apply(to_hundredths)
print( df_false.index, end='\n\n' )
display( df_false )

print('\n************ group_keys=True ************')
df_true = df.groupby('type1', group_keys=True)[stat_columns].apply(to_hundredths)
print( df_true.index, end='\n\n' )
display( df_true )

************ group_keys=False ************
Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       791, 792, 793, 794, 795, 796, 797, 798, 799, 800],
      dtype='int64', length=801)



Unnamed: 0,hp,attack,defense
0,0.45,0.49,0.49
1,0.60,0.62,0.63
2,0.80,1.00,1.23
3,0.39,0.52,0.43
4,0.58,0.64,0.58
...,...,...,...
796,0.97,1.01,1.03
797,0.59,1.81,1.31
798,2.23,1.01,0.53
799,0.97,1.07,1.01



************ group_keys=True ************
MultiIndex([(  'bug',   9),
            (  'bug',  10),
            (  'bug',  11),
            (  'bug',  12),
            (  'bug',  13),
            (  'bug',  14),
            (  'bug',  45),
            (  'bug',  46),
            (  'bug',  47),
            (  'bug',  48),
            ...
            ('water', 692),
            ('water', 727),
            ('water', 728),
            ('water', 729),
            ('water', 745),
            ('water', 750),
            ('water', 751),
            ('water', 770),
            ('water', 778),
            ('water', 787)],
           names=['type1', None], length=801)



Unnamed: 0_level_0,Unnamed: 1_level_0,hp,attack,defense
type1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bug,9,0.45,0.30,0.35
bug,10,0.50,0.20,0.55
bug,11,0.60,0.45,0.50
bug,12,0.40,0.35,0.30
bug,13,0.45,0.25,0.50
...,...,...,...,...
water,750,0.38,0.40,0.52
water,751,0.68,0.70,0.92
water,770,0.55,0.60,1.30
water,778,0.68,1.05,0.70


In [77]:
# Groupby -> Choose columns -> Apply a function with no aggregation effect

# Either command below yields the same DataFrame, use only one
# Notice that the result is no longer grouped by type1
stats_by_type = df.groupby('type1')[['hp','attack','defense']]
#df.groupby('type1', group_keys=False)[['hp','attack','defense']].apply(lambda x: x/100)
stats_by_type.transform(lambda col: col/100)

Unnamed: 0,hp,attack,defense
0,0.45,0.49,0.49
1,0.60,0.62,0.63
2,0.80,1.00,1.23
3,0.39,0.52,0.43
4,0.58,0.64,0.58
...,...,...,...
796,0.97,1.01,1.03
797,0.59,1.81,1.31
798,2.23,1.01,0.53
799,0.97,1.07,1.01


In [78]:
# Groupby -> Choose columns -> Apply a cross-column function with no aggregation effect
# Recall that apply() hands the whole DataFrame of a group to the function, whereas transform() hands over each column separately

# OK: inside apply() both 'attack' and 'defense' are reachable at the same time
# Notice that the result is no longer grouped by 'type1'
(
    df.groupby('type1', group_keys=False)[['hp','attack','defense']]
      .apply(lambda frame: (frame['attack'] - frame['defense']).to_frame('att_sub_def'))
)

# KeyError: inside transform() the function only sees one column at a time
#df.groupby('type1')[['hp','attack','defense']].transform(lambda x: (x['attack'] - x['defense']).to_frame('att_sub_def') )

Unnamed: 0,att_sub_def
0,0
1,-1
2,-23
3,9
4,6
...,...
796,-2
797,50
798,48
799,6


When the function IS an aggregate function:

In [79]:
# Groupby -> Choose columns -> Run an aggregate function through apply()
# 'group_keys' has no effect here because the result of .apply() does not retain the shape of 'df'

# With an aggregating function, apply() yields one row per group
stats_by_type = df.groupby('type1')[['hp','attack','defense']]
stats_by_type.apply(lambda frame: frame.max())

Unnamed: 0_level_0,hp,attack,defense
type1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bug,107,185,230
dark,223,150,125
dragon,216,180,130
electric,90,123,115
fairy,126,131,95
fighting,144,145,95
fire,115,160,140
flying,85,100,80
ghost,150,165,145
grass,123,181,131


While `groupby()`+aggregation (Section 3.2.1: Split $→$ Aggregate) and `groupby()`+`apply(aggregate_func)` return a reduced (grouped) version of the input, `groupby()`+`transform(aggregate_func)` returns a group-level transformed version <u>of the full data</u>. The new output data has the same length (the same number of rows) as the input data.

![picture](https://miro.medium.com/max/826/1*fxdhr6i-jlzeex7f5RkIlw.png)

Image from https://towardsdatascience.com/pandas-groupby-aggregate-transform-filter-c95ba3444bbb

In [80]:
# Groupby -> Choose columns -> Run an aggregate function through transform() -> Append the result to the original DataFrame
# Even with an aggregating function, transform() returns a result with the same length as the input

# Every row receives the maximum of the 'type1' group it belongs to
stats_by_type = df.groupby('type1')[['hp','attack','defense']]
transformed = stats_by_type.transform(lambda col: col.max())
print('******* DataFrame resulted from .transform() *******')
display( transformed )

# Append the transform results to the original DataFrame as new columns
print('\n******* Original DataFrame with transformed columns appended *******')
df2 = df.assign(
    type1_hp_max=transformed['hp'],
    type1_att_max=transformed['attack'],
    type1_def_max=transformed['defense'],
)
df2

******* DataFrame resulted from .transform() *******


Unnamed: 0,hp,attack,defense
0,123,181,131
1,123,181,131
2,123,181,131
3,115,160,140
4,115,160,140
...,...,...,...
796,100,150,230
797,123,181,131
798,223,150,125
799,190,165,131



******* Original DataFrame with transformed columns appended *******


Unnamed: 0,pokedex_number,name,type1,hp,attack,defense,type1_hp_max,type1_att_max,type1_def_max
0,1,Bulbasaur,grass,45,49,49,123,181,131
1,2,Ivysaur,grass,60,62,63,123,181,131
2,3,Venusaur,grass,80,100,123,123,181,131
3,4,Charmander,fire,39,52,43,115,160,140
4,5,Charmeleon,fire,58,64,58,115,160,140
...,...,...,...,...,...,...,...,...,...
796,797,Celesteela,steel,97,101,103,100,150,230
797,798,Kartana,grass,59,181,131,123,181,131
798,799,Guzzlord,dark,223,101,53,223,150,125
799,800,Necrozma,psychic,97,107,101,190,165,131


### 3.2.3 Split $→$ Filtration

- Filtration is a process in which we <u>discard some groups</u>, according to a group-wise computation that evaluates True or False.
- As the name suggests, `filter()` <u>does not change the data in any capacity, but instead selects a subset of the data</u>.

![picture](https://miro.medium.com/max/826/1*lDZgurT8zja-u77DW758KQ.png)

Image from https://towardsdatascience.com/pandas-groupby-aggregate-transform-filter-c95ba3444bbb

In [81]:
# Working DataFrame for the filtration examples: identifiers, primary type and three stats
filter_columns = ['pokedex_number','name','type1','hp','attack','defense']
df = df_pokemon[filter_columns]
df.head()

Unnamed: 0,pokedex_number,name,type1,hp,attack,defense
0,1,Bulbasaur,grass,45,49,49
1,2,Ivysaur,grass,60,62,63
2,3,Venusaur,grass,80,100,123
3,4,Charmander,fire,39,52,43
4,5,Charmeleon,fire,58,64,58


In [82]:
# Groupby without filter: number of rows (pokemons) in every 'type1' group
pokemons_by_type = df.groupby('type1')
pokemons_by_type.size()

Unnamed: 0_level_0,0
type1,Unnamed: 1_level_1
bug,72
dark,29
dragon,27
electric,39
fairy,18
fighting,28
fire,52
flying,3
ghost,27
grass,78


In [83]:
# Groupby with a filter

def is_small_group(members):
    """Return True when the group has fewer than 20 'pokedex_number' entries."""
    return members['pokedex_number'].count() < 20

# Groupby -> Filtration
# Keep only the rows that belong to a group whose number of pokemons is less than 20
result = df.groupby('type1').filter(is_small_group)

# groupby+filter returns a subset of the rows of the original dataframe (df), not a per-group summary
result

Unnamed: 0,pokedex_number,name,type1,hp,attack,defense
34,35,Clefairy,fairy,70,45,48
35,36,Clefable,fairy,95,70,73
172,173,Cleffa,fairy,50,25,28
174,175,Togepi,fairy,35,20,65
175,176,Togetic,fairy,55,40,85
208,209,Snubbull,fairy,60,80,50
209,210,Granbull,fairy,90,120,75
467,468,Togekiss,fairy,85,50,95
640,641,Tornadus,flying,79,100,80
668,669,Flabébé,fairy,44,38,39


## 3.3 Groupby's wrapup
1. Result returned from `.groupby()` is neither pandas.Series nor pandas.DataFrame, but a Groupby object.

2. A Groupby object can be further processed with aggregation functions. The result is <u>an aggregated DataFrame whose number of rows equals the number of groups</u>.
  - Standalone aggregation function: `df.groupby(...).mean(...)`, ...
  - Compound aggregation function: `df.groupby(...).agg(...)`

3. A Groupby object can be further processed with apply functions, `df.groupby(..., group_keys).apply(func)`:
  - `func` can be any function with-or-without aggregation effect and with-or-without cross-column calculation.
  - If `func` is a function with no aggregation effect, the result is <u>a non-aggregated DataFrame whose number of rows equals that of the original DataFrame before being grouped</u>. In this case, watch out for the bool value of `group_keys`.
  - If `func` is a function with aggregation effect, the result is <u>an aggregated DataFrame whose number of rows equals the number of groups</u>.

4. A Groupby object can be further processed with transformation functions, `df.groupby(...).transform(func)`:
  - `func` can be any function with-or-without aggregation effect but <u>any cross-column calculation will cause an error</u>.
  - If `func` is a function with no aggregation effect, it is similar to calling `df.groupby(..., group_keys=False).apply(func)`. The result is <u>a non-aggregated DataFrame whose number of rows equals that of the original DataFrame before being grouped</u>.
  - If `func` is a function with aggregation effect, the result is <u>still a non-aggregated DataFrame whose number of rows equals that of the original DataFrame before being grouped</u> (not the number of groups). However, the values in the transformed columns are not the original values: <u>every row holds the aggregated result of the group it belongs to.</u>

5. A Groupby object can be further processed with filtration functions, `df.groupby(...).filter(group_func)`:
  - `group_func` is a function that receives one group as input and returns True/False (= Does this group meet our criteria?).
  - Result from `df.groupby(...).filter(group_func)` is a direct subset of the original DataFrame before being grouped (no data in `df` is changed). <u>However, some rows in `df` will be filtered out (eliminated) if they belong to groups that don't meet our criteria judging by `group_func`</u>.

# PRACTICE

1. (1 point) Based on the 'type1' attribute, count the number of pokemons of each type. Sort the results in descending order of the number of pokemons.

2. (1 point) Display all types of pokemons (considering 'type1' only) from the best to the worst stats.
  - First, consider 'hp', the larger the better. Then, consider the average between 'attack' and 'defense', the larger the better
  - The final display should include at least 'type1', 'hp', 'attack' and 'defense' columns.

3. (1.5 points) Display all pokemons whose 'hp' is greater than the average 'hp' computed over all pokemons with the same 'type1'. Sort the result by 'type1' (ascending), then by 'hp' (descending).