# Grouping data

In a groupby, you split a dataset into groups according to the values in one or more columns.  We will look at the split-apply-combine approach when using groupby.

In [1]:
import pandas as pd

In [2]:
# Hypothetical game scores: 3 teams, 5 seasons (2000-2004), 4 games per season.
# Rows are ordered season by season, with the teams cycling phoenix -> hawks -> cardinals.

dict1 = {
    'team': ['phoenix', 'hawks', 'cardinals'] * 5,
    # years are stored as strings, each one repeated once per team
    'year': [str(season) for season in range(2000, 2005) for _ in range(3)],
    'game_1': [30, 41, 69, 76, 80, 56, 50, 100, 23, 55, 74, 39, 47, 64, 33],
    'game_2': [36, 44, 67, 72, 83, 89, 97, 100, 24, 55, 75, 31, 100, 77, 42],
    'game_3': [66, 34, 97, 82, 53, 98, 74, 0, 21, 58, 79, 39, 33, 44, 57],
    'game_4': [52, 98, 23, 63, 55, 21, 99, 78, 91, 34, 93, 28, 63, 91, 9],
}

In [3]:
# Build a DataFrame from the dictionary of columns (one row per team and season), then display it
df = pd.DataFrame(data=dict1)
df

Unnamed: 0,team,year,game_1,game_2,game_3,game_4
0,phoenix,2000,30,36,66,52
1,hawks,2000,41,44,34,98
2,cardinals,2000,69,67,97,23
3,phoenix,2001,76,72,82,63
4,hawks,2001,80,83,53,55
5,cardinals,2001,56,89,98,21
6,phoenix,2002,50,97,74,99
7,hawks,2002,100,100,0,78
8,cardinals,2002,23,24,21,91
9,phoenix,2003,55,55,58,34


### Split

In [4]:
# Split the data into one group per year, then sum the game scores within each group.
# numeric_only=True keeps the string 'team' column out of the aggregation; without it,
# pandas >= 2.0 would concatenate the team names into an extra column.
df.groupby('year').sum(numeric_only=True)

Unnamed: 0_level_0,game_1,game_2,game_3,game_4
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,140,147,197,173
2001,212,244,233,139
2002,173,221,95,268
2003,168,161,176,155
2004,144,219,134,163


In [5]:
# Selecting a single column with single brackets returns a Series of per-team totals
df.groupby(by='team')['game_3'].sum()

team
cardinals    312
hawks        210
phoenix      313
Name: game_3, dtype: int64

In [6]:
# Selecting with a list (double brackets) keeps the result as a DataFrame instead of a Series
df.groupby(by='team')[['game_3']].sum()

Unnamed: 0_level_0,game_3
team,Unnamed: 1_level_1
cardinals,312
hawks,210
phoenix,313


In [7]:
# Group on two keys at once; the summed result is indexed by a (year, team) MultiIndex
df_multi = df.groupby(by=['year', 'team']).sum()
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,game_1,game_2,game_3,game_4
year,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000,cardinals,69,67,97,23
2000,hawks,41,44,34,98
2000,phoenix,30,36,66,52
2001,cardinals,56,89,98,21
2001,hawks,80,83,53,55
2001,phoenix,76,72,82,63
2002,cardinals,23,24,21,91
2002,hawks,100,100,0,78
2002,phoenix,50,97,74,99
2003,cardinals,39,31,39,28


In [8]:
# Take a cross-section: pick one value ('2004') from a named index level ('year').
# The key is a string because the years in this data set are stored as strings.
df_multi.xs(key='2004', level='year')

Unnamed: 0_level_0,game_1,game_2,game_3,game_4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cardinals,33,42,57,9
hawks,64,77,44,91
phoenix,47,100,33,63


In [9]:
# Move both index levels back into ordinary columns, restoring a default integer index
df_multi.reset_index(level=['year', 'team'])

Unnamed: 0,year,team,game_1,game_2,game_3,game_4
0,2000,cardinals,69,67,97,23
1,2000,hawks,41,44,34,98
2,2000,phoenix,30,36,66,52
3,2001,cardinals,56,89,98,21
4,2001,hawks,80,83,53,55
5,2001,phoenix,76,72,82,63
6,2002,cardinals,23,24,21,91
7,2002,hawks,100,100,0,78
8,2002,phoenix,50,97,74,99
9,2003,cardinals,39,31,39,28


### Apply

In [10]:
# Apply a function to each group (here: sum every game column within each team).
# The columns are selected with a list: indexing a groupby with a bare tuple of keys
# was deprecated in pandas 1.0 and raises an error in pandas 2.0.
df_apply = df.groupby('team')[['game_1', 'game_2', 'game_3', 'game_4']].apply(lambda x: x.sum())
df_apply

Unnamed: 0_level_0,game_1,game_2,game_3,game_4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cardinals,220,253,312,172
hawks,359,379,210,415
phoenix,258,360,313,311


### Combine

In [11]:
# Combine the per-team totals into a single Series: the mean of each game column across teams
df_combine = df_apply.mean(axis=0)
df_combine

game_1    279.000000
game_2    330.666667
game_3    278.333333
game_4    299.333333
dtype: float64

In [None]:
# end