In [3]:
import pandas as pd
import numpy as np

2. Concatenating

In [4]:
df1 = pd.DataFrame(np.full((2, 3), "x", dtype=object), columns=["A", "B", "C"])
df1

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x


In [5]:
df2 = pd.DataFrame(np.full((3, 3), "o", dtype=object), columns=["A", "B", "C"])
df2

Unnamed: 0,A,B,C
0,o,o,o
1,o,o,o
2,o,o,o


In [6]:
df3 = pd.DataFrame(np.full((2, 2), "v", dtype=object), columns=["D", "E"])
df3

Unnamed: 0,D,E
0,v,v
1,v,v


Concatenation along axis 0

In [7]:
pd.concat([df1, df2])

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x
0,o,o,o
1,o,o,o
2,o,o,o


In [8]:
pd.concat([df1, df2]).reset_index(drop=True)

Unnamed: 0,A,B,C
0,x,x,x
1,x,x,x
2,o,o,o
3,o,o,o
4,o,o,o


In [9]:
pd.concat([df1, df3])

Unnamed: 0,A,B,C,D,E
0,x,x,x,,
1,x,x,x,,
0,,,,v,v
1,,,,v,v


The `keys` parameter

In [10]:
df4 = pd.concat([df1, df2], keys=["df1", "df2"])
df4

Unnamed: 0,Unnamed: 1,A,B,C
df1,0,x,x,x
df1,1,x,x,x
df2,0,o,o,o
df2,1,o,o,o
df2,2,o,o,o


Concatenating along axis 1

In [11]:
pd.concat([df1, df3], axis=1)

Unnamed: 0,A,B,C,D,E
0,x,x,x,v,v
1,x,x,x,v,v


In [12]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,x,x,x,o,o,o
1,x,x,x,o,o,o
2,,,,o,o,o


The `join` parameter

In [13]:
pd.concat([df1, df2], axis=1, join="inner")

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,x,x,x,o,o,o
1,x,x,x,o,o,o


In [14]:
pd.concat([df1, df3],join="inner")

0
1
0
1


3. Merging and joining

In [15]:
users = pd.DataFrame(
    {
        "userID": [5672, 3452, 2878, 3234],
        "First Name": ["Christopher", "Johnnie", "Debbie", "Teri"],
        "Last Name": ["Boyd", "Baldwin", "Alvarez", "Gill"],
    }
)
users

Unnamed: 0,userID,First Name,Last Name
0,5672,Christopher,Boyd
1,3452,Johnnie,Baldwin
2,2878,Debbie,Alvarez
3,3234,Teri,Gill


In [16]:
scores = pd.DataFrame(
    {"userID": [2878, 5672, 3234, 5672, 2878], "Score": [84, 56, 72, 77, 88]}
)
scores

Unnamed: 0,userID,Score
0,2878,84
1,5672,56
2,3234,72
3,5672,77
4,2878,88


In [17]:
# No key is given, so pandas looks for the column the two DataFrames have in
# common (here: userID) and merges the two DataFrames on that column.
merged_df = pd.merge(users, scores)
merged_df

Unnamed: 0,userID,First Name,Last Name,Score
0,5672,Christopher,Boyd,56
1,5672,Christopher,Boyd,77
2,2878,Debbie,Alvarez,84
3,2878,Debbie,Alvarez,88
4,3234,Teri,Gill,72


In [18]:
scores2 = pd.DataFrame(
    {"studentID": [2878, 5672, 3234, 5672, 2878], "Score": [84, 56, 72, 77, 88]}
)
scores2

Unnamed: 0,studentID,Score
0,2878,84
1,5672,56
2,3234,72
3,5672,77
4,2878,88


In [19]:
# `users` and `scores2` share no column name, so pandas cannot infer the key
# and raises "MergeError: No common columns to perform merge on".
# The expected error is caught and shown so the notebook still runs top to bottom.
try:
    pd.merge(users, scores2)
except pd.errors.MergeError as err:
    print(f"MergeError: {err}")

In [20]:
pd.merge(users, scores2, left_on="userID", right_on="studentID")

Unnamed: 0,userID,First Name,Last Name,studentID,Score
0,5672,Christopher,Boyd,5672,56
1,5672,Christopher,Boyd,5672,77
2,2878,Debbie,Alvarez,2878,84
3,2878,Debbie,Alvarez,2878,88
4,3234,Teri,Gill,3234,72


Merging on multiple columns

In [21]:
gold = pd.DataFrame(
    {
        "Code": ["CAN", "GER", "USA", "NOR"],
        "Country": ["Canada", "Germany", "United States", "Norway"],
        "Total": [14, 10, 9, 9],
    }
)
gold

Unnamed: 0,Code,Country,Total
0,CAN,Canada,14
1,GER,Germany,10
2,USA,United States,9
3,NOR,Norway,9


In [22]:
bronze = pd.DataFrame(
    {
        "Code": ["USA", "GER", "NOR", "AUS"],
        "Country": ["United States", "Germany", "Norway", "Austria"],
        "Total": [13, 7, 7, 6],
    }
)
bronze

Unnamed: 0,Code,Country,Total
0,USA,United States,13
1,GER,Germany,7
2,NOR,Norway,7
3,AUS,Austria,6


In [23]:
# By default, pandas merges on *all* common columns. The merged DataFrame
# therefore only keeps rows where the Code, Country, and Total columns are
# identical in both DataFrames. No row of `gold` and `bronze` agrees on all
# three, which is why the result below is empty.
pd.merge(gold, bronze)

Unnamed: 0,Code,Country,Total


In [24]:
pd.merge(gold, bronze, on=["Code", "Country"])

Unnamed: 0,Code,Country,Total_x,Total_y
0,GER,Germany,10,7
1,USA,United States,9,13
2,NOR,Norway,9,7


In [25]:
pd.merge(gold, bronze, on=["Code", "Country"], suffixes=["_gold", "_bronze"])

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,GER,Germany,10,7
1,USA,United States,9,13
2,NOR,Norway,9,7


Different types of joins

In [26]:
# An `outer` join returns the merge of the matched rows plus the unmatched
# rows from both the left and the right DataFrame.
pd.merge(
    gold,
    bronze,
    on=["Code", "Country"],
    suffixes=["_gold", "_bronze"],
    how="outer",
)

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,CAN,Canada,14.0,
1,GER,Germany,10.0,7.0
2,USA,United States,9.0,13.0
3,NOR,Norway,9.0,7.0
4,AUS,Austria,,6.0


`left` join: return the merge of the matched rows and the unmatched values from only the left DataFrame  
`right` join: return the merge of the matched rows and the unmatched values from only the right DataFrame


In [27]:
pd.merge(
    gold, bronze, on=["Code", "Country"], suffixes=["_gold", "_bronze"], how="left"
)

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,CAN,Canada,14,
1,GER,Germany,10,7.0
2,USA,United States,9,13.0
3,NOR,Norway,9,7.0


In [28]:
pd.merge(
    gold, bronze, on=["Code", "Country"], suffixes=["_gold", "_bronze"], how="right"
)

Unnamed: 0,Code,Country,Total_gold,Total_bronze
0,USA,United States,9.0,13
1,GER,Germany,10.0,7
2,NOR,Norway,9.0,7
3,AUS,Austria,,6


**Remark**: We would like to draw your attention to one particular issue that can arise when performing an outer merge. Suppose we have two DataFrames containing integer values:

In [29]:
df1 = pd.DataFrame({"key": [1, 2, 3, 4], "val1": [1, 2, 3, 4]})
df2 = pd.DataFrame({"key": [1, 2, 3, 5], "val2": [1, 2, 3, 4]})

In [30]:
df_in = df1.merge(df2, how="inner")
df_in

Unnamed: 0,key,val1,val2
0,1,1,1
1,2,2,2
2,3,3,3


In [31]:
df_out = df1.merge(df2, how="outer")
df_out

Unnamed: 0,key,val1,val2
0,1,1.0,1.0
1,2,2.0,2.0
2,3,3.0,3.0
3,4,4.0,
4,5,,4.0


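Note that the `dtype` of the columns that received `NaN` values (`val1` and `val2`) was changed from `int` to `float`: `NaN` is a floating-point value and cannot be stored in a plain integer column. The `key` column has no missing values and stays an integer column.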

4. Exercise: merging with different joins

In [32]:
left = pd.DataFrame(
    {"key1": ["a", "b", "c"], "key2": ["A", "B", "C"], "lval": [0, 1, 2]}
)
right = pd.DataFrame(
    {"key1": ["a", "b", "c"], "key2": ["A", "D", "C"], "rval": [3, 4, 6]}
)

In [33]:
left

Unnamed: 0,key1,key2,lval
0,a,A,0
1,b,B,1
2,c,C,2


In [34]:
right

Unnamed: 0,key1,key2,rval
0,a,A,3
1,b,D,4
2,c,C,6


Question A.

In [35]:
left.merge(right, how='inner')

Unnamed: 0,key1,key2,lval,rval
0,a,A,0,3
1,c,C,2,6


Question B.

In [36]:
left.merge(right, how='outer')

Unnamed: 0,key1,key2,lval,rval
0,a,A,0.0,3.0
1,b,B,1.0,
2,c,C,2.0,6.0
3,b,D,,4.0


Question C.

In [37]:
pd.merge(left, right, how='right')

Unnamed: 0,key1,key2,lval,rval
0,a,A,0.0,3
1,b,D,,4
2,c,C,2.0,6


Question D.

In [38]:
pd.merge(left, right, on=['key1'], how='outer', suffixes=['_x','_y'])

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,a,A,0,A,3
1,b,B,1,D,4
2,c,C,2,C,6


5. Pivoting

The pivot() function is applied to a DataFrame and has three important parameters: index, columns, and values:
- it takes the entries from the column passed to index and makes these the indices of the new DataFrame
- it takes the entries from the column passed to columns and makes these the column labels of the new DataFrame
- it takes the entries from the column passed to values and uses them to fill in the new DataFrame, by putting them in the corresponding columns


In [39]:
# Long-format sample data: one measurement per (time, coordinate) pair,
# for four time steps and the three coordinates x, y and z.
values = [
    3, 81, 1,
    56, 71, 91,
    54, 94, 64,
    90, 21, 36,
]
coordinates = 4 * ["x", "y", "z"]
time = [step for step in range(4) for _rep in range(3)]
df = pd.DataFrame(
    {
        "time": time,
        "coordinates": coordinates,
        "values": values,
    }
)
df

Unnamed: 0,time,coordinates,values
0,0,x,3
1,0,y,81
2,0,z,1
3,1,x,56
4,1,y,71
5,1,z,91
6,2,x,54
7,2,y,94
8,2,z,64
9,3,x,90


In [40]:
df_pivot = df.pivot(index="time", columns="coordinates", values="values")
df_pivot

coordinates,x,y,z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,81,1
1,56,71,91
2,54,94,64
3,90,21,36


The `pivot_table()` function

In [41]:
values2 = [6, 82, 9, 47, 8, 12, 64, 88, 53, 46, 59, 60]

In [42]:
df2 = pd.DataFrame(
    {"time": time * 2, "coordinates": coordinates * 2, "values": values + values2}
)
df2

Unnamed: 0,time,coordinates,values
0,0,x,3
1,0,y,81
2,0,z,1
3,1,x,56
4,1,y,71
5,1,z,91
6,2,x,54
7,2,y,94
8,2,z,64
9,3,x,90


In [43]:
# Every (time, coordinates) pair occurs twice in df2, so pivot() cannot place
# both values into a single cell and raises
# "ValueError: Index contains duplicate entries, cannot reshape".
# The expected error is caught and shown so the notebook still runs top to bottom.
try:
    df2.pivot(index="time", columns="coordinates", values="values")
except ValueError as err:
    print(f"ValueError: {err}")

In [44]:
# pivot_table() aggregates duplicate entries (with the mean by default):
# the entry at row 0 and column x is the average of the two entries 3 and 6
# that mapped to this position.
df2_pivot = df2.pivot_table(index="time", columns="coordinates", values="values")
df2_pivot

coordinates,x,y,z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4.5,81.5,5.0
1,51.5,39.5,51.5
2,59.0,91.0,58.5
3,68.0,40.0,48.0


In [45]:
def distance(a):
    """Return the spread of ``a``: its maximum minus its minimum.

    Intended as a custom ``aggfunc`` for ``pivot_table()``; ``a`` can be any
    array-like accepted by ``np.max`` / ``np.min``.
    """
    return np.max(a) - np.min(a)

In [46]:
df2_pivot = df2.pivot_table(
    index="time", columns="coordinates", values="values", aggfunc=distance
)
df2_pivot

coordinates,x,y,z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,1,8
1,9,63,79
2,10,6,11
3,44,38,24


In [47]:
df2_pivot = df2.pivot_table(
    index="time", columns="coordinates", values="values", aggfunc=tuple
)
df2_pivot

coordinates,x,y,z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"(3, 6)","(81, 82)","(1, 9)"
1,"(56, 47)","(71, 8)","(91, 12)"
2,"(54, 64)","(94, 88)","(64, 53)"
3,"(90, 46)","(21, 59)","(36, 60)"


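A subtle difference: `pivot_table()` aggregates the entries of the column(s) passed to `values` (by default with the mean), so with the default aggregation it can only handle numerical data types, whereas `pivot()` merely reshapes the data without aggregating and therefore works with both numeric and non-numeric data types. To see this better, let’s consider this example: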

In [48]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
                           'two'],
                   'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'baz': [1, 2, 3, 4, 5, 6],
                   'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [49]:
df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])

Unnamed: 0_level_0,baz,baz,baz,zoo,zoo,zoo
bar,A,B,C,A,B,C
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,x,y,z
two,4,5,6,q,w,t


In [50]:
# pivot_table() aggregates with the mean by default, which is undefined for the
# string column "zoo". Older pandas versions silently dropped such columns,
# while recent versions raise a TypeError instead
# (NOTE(review): confirm against the pandas version used for this course).
# Handle both so the cell runs everywhere and still shows the numeric result.
try:
    df_pivot_table = df.pivot_table(index="foo", columns="bar", values=["baz", "zoo"])
except TypeError as err:
    print(f"TypeError: {err}")
    # Only the numeric column can be averaged.
    df_pivot_table = df.pivot_table(index="foo", columns="bar", values=["baz"])
df_pivot_table

6. Exercise: pivoting

In [59]:
songs = pd.read_csv("2_data-analysis/c2_songs.csv")
songs

Unnamed: 0,Musician,Genre,Name,Decade,Minutes
0,Led Zeppelin,hard rock,Stairway to Heaven,70,08:02
1,Led Zeppelin,hard rock,Kashmir,70,08:37
2,Led Zeppelin,hard rock,Immigrant Song,70,02:26
3,Led Zeppelin,hard rock,Whole Lotta Love,60,05:33
4,Led Zeppelin,hard rock,Black Dog,70,04:55
5,Led Zeppelin,hard rock,Good Times Bad Times,60,02:43
6,Led Zeppelin,hard rock,Moby Dick,60,04:25
7,Led Zeppelin,hard rock,Ramble On,60,04:35
8,Led Zeppelin,hard rock,All My Love,70,05:53
9,Led Zeppelin,hard rock,The Song Remains the Same,70,05:24


In [60]:
def count_x(x):
    """Count the entries of ``x`` that NumPy treats as non-zero."""
    n_nonzero = np.count_nonzero(x)
    return n_nonzero

# One row per (Musician, Decade) pair and one column per Genre. Each cell is
# count_x applied to the 'Name' entries that fall into it; combinations with
# no entries are filled with 0.
songs.pivot_table(
    index=['Musician', 'Decade'], 
    columns=['Genre'], 
    values='Name',
    aggfunc=count_x,
    fill_value=0)

Unnamed: 0_level_0,Genre,folk rock,hard rock,pop rock
Musician,Decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bob Dylan,60,2,0,0
Bob Dylan,70,1,0,0
David Bowie,60,0,0,1
David Bowie,70,0,0,3
David Bowie,80,0,0,1
David Bowie,90,0,0,1
Led Zeppelin,60,0,5,0
Led Zeppelin,70,0,6,0


7. Hierarchical indexing

In [65]:
# MultiIndex for the rows.
# `row_levels` lists the distinct labels of each level (outer, then inner);
# `row_labels` gives, for every row, the position of its label in that level.
row_levels = [["R0", "R1"], ["r00", "r01", "r10", "r11"]]
row_labels = [[0, 0, 1, 1], [0, 1, 2, 3]]
row_indices = pd.MultiIndex(levels=row_levels, codes=row_labels)

In [67]:
# MultiIndex for the columns.
# `col_levels` lists the distinct labels of each level (outer, then inner);
# `col_labels` gives, for every column, the position of its label in that level.
col_levels = [["C0", "C1"], ["c00", "c01", "c10", "c11"]]
col_labels = [[0, 0, 1, 1], [0, 1, 2, 3]]
col_indices = pd.MultiIndex(levels=col_levels, codes=col_labels)

In [68]:
# define the data
data = np.random.randint(5, size=(4,4))
# define the DataFrame
pd.DataFrame(data, index=row_indices, columns=col_indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,C0,C0,C1,C1
Unnamed: 0_level_1,Unnamed: 1_level_1,c00,c01,c10,c11
R0,r00,3,1,3,4
R0,r01,2,1,4,2
R1,r10,1,3,3,4
R1,r11,1,0,2,0


8. Stacking and unstacking

In [71]:
# Hierarchical row index: outer labels R0/R1, inner labels r00..r11.
row_levels = [["R0", "R1"], ["r00", "r01", "r10", "r11"]]
row_labels = [[0, 0, 1, 1], [0, 1, 2, 3]]
row_indices = pd.MultiIndex(levels=row_levels, codes=row_labels)

# Hierarchical column index: outer labels C0/C1, inner labels c00..c11.
col_levels = [["C0", "C1"], ["c00", "c01", "c10", "c11"]]
col_labels = [[0, 0, 1, 1], [0, 1, 2, 3]]
col_indices = pd.MultiIndex(levels=col_levels, codes=col_labels)

# The integers 0..15 laid out row by row in a 4x4 grid.
data = np.arange(16).reshape((4, 4))

# Combine data and both indices into the DataFrame used below.
df = pd.DataFrame(data=data, index=row_indices, columns=col_indices)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,C0,C0,C1,C1
Unnamed: 0_level_1,Unnamed: 1_level_1,c00,c01,c10,c11
R0,r00,0,1,2,3
R0,r01,4,5,6,7
R1,r10,8,9,10,11
R1,r11,12,13,14,15


In [72]:
# stack() takes the innermost column level
# and turns it into the innermost row level
df.stack()

Unnamed: 0,Unnamed: 1,Unnamed: 2,C0,C1
R0,r00,c00,0.0,
R0,r00,c01,1.0,
R0,r00,c10,,2.0
R0,r00,c11,,3.0
R0,r01,c00,4.0,
R0,r01,c01,5.0,
R0,r01,c10,,6.0
R0,r01,c11,,7.0
R1,r10,c00,8.0,
R1,r10,c01,9.0,


In [73]:
df.unstack()

Unnamed: 0_level_0,C0,C0,C0,C0,C0,C0,C0,C0,C1,C1,C1,C1,C1,C1,C1,C1
Unnamed: 0_level_1,c00,c00,c00,c00,c01,c01,c01,c01,c10,c10,c10,c10,c11,c11,c11,c11
Unnamed: 0_level_2,r00,r01,r10,r11,r00,r01,r10,r11,r00,r01,r10,r11,r00,r01,r10,r11
R0,0.0,4.0,,,1.0,5.0,,,2.0,6.0,,,3.0,7.0,,
R1,,,8.0,12.0,,,9.0,13.0,,,10.0,14.0,,,11.0,15.0


Stacking and unstacking on different levels

In [74]:
df.stack(level=0)

Unnamed: 0,Unnamed: 1,Unnamed: 2,c00,c01,c10,c11
R0,r00,C0,0.0,1.0,,
R0,r00,C1,,,2.0,3.0
R0,r01,C0,4.0,5.0,,
R0,r01,C1,,,6.0,7.0
R1,r10,C0,8.0,9.0,,
R1,r10,C1,,,10.0,11.0
R1,r11,C0,12.0,13.0,,
R1,r11,C1,,,14.0,15.0


In [80]:
df.stack(level=0).unstack().swaplevel(axis=1).dropna(axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,C0,C0,C1,C1
Unnamed: 0_level_1,Unnamed: 1_level_1,c00,c01,c10,c11
R0,r00,0.0,1.0,2.0,3.0
R0,r01,4.0,5.0,6.0,7.0
R1,r10,8.0,9.0,10.0,11.0
R1,r11,12.0,13.0,14.0,15.0


9. Grouping

![Screen_Shot_2017-12-15_at_10.46.23.png](attachment:Screen_Shot_2017-12-15_at_10.46.23.png)

In [82]:

# Sample data for the grouping examples: ten players with their team, rank,
# name and two scores.
raw_data = {
    "team": ["Ten Snakes"] * 4 + ["Nine Monkeys"] * 4 + ["Eight Eagles"] * 2,
    "rank": ["1st", "1st", "2nd", "2nd", "1st", "1st", "2nd", "2nd", "1st", "2nd"],
    "name": [
        "James", "Allen", "Matthew", "James",
        "Devon", "Sam", "Justin", "Sam",
        "Paul", "Ross",
    ],
    "score1": [16, 35, 55, 29, 2, 61, 68, 41, 94, 18],
    "score2": [81, 65, 54, 44, 28, 93, 2, 5, 53, 99],
}
# Pass the column order explicitly so the display does not depend on dict order.
df = pd.DataFrame(
    raw_data,
    columns=["team", "rank", "name", "score1", "score2"],
)
df

Unnamed: 0,team,rank,name,score1,score2
0,Ten Snakes,1st,James,16,81
1,Ten Snakes,1st,Allen,35,65
2,Ten Snakes,2nd,Matthew,55,54
3,Ten Snakes,2nd,James,29,44
4,Nine Monkeys,1st,Devon,2,28
5,Nine Monkeys,1st,Sam,61,93
6,Nine Monkeys,2nd,Justin,68,2
7,Nine Monkeys,2nd,Sam,41,5
8,Eight Eagles,1st,Paul,94,53
9,Eight Eagles,2nd,Ross,18,99


Grouping by a single variable

In [103]:
grouped = df.groupby('team')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9291431220>

In [104]:
list(grouped)

[('Eight Eagles',
             team rank  name  score1  score2
  8  Eight Eagles  1st  Paul      94      53
  9  Eight Eagles  2nd  Ross      18      99),
 ('Nine Monkeys',
             team rank    name  score1  score2
  4  Nine Monkeys  1st   Devon       2      28
  5  Nine Monkeys  1st     Sam      61      93
  6  Nine Monkeys  2nd  Justin      68       2
  7  Nine Monkeys  2nd     Sam      41       5),
 ('Ten Snakes',
           team rank     name  score1  score2
  0  Ten Snakes  1st    James      16      81
  1  Ten Snakes  1st    Allen      35      65
  2  Ten Snakes  2nd  Matthew      55      54
  3  Ten Snakes  2nd    James      29      44)]

In [105]:
grouped.describe()

Unnamed: 0_level_0,score1,score1,score1,score1,score1,score1,score1,score1,score2,score2,score2,score2,score2,score2,score2,score2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Eight Eagles,2.0,56.0,53.740115,18.0,37.0,56.0,75.0,94.0,2.0,76.0,32.526912,53.0,64.5,76.0,87.5,99.0
Nine Monkeys,4.0,43.0,29.631065,2.0,31.25,51.0,62.75,68.0,4.0,32.0,42.292631,2.0,4.25,16.5,44.25,93.0
Ten Snakes,4.0,33.75,16.23525,16.0,25.75,32.0,40.0,55.0,4.0,61.0,15.853496,44.0,51.5,59.5,69.0,81.0


In [106]:
grouped.size()

team
Eight Eagles    2
Nine Monkeys    4
Ten Snakes      4
dtype: int64

In [107]:
grouped.count()

Unnamed: 0_level_0,rank,name,score1,score2
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eight Eagles,2,2,2,2
Nine Monkeys,4,4,4,4
Ten Snakes,4,4,4,4


In [108]:
grouped.get_group("Ten Snakes")

Unnamed: 0,team,rank,name,score1,score2
0,Ten Snakes,1st,James,16,81
1,Ten Snakes,1st,Allen,35,65
2,Ten Snakes,2nd,Matthew,55,54
3,Ten Snakes,2nd,James,29,44


Grouping by multiple index levels

In [109]:
df2 = df.set_index(['team','rank'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,name,score1,score2
team,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ten Snakes,1st,James,16,81
Ten Snakes,1st,Allen,35,65
Ten Snakes,2nd,Matthew,55,54
Ten Snakes,2nd,James,29,44
Nine Monkeys,1st,Devon,2,28
Nine Monkeys,1st,Sam,61,93
Nine Monkeys,2nd,Justin,68,2
Nine Monkeys,2nd,Sam,41,5
Eight Eagles,1st,Paul,94,53
Eight Eagles,2nd,Ross,18,99


In [110]:
grouped2 = df2.groupby(level=["team", "rank"])

In [111]:
grouped2.size()

team          rank
Eight Eagles  1st     1
              2nd     1
Nine Monkeys  1st     2
              2nd     2
Ten Snakes    1st     2
              2nd     2
dtype: int64

In [112]:
grouped2.get_group(("Eight Eagles", "1st"))

Unnamed: 0_level_0,Unnamed: 1_level_0,name,score1,score2
team,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eight Eagles,1st,Paul,94,53


Applying a function

In [115]:
# Sum every column within each team (string columns are concatenated).
# The "sum" alias gives the same result as np.sum without the deprecation
# warning that recent pandas versions emit for NumPy reducers passed to agg().
grouped.agg("sum")

Unnamed: 0_level_0,rank,name,score1,score2
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eight Eagles,1st2nd,PaulRoss,112,152
Nine Monkeys,1st1st2nd2nd,DevonSamJustinSam,172,128
Ten Snakes,1st1st2nd2nd,JamesAllenMatthewJames,135,244


In [117]:
# Sum every column within each (team, rank) group (string columns are
# concatenated). The "sum" alias gives the same result as np.sum without the
# deprecation warning that recent pandas versions emit for NumPy reducers.
grouped2.agg("sum")

Unnamed: 0_level_0,Unnamed: 1_level_0,name,score1,score2
team,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eight Eagles,1st,Paul,94,53
Eight Eagles,2nd,Ross,18,99
Nine Monkeys,1st,DevonSam,63,121
Nine Monkeys,2nd,JustinSam,109,7
Ten Snakes,1st,JamesAllen,51,146
Ten Snakes,2nd,MatthewJames,84,98


Filtering by groups

In [119]:
def f(x):
    """Tell whether a group's mean ``score1`` and mean ``score2`` both exceed 50.

    Written for ``GroupBy.filter``: ``x`` is the DataFrame of one group, and
    the group is kept when the returned value is true.
    """
    threshold = 50
    group_means = x.mean(numeric_only=True)
    score1_ok = group_means.score1 > threshold
    score2_ok = group_means.score2 > threshold
    return score1_ok & score2_ok

In [120]:
grouped.filter(f)

Unnamed: 0,team,rank,name,score1,score2
8,Eight Eagles,1st,Paul,94,53
9,Eight Eagles,2nd,Ross,18,99


10. Exercise: grouping and filtering

In [134]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,name,score1,score2
team,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ten Snakes,1st,James,16,81
Ten Snakes,1st,Allen,35,65
Ten Snakes,2nd,Matthew,55,54
Ten Snakes,2nd,James,29,44
Nine Monkeys,1st,Devon,2,28
Nine Monkeys,1st,Sam,61,93
Nine Monkeys,2nd,Justin,68,2
Nine Monkeys,2nd,Sam,41,5
Eight Eagles,1st,Paul,94,53
Eight Eagles,2nd,Ross,18,99


In [130]:
grouped2 = df2.groupby(level=['team', 'rank'])


In [142]:
def f(x):
    """Tell whether the lowest ``score1`` and the lowest ``score2`` of a group both exceed 50.

    Written for ``GroupBy.filter``: ``x`` is the DataFrame of one group, and
    the group is kept when the returned value is true.
    """
    score1_ok = x.score1.min() > 50
    if not score1_ok:
        # score2 is not inspected once score1 has already failed the check
        return score1_ok
    return x.score2.min() > 50

In [143]:
# Grouped by 'team' and 'rank'
grouped2.filter(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,score1,score2
team,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eight Eagles,1st,Paul,94,53


In [144]:
# Grouped by 'team' only
grouped.filter(f)

Unnamed: 0,team,rank,name,score1,score2
