8.1 Hierarchical Indexing

In [8]:
import pandas as pd
import numpy as np


data = pd.Series(np.random.randn(9),
                 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])

print(data)
print('\n')
print(data.index)
print('\n')
print(data['b'])
print('\n')
print(data['b':'c'])
print('\n')
print(data.loc[['b', 'd']])
print('\n')
print(data.loc[:, 2])
print('\n')
print(data.unstack())
print('\n')
print(data.unstack().stack())

a  1   -0.753332
   2    0.859014
   3   -0.996806
b  1    0.462221
   3   -1.020178
c  1    0.439455
   2   -0.377132
d  2   -1.674719
   3    0.178594
dtype: float64


MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )


1    0.462221
3   -1.020178
dtype: float64


b  1    0.462221
   3   -1.020178
c  1    0.439455
   2   -0.377132
dtype: float64


b  1    0.462221
   3   -1.020178
d  2   -1.674719
   3    0.178594
dtype: float64


a    0.859014
c   -0.377132
d   -1.674719
dtype: float64


          1         2         3
a -0.753332  0.859014 -0.996806
b  0.462221       NaN -1.020178
c  0.439455 -0.377132       NaN
d       NaN -1.674719  0.178594


a  1   -0.753332
   2    0.859014
   3   -0.996806
b  1    0.462221
   3   -1.020178
c  1    0.439455
   2   -0.377132
d  2   -1.674719
   3    0.178594
dtype: float64


In [11]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
                     columns=[["Ohio", "Ohio", "Colorado"],
                              ["Green", "Red", "Green"]])

print(frame)

frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']

print('\n')
print(frame)

print('\n')
print(frame['Ohio'])

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11


state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10


In [13]:
# Reordering and Sorting Levels

print(frame.sort_index(level=1))

print('\n')
print(frame.swaplevel(0, 1).sort_index(level=0))


state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11


state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11


In [19]:
# Summary Statistics by Level

print(frame.groupby('key2').sum())
print('\n')
print(frame.T.groupby('color').sum().T)

state  Ohio     Colorado
color Green Red    Green
key2                    
1         6   8       10
2        12  14       16


color      Green  Red
key1 key2            
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10


In [23]:
# Indexing with a DataFrame’s columns

frame = pd.DataFrame({"a": range(7), "b": range(7, 0, -1),
                      "c": ["one", "one", "one", "two", "two",
                            "two", "two"],
                      "d": [0, 1, 2, 0, 1, 2, 3]})

print(frame)

frame2 = frame.set_index(['c', 'd'])
print('\n')
print(frame2)

print('\n')
print(frame.set_index(['c', 'd'], drop=False))

print('\n')
print(frame2.reset_index)

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3


       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1


       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3


<bound method DataFrame.reset_index of        a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1>


8.2 Combining and Merging Datasets

In [26]:
# Database-Style DataFrame Joins

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})

df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

print(df1)
print('\n')
print(df2)

print('\n')
print(pd.merge(df1, df2))

print('\n')
print(pd.merge(df1, df2, on='key'))

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6


  key  data2
0   a      0
1   b      1
2   d      2


  key  data1  data2
0   b      0      1
1   b      1      1
2   a      2      0
3   a      4      0
4   a      5      0
5   b      6      1


  key  data1  data2
0   b      0      1
1   b      1      1
2   a      2      0
3   a      4      0
4   a      5      0
5   b      6      1


In [28]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

print(pd.merge(df3, df4, left_on='lkey', right_on='rkey'))

print('\n')
print(pd.merge(df1, df2, how='outer'))

  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    a      2    a      0
3    a      4    a      0
4    a      5    a      0
5    b      6    b      1


  key  data1  data2
0   a    2.0    0.0
1   a    4.0    0.0
2   a    5.0    0.0
3   b    0.0    1.0
4   b    1.0    1.0
5   b    6.0    1.0
6   c    3.0    NaN
7   d    NaN    2.0


In [32]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

print(df1)
print('\n')
print(df2)
print('\n')
print(pd.merge(df1, df2, on='key', how='left'))

print('\n')
print(pd.merge(df1, df2, how='inner'))

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5


  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4


   key  data1  data2
0    b      0    1.0
1    b      0    3.0
2    b      1    1.0
3    b      1    3.0
4    a      2    0.0
5    a      2    2.0
6    c      3    NaN
7    a      4    0.0
8    a      4    2.0
9    b      5    1.0
10   b      5    3.0


  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   a      2      0
5   a      2      2
6   a      4      0
7   a      4      2
8   b      5      1
9   b      5      3


In [35]:
left = pd.DataFrame({"key1": ["foo", "foo", "bar"],
                     "key2": ["one", "two", "one"],
                     "lval": pd.Series([1, 2, 3], dtype='Int64')})
right = pd.DataFrame({"key1": ["foo", "foo", "bar", "bar"],
                      "key2": ["one", "one", "one", "two"],
                      "rval": pd.Series([4, 5, 6, 7], dtype='Int64')})

print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))

print('\n')
print(pd.merge(left, right, on='key1'))

print('\n')
print(pd.merge(left, right, on='key1', suffixes=('_left', '_right')))

  key1 key2  lval  rval
0  bar  one     3     6
1  bar  two  <NA>     7
2  foo  one     1     4
3  foo  one     1     5
4  foo  two     2  <NA>


  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7


  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7


In [39]:
# Merging on Index

left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

print(left1)
print('\n')
print(right1)
print('\n')
print(pd.merge(left1, right1, left_on='key', right_index=True))

print('\n')
print(pd.merge(left1, right1, left_on='key', right_index=True, how='outer'))

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5


   group_val
a        3.5
b        7.0


  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0


  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN


In [42]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                              [2001, 2000, 2000, 2000, 2001, 2002]],
                              columns=['event1', 'event2'])

print(lefth)
print('\n')
print(righth)

print('\n')
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True))

print('\n')
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer'))

     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0


             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11


     key1  key2  data  event1  event2
0    Ohio  2000   0.0       4       5
0    Ohio  2000   0.0       6       7
1    Ohio  2001   1.0       8       9
2    Ohio  2002   2.0      10      11
3  Nevada  2001   3.0       0       1


     key1  key2  data  event1  event2
4  Nevada  2000   NaN     2.0     3.0
3  Nevada  2001   3.0     0.0     1.0
4  Nevada  2002   4.0     NaN     NaN
0    Ohio  2000   0.0     4.0     5.0
0    Ohio  2000   0.0     6.0     7.0
1    Ohio  2001   1.0     8.0     9.0
2    Ohio  2002   2.0    10.0    11.0


In [47]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=["a", "c", "e"],
                     columns=["Ohio", "Nevada"])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=["b", "c", "d", "e"],
                      columns=["Missouri", "Alabama"])

print(left2)
print('\n')
print(right2)

print('\n')
print(pd.merge(left2, right2, how='outer', left_index=True, right_index=True))

print('\n')
print(left2.join(right2, how='outer'))

print('\n')
print(left1.join(right1, on='key'))

   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0


   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0


   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0


   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0


  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN


In [50]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=["a", "c", "e", "f"],
                       columns=["New York", "Oregon"])

print(another)

print('\n')
print(left2.join([right2, another]))

print('\n')
print(left2.join([right2, another], how='outer'))

   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0


   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0


   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
b   NaN     NaN       7.0      8.0       NaN     NaN
d   NaN     NaN      11.0     12.0       NaN     NaN
f   NaN     NaN       NaN      NaN      16.0    17.0


In [52]:
# Concatenating Along an Axis

arr = np.arange(12).reshape((3, 4))
print(arr)
print('\n')
print(np.concatenate([arr, arr], axis=1))

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]


In [54]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

print(pd.concat([s1, s2, s3]))

print('\n')
print(pd.concat([s1, s2, s3], axis=1))

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64


     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0


In [59]:
s4 = pd.concat([s1, s3])
print(s4)

print('\n')
print(pd.concat([s1, s4], axis=1))

print('\n')
print(pd.concat([s1, s4], axis=1, join='inner'))

print('\n')
print(pd.concat([s1, s4], axis=1).reindex(['a', 'b', 'c', 'b', 'e']))

a    0
b    1
f    5
g    6
dtype: int64


     0  1
a  0.0  0
b  1.0  1
f  NaN  5
g  NaN  6


   0  1
a  0  0
b  1  1


     0    1
a  0.0  0.0
b  1.0  1.0
c  NaN  NaN
b  1.0  1.0
e  NaN  NaN


In [62]:
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print(result)
print('\n')
print(result.unstack())

print('\n')
print(pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three']))

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64


         a    b    f    g
one    0.0  1.0  NaN  NaN
two    0.0  1.0  NaN  NaN
three  NaN  NaN  5.0  6.0


   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0


In [68]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=["a", "b", "c"],
                   columns=["one", "two"])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=["a", "c"],
                   columns=["three", "four"])

print(df1)
print('\n')
print(df2)
print('\n')
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))
print('\n')
print(pd.concat({'level1': df1, 'level2': df2}, axis=1))

print('\n')
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['upper', 'lower']))

   one  two
a    0    1
b    2    3
c    4    5


   three  four
a      5     6
c      7     8


  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0


  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0


upper level1     level2     
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0


In [71]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print('\n')
print(df2)

print('\n')
print(pd.concat([df1, df2], ignore_index=True))

          a         b         c         d
0  0.243156  2.803110  0.846964 -1.431339
1  1.155936  1.238180  1.514520 -0.558953
2 -0.514469  0.961703 -0.659726 -0.186390


          b         d         a
0 -0.331582  1.141308  0.924309
1 -0.041742 -0.184906  0.624127


          a         b         c         d
0  0.243156  2.803110  0.846964 -1.431339
1  1.155936  1.238180  1.514520 -0.558953
2 -0.514469  0.961703 -0.659726 -0.186390
3  0.924309 -0.331582       NaN  1.141308
4  0.624127 -0.041742       NaN -0.184906


In [77]:
# Combining Data with Overlap

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=["f", "e", "d", "c", "b", "a"])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=["a", "b", "c", "d", "e", "f"])

b.iloc[-1] = np.nan

print(a)
print('\n')
print(b)

print('\n')
print(np.where(pd.isnull(a), b, a))

print('\n')
print(b[:-2].combine_first(a[2:]))

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64


a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    NaN
dtype: float64


[0.  2.5 2.  3.5 4.5 nan]


a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64


In [79]:
df1 = pd.DataFrame({"a": [1., np.nan, 5., np.nan],
                    "b": [np.nan, 2., np.nan, 6.],
                    "c": range(2, 18, 4)})
df2 = pd.DataFrame({"a": [5., 4., np.nan, 3., 7.],
                    "b": [np.nan, 3., 4., 6., 8.]})

print(df1)
print('\n')
print(df2)
print('\n')
print(df1.combine_first(df2))

     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14


     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0


     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
