In [1]:
### Hierarchical indexing is an important feature of pandas that lets us have multiple (two or more) index levels on an axis.

### It provides a way for us to work with higher dimensional data in a lower dimensional form.


In [2]:
import pandas as pd
import numpy as np

In [5]:
### Labels for the two index levels: the outer level ('a'..'d') and the inner level (1..3).
outer_labels = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]

### Passing a list of label lists as the index gives the Series a two-level MultiIndex.
data = pd.Series(np.random.randn(10), index=[outer_labels, inner_labels])

As we can see above, the index is given as a list of two lists (one per level), which pandas turns into a two-level MultiIndex.

In [6]:
data

a  1    0.085340
   2    0.947069
   3   -0.710097
b  1   -0.329458
   2    0.930887
   3   -0.238747
c  1   -0.378277
   2   -0.711937
d  2    1.171112
   3   -0.734569
dtype: float64

In [7]:
### The output above is a prettified view of a Series that has a MultiIndex as its index.
### The index object itself can be inspected directly:

data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [8]:
### With a hierarchically indexed object, partial indexing is possible, enabling us to concisely select subsets of the data.

In [9]:
### Partial indexing: selecting outer label 'a' returns its rows, indexed by the inner level only.
data['a']

1    0.085340
2    0.947069
3   -0.710097
dtype: float64

In [16]:
### Label slices on the outer level include both endpoints, so the 'c' rows are part of the result.
data['a':'c']

a  1    0.085340
   2    0.947069
   3   -0.710097
b  1   -0.329458
   2    0.930887
   3   -0.238747
c  1   -0.378277
   2   -0.711937
dtype: float64

Let's select the data for the outer index labels 'b' and 'd'.

In [18]:
### Use .loc for explicit label-based selection of several outer-level labels.
data.loc[['b','d']]

b  1   -0.329458
   2    0.930887
   3   -0.238747
d  2    1.171112
   3   -0.734569
dtype: float64

In [19]:
data

a  1    0.085340
   2    0.947069
   3   -0.710097
b  1   -0.329458
   2    0.930887
   3   -0.238747
c  1   -0.378277
   2   -0.711937
d  2    1.171112
   3   -0.734569
dtype: float64

In [20]:
### Selection is even possible in some cases from an "inner" level. Here we select every
### entry whose inner-level label is 2, using .loc for explicit label-based indexing.

data.loc[:, 2]

a    0.947069
b    0.930887
c   -0.711937
d    1.171112
dtype: float64

In the above example, we select (a, 2), (b, 2), (c, 2), and (d, 2), i.e. every entry whose inner-level label is 2.

In [23]:
### Hierarchical indexing plays a critical role in reshaping data and in group-based operations such as forming a pivot table.

### For example, the data Series can be rearranged into a DataFrame using its unstack method.
### Outer/inner combinations that are absent from the Series (here c-3 and d-1) show up as missing values.

data.unstack()

Unnamed: 0,1,2,3
a,0.08534,0.947069,-0.710097
b,-0.329458,0.930887,-0.238747
c,-0.378277,-0.711937,
d,,1.171112,-0.734569


In [24]:
### The reverse operation of unstack is stack. In the output shown here, the missing
### entries introduced by unstack are dropped again, so we get back the original Series.

data.unstack().stack()

a  1    0.085340
   2    0.947069
   3   -0.710097
b  1   -0.329458
   2    0.930887
   3   -0.238747
c  1   -0.378277
   2   -0.711937
d  2    1.171112
   3   -0.734569
dtype: float64

### With a DataFrame, either axis can have a hierarchical index. The following is an example with a hierarchical row index and hierarchical columns.



In [27]:
### Row labels: an outer level ('a', 'b') paired with an inner level (1, 2).
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
### Column labels: an outer level of states paired with an inner level of colors.
column_levels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]

df = pd.DataFrame(np.random.randn(4, 3), index=row_levels, columns=column_levels)

In [28]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0.175539,-0.681855,0.509332
a,2,-0.593076,-0.773491,-0.571042
b,1,0.364081,0.069295,0.467998
b,2,0.861911,-1.923506,-0.776658


In [29]:
df.index

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [30]:
df.columns

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]])

In [31]:
### The hierarchical levels can have names (as strings or any Python objects).

### This assigns names to the two row-index levels, i.e. the level holding a, b, ... and the level holding 1, 2, ...
df.index.names = ['key1','key2']

In [32]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0.175539,-0.681855,0.509332
a,2,-0.593076,-0.773491,-0.571042
b,1,0.364081,0.069295,0.467998
b,2,0.861911,-1.923506,-0.776658


In [33]:
### Now let's assign names to the two column levels as well
df.columns.names = ['state','color']

In [34]:
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0.175539,-0.681855,0.509332
a,2,-0.593076,-0.773491,-0.571042
b,1,0.364081,0.069295,0.467998
b,2,0.861911,-1.923506,-0.776658


In [35]:
### With partial column indexing you can similarly select groups of columns
df['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.175539,-0.681855
a,2,-0.593076,-0.773491
b,1,0.364081,0.069295
b,2,0.861911,-1.923506


In [38]:
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0.175539,-0.681855,0.509332
a,2,-0.593076,-0.773491,-0.571042
b,1,0.364081,0.069295,0.467998
b,2,0.861911,-1.923506,-0.776658


In [38]:
### For more detail, see the pandas user guide on MultiIndex / advanced indexing: https://pandas.pydata.org/docs/user_guide/advanced.html

In [39]:
### Two parallel label lists: each first-level name appears twice,
### paired with 'one' and then 'two' on the second level.
first_level_names = ['bar', 'baz', 'foo', 'qux']
arrays = [
    [name for name in first_level_names for _ in range(2)],
    ['one', 'two'] * len(first_level_names),
]

In [46]:
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [50]:
### Note: zip(arrays) without unpacking treats each inner list as a single item,
### so it yields one 1-tuple per list rather than pairing up the labels.
zipped=zip(arrays)

In [52]:
list(zipped)

[(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],),
 (['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],)]

In [53]:
### Unpacking with * passes each inner list to zip as its own argument,
### so the labels are paired up position by position.
tuples = list(zip(*arrays))

In [54]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [55]:
### Build a MultiIndex from the list of tuples and give its two levels names.
cust_index1 = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [56]:
cust_index1

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [57]:
s = pd.Series(np.random.randn(8), index=cust_index1)

In [58]:
s

first  second
bar    one       3.365493
       two       0.302221
baz    one      -0.565882
       two      -0.624225
foo    one      -0.515135
       two       0.191366
qux    one       2.474611
       two       1.107154
dtype: float64

In [59]:
### When you want every pairing of the elements in two iterables, it can be easier to use the MultiIndex.from_product function


In [60]:
### One list of labels per index level, to be combined pairwise into an index.
first_labels = ['bar', 'baz', 'foo', 'qux']
second_labels = ['one', 'two']
iterables = [first_labels, second_labels]

In [61]:
### from_product builds the index from every pairing of the given iterables; names labels the two levels.
from_itr = pd.MultiIndex.from_product(iterables, names=['first', 'second'])

In [62]:
### NOTE: this rebinds `df`, replacing the Ohio/Colorado frame created earlier in the notebook.
df = pd.DataFrame(np.random.randn(8,2), index=from_itr)

In [63]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.78331,0.10245
bar,two,-0.82084,-0.08091
baz,one,-0.146326,-2.101293
baz,two,0.565367,-0.757267
foo,one,0.353631,1.060269
foo,two,1.082199,1.701937
qux,one,-0.474265,0.449742
qux,two,-0.627775,-0.234792


In [64]:
### As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically

In [65]:
### One NumPy array of labels per index level (first level, then second level).
first_level = np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'])
second_level = np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])
from_arrays = [first_level, second_level]

In [66]:
s_fa = pd.Series(np.random.randn(8), index=from_arrays)

In [67]:
s_fa

bar  one   -0.444718
     two   -0.813412
baz  one    0.444227
     two   -2.416354
foo  one    0.616115
     two   -1.111251
qux  one    1.841590
     two    0.403802
dtype: float64

In [68]:
df_fa = pd.DataFrame(np.random.randn(8, 4), index=from_arrays)

In [69]:
df_fa

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.540115,0.328948,-0.373928,-1.920175
bar,two,1.156421,-3.464214,-1.939077,2.17511
baz,one,0.501103,1.086332,1.118446,0.812174
baz,two,1.105823,0.757038,-1.71223,0.14616
foo,one,-0.390011,0.57432,0.741939,0.908262
foo,two,0.82786,-0.099447,0.8727,0.06254
qux,one,-0.38591,0.037563,0.530881,-0.964906
qux,two,1.153016,-0.160146,-0.330434,0.094129


In [70]:
### All of the MultiIndex constructors accept a names argument which stores string names for the levels themselves. 
### If no names are provided, None will be assigned
df_fa.index.names

FrozenList([None, None])

In [71]:
df_fac = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=from_arrays)

In [72]:
df_fac

Unnamed: 0_level_0,bar,bar,baz,baz,foo,foo,qux,qux
Unnamed: 0_level_1,one,two,one,two,one,two,one,two
A,-0.942813,0.77296,0.577357,-0.574818,-0.096448,-0.727551,-0.325681,-0.163238
B,0.993336,-1.56262,0.520256,-0.866479,0.330996,-0.354497,1.459134,-0.787031
C,-0.802976,-1.22291,-2.151636,-0.424064,1.246081,2.235826,0.648702,0.040644


In [73]:
### So going back to our first example

### df = pd.DataFrame(np.random.randn(4,3), index = [['a','a','b','b'], [1,2,1,2]], 
    ###              columns=[['Ohio','Ohio','Colorado'], ['Green','Red','Green']])

### A frame like this can also be built from an explicitly constructed MultiIndex, as we did with cust_index1 above.

In [77]:
### Re-create the frame from the first DataFrame example: nested lists supply
### the two row-index levels and the two column levels.
df_o = pd.DataFrame(
    np.random.randn(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)

In [78]:
df_o

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,-0.048873,-1.645658,0.238011
a,2,-0.267101,-2.267165,-0.240148
b,1,-0.878398,1.639451,0.945396
b,2,0.114189,0.275124,-0.72936


In [79]:
### ...except that we haven't assigned names to the column levels yet, which we can do as follows:

df_o.columns.names = ['state','color']

In [80]:
df_o

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,-0.048873,-1.645658,0.238011
a,2,-0.267101,-2.267165,-0.240148
b,1,-0.878398,1.639451,0.945396
b,2,0.114189,0.275124,-0.72936


In [None]:
### Now let's make it much simpler

In [83]:
### Two parallel label lists, one per index level.
arrays_simple = [
    ['a', 'a', 'b', 'b'],  # outer level
    [1, 2, 1, 2],          # inner level
]

In [84]:
arrays_simple

[['a', 'a', 'b', 'b'], [1, 2, 1, 2]]

In [85]:
### Transpose the two label lists into one (outer, inner) tuple per position.
tuples_simple = list(zip(*arrays_simple))

In [86]:
tuples_simple

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]