# Hierarchical Indexing

- Creation of MultiIndex Series
 - by index list
 - by dictionary
 - by MultiIndex index

- Creation of MultiIndex DataFrame
 - index/columns
 
- Selection in Series
 - indexing
 - slicing 
 - masking
 - fancy indexing
 
- Selection in DataFrame
 - indexing: IndexSlice
 - slicing 
 - masking
 - fancy indexing

- Rearranging
 - sorted and unsorted index: ``.sort_index()``
 - stack and unstack: ``.stack()`` and ``.unstack()``
 - set and reset index: ``.reset_index()`` and ``.set_index()``
 
- Aggregation

In [73]:
import pandas as pd
import numpy as np
np.__version__, pd.__version__

('1.15.1', '0.23.4')

# MultiIndex Series

### wrong way to create a multiply indexed series

In [25]:
# this is the example that pandas doesn't generate a multi-index series.
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

## Creating a multiply indexed series -- Method 1: using index list 

In [167]:
df = pd.Series([33871618, 37253956, 18976457, 19378102, 20851820, 25145561],
                index=[['California', 'California', 'Texas', 'Texas', 'New York', 'New York'], [2000, 2010, 2000, 2010, 2000, 2010]])
df

California  2000    33871618
            2010    37253956
Texas       2000    18976457
            2010    19378102
New York    2000    20851820
            2010    25145561
dtype: int64

## Creating a multiply indexed series - Method 2: create from dictionary

In [41]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

## Creating a multiply indexed series - Method 3: using ``MultiIndex``

### Create a multiple index

In [170]:
# method 1
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [172]:
# method 2
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [173]:
# method 3
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [174]:
# method 4
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

### Set up a multiIndex series

In [186]:
index = [('California', 2000), ('California', 2001), ('Texas', 2000), ('Texas', 2001), ('New York', 2000), ('New York', 2001)]
index = pd.MultiIndex.from_tuples(index)
pop = pd.Series([33871618, 37253956, 18976457, 19378102, 20851820, 25145561], index = index)
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871618
            2001    37253956
Texas       2000    18976457
            2001    19378102
New York    2000    20851820
            2001    25145561
dtype: int64

## using-stack()-and-unstack()-in-series

In [179]:
pop_df = pop.unstack() # unstack is move row index to col index
pop_df

year,2000,2001
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871618,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [46]:
pop_df.stack() # unstack is move col index to row index

California  2000    33871618
            2001    37253956
New York    2000    18976457
            2001    19378102
Texas       2000    20851820
            2001    25145561
dtype: int64

In [47]:
pop_df.T.stack() #compare this to the above

2000  California    33871618
      New York      18976457
      Texas         20851820
2001  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64

## MulitiIndex in DataFrame - MultiIndex on both rows and cols

In [164]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,36.4,30.0,36.3,56.0,38.6
2013,2,34.0,36.4,42.0,35.9,50.0,37.7
2014,1,41.0,38.7,32.0,34.8,28.0,36.3
2014,2,21.0,38.9,35.0,36.3,39.0,37.4


## using-stack()-and-unstack()-in-dataframe

In [266]:
# using unstack() to move row index to column index. it becomes a series, then use reset_index
temp = health_data.unstack().unstack()
print(type(temp))
temp = temp.reset_index(name = 'values')
temp[:5]

<class 'pandas.core.series.Series'>


Unnamed: 0,subject,type,visit,year,values
0,Bob,HR,1,2013,40.0
1,Bob,HR,1,2014,41.0
2,Bob,HR,2,2013,34.0
3,Bob,HR,2,2014,21.0
4,Bob,Temp,1,2013,36.4


In [267]:
# using stack() to move column index to row index, it becomes a series, then use reset_index
temp = health_data.stack().stack()
print(type(temp))
temp = temp.reset_index(name = 'values')
temp[:5]

<class 'pandas.core.series.Series'>


Unnamed: 0,year,visit,type,subject,values
0,2013,1,HR,Bob,40.0
1,2013,1,HR,Guido,30.0
2,2013,1,HR,Sue,56.0
3,2013,1,Temp,Bob,36.4
4,2013,1,Temp,Guido,36.3


# Selection 

## Selection in MultiIndex Series

In [187]:
pop

state       year
California  2000    33871618
            2001    37253956
Texas       2000    18976457
            2001    19378102
New York    2000    20851820
            2001    25145561
dtype: int64

In [194]:
#indexing with multiple terms:
pop['California', 2000]

33871618

In [205]:
#partial indexing, it returns a series
pop['California']

year
2000    33871618
2001    37253956
dtype: int64

In [207]:
# partial indexing, it returns a series
pop[:, 2000]

state
California    33871618
Texas         18976457
New York      20851820
dtype: int64

In [215]:
#slicing using iloc
pop.iloc[0:5]
# note that the slicing using pop.loc only works for the sorted index

state       year
California  2000    33871618
            2001    37253956
Texas       2000    18976457
            2001    19378102
New York    2000    20851820
dtype: int64

In [216]:
#masking
pop[pop > 22000000]

state       year
California  2000    33871618
            2001    37253956
New York    2001    25145561
dtype: int64

In [217]:
#fancy indexing
pop[['California', 'Texas']]

state       year
California  2000    33871618
            2001    37253956
Texas       2000    18976457
            2001    19378102
dtype: int64

## Selection in MultiIndex DataFrames

In [218]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,36.4,30.0,36.3,56.0,38.6
2013,2,34.0,36.4,42.0,35.9,50.0,37.7
2014,1,41.0,38.7,32.0,34.8,28.0,36.3
2014,2,21.0,38.9,35.0,36.3,39.0,37.4


In [223]:
# recall columns are primary in dataframe
# indexing
health_data['Guido', 'HR']

year  visit
2013  1        30.0
      2        42.0
2014  1        32.0
      2        35.0
Name: (Guido, HR), dtype: float64

In [224]:
#slicing
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,40.0,36.4
2013,2,34.0,36.4


In [225]:
#fancy indexing
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        40.0
      2        34.0
2014  1        41.0
      2        21.0
Name: (Bob, HR), dtype: float64

Working with slices within these index tuples is not especially convenient; trying to create a slice within a tuple will lead to a syntax error:

In [248]:
#select the data with visit = 1 and type = HR by its location
health_data.loc[::2, ::2]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,40.0,30.0,56.0
2014,1,41.0,32.0,28.0


In [239]:
#select the data by building the desired slice explicitly using ``IndexSlice`` object
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,40.0,30.0,56.0
2014,1,41.0,32.0,28.0


# Rearranging Multi-Indices

## Sorted and unsorted indices
slicing only works in the sorted index

In [249]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.097578
      2      0.204560
c     1      0.015209
      2      0.924188
b     1      0.512304
      2      0.879448
dtype: float64

In [258]:
# slicing returns an error when using unsorted index
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

In [259]:
data = data.sort_index()
data

char  int
a     1      0.097578
      2      0.204560
b     1      0.512304
      2      0.879448
c     1      0.015209
      2      0.924188
dtype: float64

In [260]:
# slicing works as expected when using sorted index
data['a':'b']

char  int
a     1      0.097578
      2      0.204560
b     1      0.512304
      2      0.879448
dtype: float64

## Stacking and unstacking indices
also refers to [using-stack()-and-unstack()-in-series](#using-stack()-and-unstack()-in-series) and [using-stack()-and-unstack()-in-dataframe](#using-stack()-and-unstack()-in-dataframe).

In [263]:
pop

state       year
California  2000    33871618
            2001    37253956
Texas       2000    18976457
            2001    19378102
New York    2000    20851820
            2001    25145561
dtype: int64

In [261]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871618,20851820,18976457
2001,37253956,25145561,19378102


In [262]:
pop.unstack(level=1)

year,2000,2001
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871618,37253956
New York,20851820,25145561
Texas,18976457,19378102


### Index setting and resetting

In [279]:
pop_flat = pop.reset_index(name='population')# this flatten method only apply to series
print(type(pop_flat), pop_flat.shape)
pop_flat

<class 'pandas.core.frame.DataFrame'> (6, 3)


Unnamed: 0,state,year,population
0,California,2000,33871618
1,California,2001,37253956
2,Texas,2000,18976457
3,Texas,2001,19378102
4,New York,2000,20851820
5,New York,2001,25145561


In [278]:
pop_nest = pop_flat.set_index(['state', 'year'])
print(type(pop_nest), pop_nest.shape)
pop_nest

<class 'pandas.core.frame.DataFrame'> (6, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871618
California,2001,37253956
Texas,2000,18976457
Texas,2001,19378102
New York,2000,20851820
New York,2001,25145561


## Data Aggregations on Multi-Indices

In [280]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,36.4,30.0,36.3,56.0,38.6
2013,2,34.0,36.4,42.0,35.9,50.0,37.7
2014,1,41.0,38.7,32.0,34.8,28.0,36.3
2014,2,21.0,38.9,35.0,36.3,39.0,37.4


In [285]:
data_mean = health_data.mean(level='year') # we can use groupby when we deal with a regular dataframe.
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,37.0,36.4,36.0,36.1,53.0,38.15
2014,31.0,38.8,33.5,35.55,33.5,36.85


In [286]:
data_mean.mean(axis=1, level='type')

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,42.0,36.883333
2014,32.666667,37.066667


# Background
Q: why we use ``multi-index`` data rather than ``panel`` or ``panel4d``? 

A: When there is many dimensions of features. ``multi-index`` allows us to easily and quickly manipulate and explore the data.

Pandas does provide ``Panel`` and ``Panel4D`` objects that natively handle three-dimensional and four-dimensional data, a far more common pattern in practice is to make use of *hierarchical indexing* (also known as *multi-indexing*) to incorporate multiple index *levels* within a single index.
In this way, higher-dimensional data can be compactly represented within the familiar one-dimensional ``Series`` and two-dimensional ``DataFrame`` objects.

In [287]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]})
pop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
California,2000,33871618,9267089
California,2001,37253956,9284094
Texas,2000,18976457,4687374
Texas,2001,19378102,4318033
New York,2000,20851820,5906301
New York,2001,25145561,6879014


In [288]:
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18)

state       year
California  2000    0.273595
            2001    0.249211
Texas       2000    0.247010
            2001    0.222831
New York    2000    0.283251
            2001    0.273568
dtype: float64
