# Hierarchical Indexing

In [1]:
import pandas as pd
import numpy as np 

#### Multiply indexed Series

Represent two-dimensional data within a one-dimensional Series.

In [2]:
# Keys are (country, year) tuples: one entry per country for each census year.
index = [(country, year)
         for country in ('India', 'China', 'USA')
         for year in (2000, 2010)]
# Population figures, listed in the same order as the keys above.
population = [
    100_000_000, 1_300_000_000,
    80_000_000, 15_000_000_000,
    20_847_199, 240_204_932,
]

In [3]:
# Build a Series whose labels are the plain (country, year) tuples.
pop = pd.Series(data=population, index=index)
pop

(India, 2000)      100000000
(India, 2010)     1300000000
(China, 2000)       80000000
(China, 2010)    15000000000
(USA, 2000)         20847199
(USA, 2010)        240204932
dtype: int64

In [4]:
# Turn the list of tuples into a two-level MultiIndex.
index = pd.MultiIndex.from_tuples(tuples=index)
index

MultiIndex([('India', 2000),
            ('India', 2010),
            ('China', 2000),
            ('China', 2010),
            (  'USA', 2000),
            (  'USA', 2010)],
           )

In [5]:
# Re-label the Series with the MultiIndex so each level can be addressed separately.
pop = pop.reindex(index=index)
pop

India  2000      100000000
       2010     1300000000
China  2000       80000000
       2010    15000000000
USA    2000       20847199
       2010      240204932
dtype: int64

In [6]:
# Partial indexing: the empty slice keeps every country, 2010 selects on the year level.
pop[:, 2010]

India     1300000000
China    15000000000
USA        240204932
dtype: int64

In [7]:
# Same partial indexing, this time selecting the year 2000 for every country.
pop[:,2000]

India    100000000
China     80000000
USA       20847199
dtype: int64

In [8]:
# First element by position. Use .iloc: the index labels are (country, year)
# pairs, so a bare integer key would rely on the deprecated positional fallback
# of Series.__getitem__.
pop.iloc[0]

100000000

#### MultiIndex as an extra dimension

In [9]:
# unstack() converts a multiply indexed Series into a conventionally indexed
# DataFrame: the innermost level (the year) becomes the columns.
popDf = pop.unstack(level=-1)
popDf

Unnamed: 0,2000,2010
China,80000000,15000000000
India,100000000,1300000000
USA,20847199,240204932


In [10]:
# stack() does the opposite: it moves the column labels back into the inner index level.
popDf.stack()

China  2000       80000000
       2010    15000000000
India  2000      100000000
       2010     1300000000
USA    2000       20847199
       2010      240204932
dtype: int64

In [12]:
# Add a second column next to the totals; the plain list is matched to the
# rows of `pop` by position.
popDf = pd.DataFrame(
    {
        'total': pop,
        'under18': [
            1298634, 22938294,
            24839, 480382,
            590393, 4839274,
        ],
    }
)
popDf

Unnamed: 0,Unnamed: 1,total,under18
India,2000,100000000,1298634
India,2010,1300000000,22938294
China,2000,80000000,24839
China,2010,15000000000,480382
USA,2000,20847199,590393
USA,2010,240204932,4839274


In [14]:
# Fraction of people under 18 for each (country, year) row,
# displayed with the years spread across the columns.
fU18 = popDf['under18'].div(popDf['total'])
fU18.unstack()

Unnamed: 0,2000,2010
China,0.00031,3.2e-05
India,0.012986,0.017645
USA,0.02832,0.020146


#### Methods to create a MultiIndex

In [29]:
# Method 1: pass a list of index arrays to the constructor; pandas builds the
# MultiIndex from them. The values come from a seeded generator so the table
# is identical on every run.
df = pd.DataFrame(np.random.default_rng(0).random((4, 2)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.888808,0.368093
a,2,0.802529,0.021628
b,1,0.920289,0.356095
b,2,0.411198,0.082063


In [30]:
# Method 2: pass a dictionary whose keys are tuples; the resulting Series
# gets a MultiIndex built from those keys.
data = {
    ('California', 2000): 33_871_648,
    ('California', 2010): 37_253_956,
    ('Texas', 2000): 20_851_820,
    ('Texas', 2010): 25_145_561,
    ('New York', 2000): 18_976_457,
    ('New York', 2010): 19_378_102,
}

pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

#### MultiIndex for columns

In [31]:
# Hierarchical row labels: every (year, visit) combination.
index = pd.MultiIndex.from_product(
    iterables=[[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
# Hierarchical column labels: every (subject, type) combination.
columns = pd.MultiIndex.from_product(
    iterables=[['Bobb', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [32]:
# Display the (year, visit) row index.
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [33]:
# Display the (subject, type) column index.
columns

MultiIndex([( 'Bobb',   'HR'),
            ( 'Bobb', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [34]:
# Mock some data: standard-normal draws rounded to one decimal place.
# A seeded generator is used so the values are the same on every run.
data = np.round(np.random.default_rng(0).standard_normal((4, 6)), 1)
data

array([[ 0.5, -1. ,  0.4, -0.4,  0.8,  0.2],
       [-0.9, -1. ,  0. ,  1. ,  1.4,  0.2],
       [ 0.6,  0.8, -1.2, -0.5, -0.8, -1.5],
       [-0.5,  1.2, -1.8,  0.4, -0. , -0.4]])

In [35]:
# Multiply every other column (positions 0, 2, 4) by 10, then add 37 to all values.
# NOTE: both statements modify `data` in place, so running this cell a second time
# applies the scaling and the shift again.
data[:,::2] *= 10
data +=37

In [36]:
# Show the array after the in-place scaling and shift.
data

array([[42. , 36. , 41. , 36.6, 45. , 37.2],
       [28. , 36. , 37. , 38. , 51. , 37.2],
       [43. , 37.8, 25. , 36.5, 29. , 35.5],
       [32. , 38.2, 19. , 37.4, 37. , 36.6]])

In [37]:
# Assemble the frame: rows are labelled by (year, visit), columns by (subject, type).
healthData = pd.DataFrame(data=data, columns=columns, index=index)
healthData

Unnamed: 0_level_0,subject,Bobb,Bobb,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,42.0,36.0,41.0,36.6,45.0,37.2
2013,2,28.0,36.0,37.0,38.0,51.0,37.2
2014,1,43.0,37.8,25.0,36.5,29.0,35.5
2014,2,32.0,38.2,19.0,37.4,37.0,36.6


In [49]:
# Indexing by a top-level column label returns the sub-frame for that subject.
healthData['Sue']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,37.2
2013,2,51.0,37.2
2014,1,29.0,35.5
2014,2,37.0,36.6


#### Indexing and Slicing

In [45]:
# Re-display the population Series used in the indexing examples that follow.
pop

India  2000      100000000
       2010     1300000000
China  2000       80000000
       2010    15000000000
USA    2000       20847199
       2010      240204932
dtype: int64

In [50]:
# A full (country, year) key selects a single value.
pop['India',2010]

1300000000

In [54]:
# A key for the outer level only returns the Series of years for that country.
pop['India']

2000     100000000
2010    1300000000
dtype: int64

In [55]:
# An empty slice on the outer level plus a year selects that year for every country.
pop[:,2000]

India    100000000
China     80000000
USA       20847199
dtype: int64