## 4. 계층적 인덱스
단일 인덱스 내에 여러 인덱스 레벨을 포함하는 계층적 인덱싱(hierarchical indexing)   
고차원 데이터를 1~2차원으로 표현 가능
  - MultiIndex
  - stack, unstack
  - 생성방법
  - 인덱싱 & 슬라이싱

In [1]:
import pandas as pd
import numpy as np

### 4.1 다중 인덱스 Series

In [2]:
# Population figures, listed in the same order as the (state, year) labels below.
population = [300, 370, 190, 195, 200, 250]

# Plain Python tuples as labels: every state paired with every year.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]

# Labelling with tuples gives a flat index of tuples, not yet a MultiIndex.
data = pd.Series(data=population, index=index)
data

(California, 2000)    300
(California, 2010)    370
(New York, 2000)      190
(New York, 2010)      195
(Texas, 2000)         200
(Texas, 2010)         250
dtype: int64

In [3]:
# Convert the list of (state, year) tuples into a two-level MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [4]:
# Reindex the tuple-labelled Series with the MultiIndex so its labels become hierarchical.
data = data.reindex(index)
data

California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

In [5]:
data.index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

### 인덱스를 통한 값 선택

In [6]:
# Partial indexing: select every entry under the outer label 'Texas'.
data['Texas']

2000    200
2010    250
dtype: int64

In [7]:
# Full slice on the first level, 2010 on the second: the 2010 value of every outer label.
data[:,2010]

California    370
New York      195
Texas         250
dtype: int64

### stack과 unstack 

In [8]:
data

California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

In [9]:
# Pivot the inner index level (the years) into columns, turning the Series into a DataFrame.
df = data.unstack()
df

Unnamed: 0,2000,2010
California,300,370
New York,190,195
Texas,200,250


In [10]:
# stack() undoes unstack(): the columns go back into the index as the inner level.
data.unstack().stack()

California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

In [11]:
# Pivot level 0 (the states) into columns; the years stay as the row index.
data.unstack(level=0)

Unnamed: 0,California,New York,Texas
2000,300,190,200
2010,370,195,250


In [12]:
# Pivot level 1 (the years) into columns; this gives the same table as the default unstack().
data.unstack(level=1)

Unnamed: 0,2000,2010
California,300,370
New York,190,195
Texas,200,250


### 4.2 MultiIndex 생성 방법 

#### 1) 인덱스에 리스트 전달

In [13]:
# Passing a list of two label lists as `index` builds a two-level MultiIndex:
# the first list holds the outer labels, the second the inner labels.
df = pd.DataFrame(
    data=np.random.rand(4, 2),  # 4 rows x 2 columns of random values
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.040278,0.254049
a,2,0.600118,0.145353
b,1,0.480141,0.870656
b,2,0.796594,0.165889


#### 2) 튜플을 키로 갖는 딕셔너리 전달 

In [14]:
# A dict keyed by (state, year) tuples; building a Series from it yields
# a two-level index, as the output below shows.
data = {
    ('California', 2000): 330,
    ('California', 2010): 338,
    ('New York', 2000): 208,
    ('New York', 2010): 251,
    ('Texas', 2000): 254,
    ('Texas', 2010): 251,
}
pd.Series(data)

California  2000    330
            2010    338
New York    2000    208
            2010    251
Texas       2000    254
            2010    251
dtype: int64

#### 3) 명시적 생성자

In [15]:
# Build from parallel arrays: one array of labels per level.
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [16]:
# Build from a list of tuples: one tuple per index entry.
pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [17]:
# Build from the Cartesian product of the per-level label lists.
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

### 4.3 MultiIndex 레벨 이름 설정

In [18]:
# Build the Series straight from a dict keyed by (state, year) tuples.
data = pd.Series({
    ('California', 2000): 300,
    ('California', 2010): 370,
    ('New York', 2000): 190,
    ('New York', 2010): 195,
    ('Texas', 2000): 200,
    ('Texas', 2010): 250,
})
# Name the two index levels; the names appear as a header row in the output.
data.index.names = ['state', 'year']
data

state       year
California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

### 4.4 MultiIndex (Column)

In [19]:
# Row labels: every (year, visit) combination.
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
# Column labels: every (subject, type) combination.
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock readings: standard-normal noise rounded to one decimal, with every
# other column (the 'HR' ones) scaled by 10, then everything shifted by 37.
data = np.random.randn(4, 6).round(1)
data[:, ::2] *= 10
data += 37

df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,36.6,19.0,39.0,43.0,40.2
2013,2,38.0,36.0,20.0,37.7,39.0,38.4
2014,1,32.0,36.4,69.0,37.8,46.0,36.6
2014,2,35.0,38.7,28.0,38.5,42.0,38.2


In [20]:
# Select everything under the top-level column label 'Bob' (its HR and Temp columns).
df['Bob']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,30.0,36.6
2013,2,38.0,36.0
2014,1,32.0,36.4
2014,2,35.0,38.7


### 4.5 MultiIndex 인덱싱 및 슬라이싱 

In [21]:
# Series keyed by (state, year) tuples, with both index levels named in one chain.
data = pd.Series({
    ('California', 2000): 300,
    ('California', 2010): 370,
    ('New York', 2000): 190,
    ('New York', 2010): 195,
    ('Texas', 2000): 200,
    ('Texas', 2010): 250,
}).rename_axis(['state', 'year'])
data

state       year
California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

#### Series 인덱싱

In [22]:
data['California', 2010]  # access a single element with a full (state, year) key

370

In [23]:
data['California']  # partial indexing on the outer level: all years for one state

year
2000    300
2010    370
dtype: int64

In [24]:
data[:,2010]  # full slice on the outer level: every state for the year 2010

state
California    370
New York      195
Texas         250
dtype: int64

#### Series 슬라이싱 
- 슬라이싱을 하려면 MultiIndex가 정렬되어 있어야 함 (정렬되지 않은 경우 `sort_index()`로 먼저 정렬)

In [25]:
# Same data as before, but entered in a deliberately unsorted state order
# (New York, Texas, California) to set up the slicing example that follows.
data = pd.Series({
    ('New York', 2000): 190,
    ('New York', 2010): 195,
    ('Texas', 2000): 200,
    ('Texas', 2010): 250,
    ('California', 2000): 300,
    ('California', 2010): 370,
}).rename_axis(['state', 'year'])
data

state       year
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
California  2000    300
            2010    370
dtype: int64

In [None]:
# Label slicing fails here because the index is still in insertion order
# (New York, Texas, California), i.e. not sorted.
# NOTE(review): pandas should report an UnsortedIndexError — confirm on your version.
data['New York':'Texas']  # Error

In [27]:
# Sort the MultiIndex so that label slicing works.
data = data.sort_index()
data

state       year
California  2000    300
            2010    370
New York    2000    190
            2010    195
Texas       2000    200
            2010    250
dtype: int64

In [28]:
# With a sorted index the label slice returns everything from 'New York' through 'Texas', both included.
data['New York':'Texas']

state     year
New York  2000    190
          2010    195
Texas     2000    200
          2010    250
dtype: int64

* 그 외 마스킹과 팬시 인덱싱 가능

#### DataFrame 인덱싱

In [29]:
# Two-level row labels (year, visit) and two-level column labels (subject, type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock readings: rounded standard-normal noise, every other column (the 'HR'
# ones) multiplied by 10, then the whole array shifted by 37.
data = np.random.randn(4, 6).round(1)
data[:, ::2] *= 10
data += 37

df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,37.4,42.0,35.2,39.0,35.9
2013,2,62.0,37.9,31.0,37.7,24.0,36.6
2014,1,31.0,37.5,40.0,38.4,29.0,37.8
2014,2,34.0,37.6,40.0,35.8,29.0,38.4


In [30]:
# A (subject, type) pair selects a single column, returned as a Series.
df['Guido', 'HR']

year  visit
2013  1        42.0
      2        31.0
2014  1        40.0
      2        40.0
Name: (Guido, HR), dtype: float64

In [31]:
df.iloc[:2,:2]  # positional indexer: first two rows and first two columns

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,37.0,37.4
2013,2,62.0,37.9


In [32]:
df.loc[:,('Bob','HR')]  # label indexer with a tuple: all rows of the (Bob, HR) column

year  visit
2013  1        37.0
      2        62.0
2014  1        31.0
      2        34.0
Name: (Bob, HR), dtype: float64

In [33]:
df.loc[(:,2),(:,'HR')] # 튜플 내 슬라이싱 시도 Error

SyntaxError: invalid syntax (<ipython-input-33-f7885847a4c2>, line 1)

In [34]:
idx = pd.IndexSlice  # helper that allows slice syntax (`:`) inside multi-level keys
# Rows: every year, visit 2. Columns: every subject, type 'HR'.
df.loc[idx[:,2],idx[:,'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,2,62.0,31.0,24.0
2014,2,34.0,40.0,29.0
