## Data Wrangling: Join, Combine

In [1]:
# Third-party imports, kept together at the top of the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide settings: reproducible random draws and compact display.
np.random.seed(12345)  # seed NumPy's global random generator
np.set_printoptions(precision=4, suppress=True)  # 4 decimals, no scientific notation
pd.options.display.max_rows = 20  # cap the number of rows pandas displays
plt.rc('figure', figsize=(10, 6))  # default figure size

### 1. Hierarchical Indexing (계층적 색인)

In [18]:
# Passing a list of label arrays as the index gives the Series a
# multi-level (hierarchical) index: one outer and one inner label per value.
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_labels, inner_labels])
data

a  1    0.186980
   2   -0.391725
   3   -0.272293
b  1   -0.017141
   3    0.680321
c  1    0.635512
   2   -0.757177
d  2    0.718086
   3   -0.304273
dtype: float64

In [19]:
# Inspect the index object: the two label arrays were combined into a MultiIndex.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [21]:
# Partial indexing: data can be selected using outer-level labels alone.
# A single outer label returns the values under it, indexed by the inner level.
print(data['a'])
# A label slice over the outer level keeps both index levels.
print(data['b':'c'])
# A list of outer labels selects several groups at once.
print(data.loc[['b', 'd']])

1    0.186980
2   -0.391725
3   -0.272293
dtype: float64
b  1   -0.017141
   3    0.680321
c  1    0.635512
   2   -0.757177
dtype: float64
b  1   -0.017141
   3    0.680321
d  2    0.718086
   3   -0.304273
dtype: float64


In [22]:
# Values can also be selected by an inner-level label:
# here, every outer key that has an inner label of 2.
data.loc[:, 2]

a   -0.391725
c   -0.757177
d    0.718086
dtype: float64

In [23]:
# Pivot the inner index level into columns; (outer, inner) pairs that are
# absent from the Series appear as missing values in the resulting table.
data.unstack()

Unnamed: 0,1,2,3
a,0.18698,-0.391725,-0.272293
b,-0.017141,,0.680321
c,0.635512,-0.757177,
d,,0.718086,-0.304273


In [24]:
# stack() is the inverse of unstack(): the columns are folded back into an
# inner index level, giving back the original hierarchically indexed Series.
# NOTE(review): the round trip matches the original only if stack() drops the
# missing cells introduced by unstack() (the legacy default); pandas'
# `future_stack=True` implementation keeps them - confirm against the
# installed pandas version.
data.unstack().stack()

a  1    0.186980
   2   -0.391725
   3   -0.272293
b  1   -0.017141
   3    0.680321
c  1    0.635512
   2   -0.757177
d  2    0.718086
   3   -0.304273
dtype: float64

In [27]:
# Hierarchical indexing can be applied to both axes of a DataFrame:
# each axis receives a list of label arrays, one array per level.
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_levels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=row_levels,
                     columns=column_levels)
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11


In [28]:
# Name the levels on each axis; the names are shown when the frame is displayed.
# Note that this modifies `frame` in place.
frame.index.names = ['key1', 'key2'] # the index levels can be given names too
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [32]:
# Partial column indexing: selecting an outer column label returns the
# sub-frame of the inner-level columns grouped under it.
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10
