In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Series 인덱스

## 단일 계층 인덱스

In [3]:
# Seed NumPy's global generator once so that every random example in this
# notebook is reproducible under "Restart Kernel -> Run All" (the seed value
# itself is arbitrary).
np.random.seed(42)

# No `index` argument is passed, so pandas creates a RangeIndex automatically.
s = pd.Series(np.random.rand(5))
s

Unnamed: 0,0
0,0.991415
1,0.173426
2,0.712843
3,0.243102
4,0.41223


In [4]:
s.index             # the index, i.e. the row labels

RangeIndex(start=0, stop=5, step=1)

In [5]:
s.values            # the Series' values --> np.ndarray

array([0.99141469, 0.17342585, 0.71284292, 0.24310247, 0.41222981])

In [6]:
# Confirm the concrete type of the object returned by `.values`.
print(type(s.values))

<class 'numpy.ndarray'>


In [7]:
# Same kind of Series as before, but with explicit string row labels.
s = pd.Series(
    np.random.rand(5),
    index=list('abcde'),
)
s  #> the labels given to the constructor become an Index object

Unnamed: 0,0
a,0.933819
b,0.251089
c,0.724137
d,0.730809
e,0.370731


In [8]:
# Inspect the index built from the labels passed to the constructor.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
s.index.nlevels         # Index.nlevels property: the number of levels in the index

1

## 계층적 인덱스(hierarchical index), Multi-level index

In [10]:
# Two parallel label arrays (outer level, inner level) form a two-level
# MultiIndex for six standard-normal samples.
s = pd.Series(
    np.random.randn(6),
    index=pd.MultiIndex.from_arrays([
        ['m', 'm', 'f', 'f', 'u', 'u'],
        [1, 2, 3, 1, 2, 3],
    ]),
)
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.660759
m,2,-0.121783
f,3,-1.90919
f,1,0.716416
u,2,-0.859627
u,3,0.207506


In [11]:
# The values are still a flat 1-D array; only the index is hierarchical.
s.values

array([-0.66075931, -0.12178323, -1.90919037,  0.71641634, -0.85962703,
        0.20750625])

In [12]:
s.index         #> MultiIndex: an array whose elements are tuples

MultiIndex([('m', 1),
            ('m', 2),
            ('f', 3),
            ('f', 1),
            ('u', 2),
            ('u', 3)],
           )

In [13]:
# Two label arrays were supplied, so the index has two levels.
s.index.nlevels

2

## Indexing, Slicing

nlevels(인덱스의 계층 개수)가 2 이상인 MultiIndex를 사용해서 loc 속성을 이용할 때,
*   첫번째 레벨의 인덱스만 가지고 indexing, slicing을 할 수 있음. (단, slicing은 인덱스가 정렬되어 있어야 함.)
*   두번째 이상의 인덱스만 가지고는 indexing, slicing을 할 수 없음.
*   튜플 형태의 인덱스로는 indexing, slicing 이 가능.

In [14]:
# indexing
s.loc['m']      # selects the rows whose first-level label is 'm'; the result is indexed by the second level

Unnamed: 0,0
1,-0.660759
2,-0.121783


In [15]:
# slicing
# Label slicing on a MultiIndex requires a sorted index. `s` is still in
# insertion order (m, f, u), so this raises UnsortedIndexError. The error is
# caught and printed so the failure is shown without stopping "Run All".
try:
    s.loc['m':'f']
except pd.errors.UnsortedIndexError as err:
    print(f'{type(err).__name__}: {err}')

In [16]:
# s.loc[1]            #> raises KeyError
s.loc[('m', 1)]       # a tuple-type key can be used

np.float64(-0.6607593097363548)

In [17]:
# A list of tuple keys selects several rows; the result keeps both index levels.
s.loc[[('m',1), ('f',1)]]

Unnamed: 0,Unnamed: 1,0
m,1,-0.660759
f,1,0.716416


In [18]:
# Slicing between two tuple keys also needs a sorted MultiIndex, so on the
# unsorted `s` it raises UnsortedIndexError. The error is caught and printed
# so the failure is demonstrated without aborting the notebook.
try:
    s.loc[('m', 1):('f', 3)]
except pd.errors.UnsortedIndexError as err:
    print(f'{type(err).__name__}: {err}')

*   `pd.Series.sort_values()`: 값들을 정렬
*   `pd.Series.sort_index()`: 인덱스들을 정렬

In [19]:
# Show the Series again before sorting: its index is in insertion order.
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.660759
m,2,-0.121783
f,3,-1.90919
f,1,0.716416
u,2,-0.859627
u,3,0.207506


In [20]:
# sort_index() returns a new Series ordered by its index labels; `s` itself is not reassigned.
s_idx_sort = s.sort_index()
s_idx_sort

Unnamed: 0,Unnamed: 1,0
f,1,0.716416
f,3,-1.90919
m,1,-0.660759
m,2,-0.121783
u,2,-0.859627
u,3,0.207506


In [21]:
# Once the index has been sorted, label slicing works. `.loc` is used so the
# slice is explicitly label-based, consistent with the other cells here.
s_idx_sort.loc['f':'m']


Unnamed: 0,Unnamed: 1,0
f,1,0.716416
f,3,-1.90919
m,1,-0.660759
m,2,-0.121783


In [22]:
# Tuple keys can serve as slice endpoints on the sorted index. `.loc` makes
# the label-based intent explicit.
s_idx_sort.loc[('f', 2):('m', 2)]

Unnamed: 0,Unnamed: 1,0
f,3,-1.90919
m,1,-0.660759
m,2,-0.121783


## index swapping
인덱스의 레벨을 바꾸는 것.

In [23]:
# Show the original Series again as the starting point for swapping levels.
s

Unnamed: 0,Unnamed: 1,0
m,1,-0.660759
m,2,-0.121783
f,3,-1.90919
f,1,0.716416
u,2,-0.859627
u,3,0.207506


In [24]:
# Explicit alternative: s.swaplevel(i= 0, j =1)
# Swap the two index levels, then sort the swapped index.
s_swap = s.swaplevel().sort_index()
s_swap

Unnamed: 0,Unnamed: 1,0
1,f,0.716416
1,m,-0.660759
2,m,-0.121783
2,u,-0.859627
3,f,-1.90919
3,u,0.207506


In [25]:
# After the swap the numeric labels form the first level, so they can be used for indexing.
s_swap.loc[1]

Unnamed: 0,0
f,0.716416
m,-0.660759


In [26]:
# Label slice on the first level of the sorted, swapped index.
s_swap.loc[1:2]

Unnamed: 0,Unnamed: 1,0
1,f,0.716416
1,m,-0.660759
2,m,-0.121783
2,u,-0.859627


In [27]:
# A Series whose index has three levels (nlevels = 3).
s = pd.Series(
    np.random.rand(6),
    index=[
        np.arange(1, 7),
        list('AAABBB'),
        ['aa', 'bb'] * 3,
    ],
)

s


Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,A,aa,0.570559
2,A,bb,0.41779
3,A,aa,0.918072
4,B,bb,0.869856
5,B,aa,0.112299
6,B,bb,0.057513


In [28]:
# Each element of the MultiIndex is now a 3-tuple, one entry per level.
s.index

MultiIndex([(1, 'A', 'aa'),
            (2, 'A', 'bb'),
            (3, 'A', 'aa'),
            (4, 'B', 'bb'),
            (5, 'B', 'aa'),
            (6, 'B', 'bb')],
           )

In [29]:
# Three label arrays were supplied, so the index has three levels.
s.index.nlevels

3

In [30]:
s.swaplevel()       # defaults are i = -2, j = -1: swaps the last index level with the second-to-last one

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
1,aa,A,0.570559
2,bb,A,0.41779
3,aa,A,0.918072
4,bb,B,0.869856
5,aa,B,0.112299
6,bb,B,0.057513


In [31]:
# Swap the first two levels explicitly; the third level stays where it is.
s.swaplevel(i=0,j=1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,0
A,1,aa,0.570559
A,2,bb,0.41779
A,3,aa,0.918072
B,4,bb,0.869856
B,5,aa,0.112299
B,6,bb,0.057513


# DataFrame 계층적 인덱스

In [32]:
# 6x3 frame of random values with a two-level row index: (day, meal).
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=[
        [day for day in ('Fri', 'Sat', 'Sun') for _ in range(2)],
        ['Lunch', 'Dinner', 'Lunch', 'Dinner', 'Lunch', 'Dinner'],
    ],
    columns=list('abc'),
)
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.295678,0.650253,0.97966
Fri,Dinner,0.692959,0.215769,0.972213
Sat,Lunch,0.862411,0.203052,0.988185
Sat,Dinner,0.342691,0.850786,0.142071
Sun,Lunch,0.177767,0.836126,0.940114
Sun,Dinner,0.562825,0.628545,0.512684


In [33]:
df.values           #> 2-D ndarray made up of the DataFrame's values

array([[0.29567845, 0.65025336, 0.97966017],
       [0.69295902, 0.21576921, 0.97221286],
       [0.86241098, 0.20305218, 0.98818489],
       [0.34269139, 0.85078628, 0.14207119],
       [0.17776694, 0.83612558, 0.94011423],
       [0.56282487, 0.62854509, 0.51268416]])

In [34]:
df.index            #> MultiIndex

MultiIndex([('Fri',  'Lunch'),
            ('Fri', 'Dinner'),
            ('Sat',  'Lunch'),
            ('Sat', 'Dinner'),
            ('Sun',  'Lunch'),
            ('Sun', 'Dinner')],
           )

In [35]:
# The row index was built from two label arrays, so it has two levels.
df.index.nlevels

2

In [36]:
# Indexing with a first-level label returns the matching rows, indexed by the second level.
df.loc['Fri']

Unnamed: 0,a,b,c
Lunch,0.295678,0.650253,0.97966
Dinner,0.692959,0.215769,0.972213


In [37]:
df.loc['Fri':'Sat']         #> slicing on first-level labels

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.295678,0.650253,0.97966
Fri,Dinner,0.692959,0.215769,0.972213
Sat,Lunch,0.862411,0.203052,0.988185
Sat,Dinner,0.342691,0.850786,0.142071


In [38]:
# 'Lunch' exists only in the second index level, so it cannot be used on its
# own with .loc. The resulting KeyError is caught and printed so the failure
# is demonstrated without aborting the notebook.
try:
    df.loc['Lunch']
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

In [39]:
# Show the frame again as the starting point for swapping index levels.
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.295678,0.650253,0.97966
Fri,Dinner,0.692959,0.215769,0.972213
Sat,Lunch,0.862411,0.203052,0.988185
Sat,Dinner,0.342691,0.850786,0.142071
Sun,Lunch,0.177767,0.836126,0.940114
Sun,Dinner,0.562825,0.628545,0.512684


In [40]:
# Swap the two row-index levels so the meal label becomes the first level.
df_swap = df.swaplevel()
df_swap

Unnamed: 0,Unnamed: 1,a,b,c
Lunch,Fri,0.295678,0.650253,0.97966
Dinner,Fri,0.692959,0.215769,0.972213
Lunch,Sat,0.862411,0.203052,0.988185
Dinner,Sat,0.342691,0.850786,0.142071
Lunch,Sun,0.177767,0.836126,0.940114
Dinner,Sun,0.562825,0.628545,0.512684


In [41]:
# 'Dinner' is now a first-level label, so it can be used for indexing.
df_swap.loc['Dinner']

Unnamed: 0,a,b,c
Fri,0.692959,0.215769,0.972213
Sat,0.342691,0.850786,0.142071
Sun,0.562825,0.628545,0.512684


# DataFrame 컬럼<--> Row 레이블

*   `pd.DataFrame.set_index()`: 데이터 프레임의 컬럼(들)을 인덱스(row 레이블)로 변환한 데이터 프레임을 리턴.
*   `pd.DataFrame.reset_index()`: 데이터프레임의 인덱스(들)을 컬럼으로 변환한 데이터프레임을 리턴

## `reset_index`

In [42]:
# Show the frame with its two-level row index before resetting it.
df

Unnamed: 0,Unnamed: 1,a,b,c
Fri,Lunch,0.295678,0.650253,0.97966
Fri,Dinner,0.692959,0.215769,0.972213
Sat,Lunch,0.862411,0.203052,0.988185
Sat,Dinner,0.342691,0.850786,0.142071
Sun,Lunch,0.177767,0.836126,0.940114
Sun,Dinner,0.562825,0.628545,0.512684


In [43]:
df.reset_index()            #> default level = None: every index level is converted to a column

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.295678,0.650253,0.97966
1,Fri,Dinner,0.692959,0.215769,0.972213
2,Sat,Lunch,0.862411,0.203052,0.988185
3,Sat,Dinner,0.342691,0.850786,0.142071
4,Sun,Lunch,0.177767,0.836126,0.940114
5,Sun,Dinner,0.562825,0.628545,0.512684


In [44]:
df.reset_index(names=['day','time'])            #> sets the column names used when the index levels are converted to columns

Unnamed: 0,day,time,a,b,c
0,Fri,Lunch,0.295678,0.650253,0.97966
1,Fri,Dinner,0.692959,0.215769,0.972213
2,Sat,Lunch,0.862411,0.203052,0.988185
3,Sat,Dinner,0.342691,0.850786,0.142071
4,Sun,Lunch,0.177767,0.836126,0.940114
5,Sun,Dinner,0.562825,0.628545,0.512684


In [45]:
df.reset_index(level = 0)           # converts only index level 0 to a column

Unnamed: 0,level_0,a,b,c
Lunch,Fri,0.295678,0.650253,0.97966
Dinner,Fri,0.692959,0.215769,0.972213
Lunch,Sat,0.862411,0.203052,0.988185
Dinner,Sat,0.342691,0.850786,0.142071
Lunch,Sun,0.177767,0.836126,0.940114
Dinner,Sun,0.562825,0.628545,0.512684


In [46]:
df.reset_index(level = 1)           # converts only index level 1 to a column

Unnamed: 0,level_1,a,b,c
Fri,Lunch,0.295678,0.650253,0.97966
Fri,Dinner,0.692959,0.215769,0.972213
Sat,Lunch,0.862411,0.203052,0.988185
Sat,Dinner,0.342691,0.850786,0.142071
Sun,Lunch,0.177767,0.836126,0.940114
Sun,Dinner,0.562825,0.628545,0.512684


In [47]:
# A list of levels converts several index levels at once; here both levels are converted.
df.reset_index(level = [1,0])

Unnamed: 0,level_0,level_1,a,b,c
0,Fri,Lunch,0.295678,0.650253,0.97966
1,Fri,Dinner,0.692959,0.215769,0.972213
2,Sat,Lunch,0.862411,0.203052,0.988185
3,Sat,Dinner,0.342691,0.850786,0.142071
4,Sun,Lunch,0.177767,0.836126,0.940114
5,Sun,Dinner,0.562825,0.628545,0.512684


## `set_index`

In [48]:

# Toy exam table: class 1 and class 2 with five students each, ids 1..10,
# and random integer scores (0 up to, but not including, 101) per subject.
exam = pd.DataFrame({
    'class': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'id': np.arange(1, 11),
    'math': np.random.randint(0, 101, size=10),
    'science': np.random.randint(0, 101, size=10),
})
exam



Unnamed: 0,class,id,math,science
0,1,1,64,48
1,1,2,70,91
2,1,3,15,52
3,1,4,6,25
4,1,5,57,54
5,2,6,17,45
6,2,7,12,50
7,2,8,93,42
8,2,9,59,84
9,2,10,78,86


In [49]:
# Rows where class equals 1, selected with a boolean mask.
exam.loc[exam['class'].eq(1)]

Unnamed: 0,class,id,math,science
0,1,1,64,48
1,1,2,70,91
2,1,3,15,52
3,1,4,6,25
4,1,5,57,54


In [50]:
# Promote the 'class' column to the row index; a new frame is returned.
exam_class = exam.set_index('class')
exam_class

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,64,48
1,2,70,91
1,3,15,52
1,4,6,25
1,5,57,54
2,6,17,45
2,7,12,50
2,8,93,42
2,9,59,84
2,10,78,86


In [51]:
# With 'class' as the index, the rows for class 1 can be selected by label.
exam_class.loc[1]

Unnamed: 0_level_0,id,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,64,48
1,2,70,91
1,3,15,52
1,4,6,25
1,5,57,54


In [52]:
# Passing a list of columns builds a two-level row index: (class, id).
exam_class_id = exam.set_index(['class', 'id'])
exam_class_id

Unnamed: 0_level_0,Unnamed: 1_level_0,math,science
class,id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,64,48
1,2,70,91
1,3,15,52
1,4,6,25
1,5,57,54
2,6,17,45
2,7,12,50
2,8,93,42
2,9,59,84
2,10,78,86


In [53]:
exam_class_id.reset_index(level='class')
    #> when the index levels have names, the `level` parameter of reset_index can be given a string (or a list of strings)

Unnamed: 0_level_0,class,math,science
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,64,48
2,1,70,91
3,1,15,52
4,1,6,25
5,1,57,54
6,2,17,45
7,2,12,50
8,2,93,42
9,2,59,84
10,2,78,86


`reset_index`예

In [54]:
# Show the original exam table again before grouping.
exam

Unnamed: 0,class,id,math,science
0,1,1,64,48
1,1,2,70,91
2,1,3,15,52
3,1,4,6,25
4,1,5,57,54
5,2,6,17,45
6,2,7,12,50
7,2,8,93,42
8,2,9,59,84
9,2,10,78,86


In [55]:

# Per-class mean of each subject in the exam frame.
exam_by_class = (
    exam
    .groupby(['class'])[['math', 'science']]
    .mean()
)
exam_by_class

Unnamed: 0_level_0,math,science
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42.4,54.0
2,51.8,61.4


In [56]:
# Move the 'class' group key from the index back into an ordinary column.
exam_by_class.reset_index()

Unnamed: 0,class,math,science
0,1,42.4,54.0
1,2,51.8,61.4


성별 요일별 시간별 팁의 평균