# Pandas Advanced
> INDEX
- Merge (병합)
- Grouping (그룹화)
- Reshaping (변형)
- Time Series (시계열)
- Categoricals (범주화)

In [3]:
import numpy as np
import pandas as pd

# 6. Merge (병합)
> Concat (연결)

- pandas는 결합 (join) / 병합 (merge) 형태의 연산을 위해 인덱스에 대한 다양한 집합 논리와 관계 대수 기능을 제공하며, 이를 통해 Series와 데이터프레임 객체를 손쉽게 결합할 수 있습니다. (Panel 객체는 pandas 0.25에서 제거되었습니다.)

In [4]:
# 10 x 4 frame of standard-normal samples; it is split and re-joined in the cells below.
df = pd.DataFrame(data=np.random.standard_normal(size=(10, 4)))
df

Unnamed: 0,0,1,2,3
0,0.507017,0.959263,0.649105,-0.813497
1,-0.751879,0.588357,0.204476,-0.057229
2,-0.376269,-1.391675,-0.203795,0.123562
3,-0.143103,0.680273,-1.187362,-1.662709
4,-1.022351,0.24841,2.797055,0.22966
5,0.449444,-0.244849,-1.414926,0.995995
6,1.036284,-0.990939,0.235488,-0.28333
7,-0.82472,-0.293675,1.326577,-0.599258
8,0.7657,1.311732,-0.696388,2.32328
9,-0.894428,0.40455,0.133275,-1.703928


In [5]:
# Cut the frame into three consecutive row chunks: rows 0-2, 3-6 and 7 onward.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0  0.507017  0.959263  0.649105 -0.813497
 1 -0.751879  0.588357  0.204476 -0.057229
 2 -0.376269 -1.391675 -0.203795  0.123562,
           0         1         2         3
 3 -0.143103  0.680273 -1.187362 -1.662709
 4 -1.022351  0.248410  2.797055  0.229660
 5  0.449444 -0.244849 -1.414926  0.995995
 6  1.036284 -0.990939  0.235488 -0.283330,
           0         1         2         3
 7 -0.824720 -0.293675  1.326577 -0.599258
 8  0.765700  1.311732 -0.696388  2.323280
 9 -0.894428  0.404550  0.133275 -1.703928]

In [6]:
pd.concat(pieces)   # concat stacks the pieces row-wise, like SQL UNION ALL

Unnamed: 0,0,1,2,3
0,0.507017,0.959263,0.649105,-0.813497
1,-0.751879,0.588357,0.204476,-0.057229
2,-0.376269,-1.391675,-0.203795,0.123562
3,-0.143103,0.680273,-1.187362,-1.662709
4,-1.022351,0.24841,2.797055,0.22966
5,0.449444,-0.244849,-1.414926,0.995995
6,1.036284,-0.990939,0.235488,-0.28333
7,-0.82472,-0.293675,1.326577,-0.599258
8,0.7657,1.311732,-0.696388,2.32328
9,-0.894428,0.40455,0.133275,-1.703928


In [7]:
# A second, smaller (5 x 4) frame of standard-normal samples to concatenate with df.
df2 = pd.DataFrame(data=np.random.standard_normal(size=(5, 4)))
df2

Unnamed: 0,0,1,2,3
0,-0.524998,-0.113414,0.107509,0.288999
1,-0.605694,-0.637527,-0.552748,-0.756183
2,-0.53328,-1.431117,-1.092549,-0.087489
3,0.470752,1.477561,-1.22001,0.510608
4,-0.571342,-2.229518,-0.272524,0.339881


In [9]:
pd.concat([df, df2])   # row-wise concat, like SQL UNION ALL; each frame keeps its own index labels

Unnamed: 0,0,1,2,3
0,0.507017,0.959263,0.649105,-0.813497
1,-0.751879,0.588357,0.204476,-0.057229
2,-0.376269,-1.391675,-0.203795,0.123562
3,-0.143103,0.680273,-1.187362,-1.662709
4,-1.022351,0.24841,2.797055,0.22966
5,0.449444,-0.244849,-1.414926,0.995995
6,1.036284,-0.990939,0.235488,-0.28333
7,-0.82472,-0.293675,1.326577,-0.599258
8,0.7657,1.311732,-0.696388,2.32328
9,-0.894428,0.40455,0.133275,-1.703928


In [10]:
pd.concat([df, df2], ignore_index = True)   # ignore_index=True renumbers the result index sequentially from 0

Unnamed: 0,0,1,2,3
0,0.507017,0.959263,0.649105,-0.813497
1,-0.751879,0.588357,0.204476,-0.057229
2,-0.376269,-1.391675,-0.203795,0.123562
3,-0.143103,0.680273,-1.187362,-1.662709
4,-1.022351,0.24841,2.797055,0.22966
5,0.449444,-0.244849,-1.414926,0.995995
6,1.036284,-0.990939,0.235488,-0.28333
7,-0.82472,-0.293675,1.326577,-0.599258
8,0.7657,1.311732,-0.696388,2.32328
9,-0.894428,0.40455,0.133275,-1.703928


> Join (결합)

- SQL 방식으로 병합합니다.
- 데이터베이스 스타일 결합 부분을 참고하세요.

- Case1)

In [11]:
# Left table for case 1: both rows carry the same join key, 'foo'.
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [12]:
# Right table for case 1: again two rows, both keyed 'foo'.
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [13]:
# -------------------------------------------------------------------------------------------
# How to join: merge the two frames on the shared 'key' column.
# Both frames hold 'foo' twice, so every left row pairs with every right row
# (2 x 2 = 4 result rows).
pd.merge(left, right, on = 'key')
# -------------------------------------------------------------------------------------------

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


- Case2)

In [14]:
# Left table for case 2: the keys 'foo' and 'bar' are unique this time.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [15]:
# Right table for case 2, carrying the same two unique keys.
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [16]:
# Keys are unique on both sides here, so each left row matches exactly one right row.
pd.merge(left, right, on = 'key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


> 요온연습 - key 값이 두개라면???

In [40]:
# Left table for the two-key exercise: a row is identified by the (corp, key) pair.
left = pd.DataFrame(dict(
    corp=['전북', '전북', '광주', '광주'],
    key=['김민수', '이철민', '박상민', '최정훈'],
    lval=[1, 2, 3, 4],
))
left

Unnamed: 0,corp,key,lval
0,전북,김민수,1
1,전북,이철민,2
2,광주,박상민,3
3,광주,최정훈,4


In [41]:
# Right table for the two-key exercise; its last (corp, key) pair differs from the left table's.
right = pd.DataFrame(dict(
    corp=['전북', '전북', '광주', '광주'],
    key=['김민수', '이철민', '박상민', '윤동주'],
    rval=[5, 6, 7, 8],
))
right

Unnamed: 0,corp,key,rval
0,전북,김민수,5
1,전북,이철민,6
2,광주,박상민,7
3,광주,윤동주,8


In [42]:
pd.merge(left, right, on = ['corp','key'])   # default how='inner': keep only (corp, key) pairs found in both frames

Unnamed: 0,corp,key,lval,rval
0,전북,김민수,1,5
1,전북,이철민,2,6
2,광주,박상민,3,7


In [67]:
pd.merge(left, right, on = ['corp','key'], how = 'left') # left join: keep every left row; rval is NaN where right has no match

Unnamed: 0,corp,key,lval,rval
0,전북,김민수,1,5.0
1,전북,이철민,2,6.0
2,광주,박상민,3,7.0
3,광주,최정훈,4,


In [68]:
pd.merge(left, right, on = ['corp','key'], how = 'right') # right join: keep every right row; lval is NaN where left has no match

Unnamed: 0,corp,key,lval,rval
0,전북,김민수,1.0,5
1,전북,이철민,2.0,6
2,광주,박상민,3.0,7
3,광주,윤동주,,8


In [69]:
pd.merge(left, right, on = ['corp','key'], how = 'outer') # outer join: keep rows from both sides; the missing side is NaN

Unnamed: 0,corp,key,lval,rval
0,전북,김민수,1.0,5.0
1,전북,이철민,2.0,6.0
2,광주,박상민,3.0,7.0
3,광주,최정훈,4.0,
4,광주,윤동주,,8.0


> Append (추가)

- 데이터프레임에 행을 추가합니다.
- Appending 부분을 참조

In [25]:
# 8 x 4 frame of standard-normal samples with named columns, used for the append examples.
df = pd.DataFrame(
    data=np.random.standard_normal(size=(8, 4)),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,0.774455,-0.342121,-0.096118,-0.240101
1,-1.820693,-0.557898,-0.197075,-1.463134
2,-0.950169,-0.163065,-0.411959,-0.61457
3,0.27875,-0.567334,0.669363,0.17239
4,0.233632,-1.412806,-0.904353,1.073539
5,0.541722,-0.116494,0.805444,1.510962
6,-1.613717,-0.574312,-0.752844,0.961154
7,-0.045933,0.355505,0.349007,-1.964868


In [26]:
# Pull out the row at position 3 as a Series; its name is the row label, 3.
s = df.iloc[3]
s

A    0.278750
B   -0.567334
C    0.669363
D    0.172390
Name: 3, dtype: float64

In [27]:
# DataFrame.append is deprecated and was removed in pandas 2.0.
# Turn the Series into a one-row frame (to_frame().T keeps its name, 3, as the
# row label) and concatenate it; the result matches what append produced.
pd.concat([df, s.to_frame().T])

  df.append(s)


Unnamed: 0,A,B,C,D
0,0.774455,-0.342121,-0.096118,-0.240101
1,-1.820693,-0.557898,-0.197075,-1.463134
2,-0.950169,-0.163065,-0.411959,-0.61457
3,0.27875,-0.567334,0.669363,0.17239
4,0.233632,-1.412806,-0.904353,1.073539
5,0.541722,-0.116494,0.805444,1.510962
6,-1.613717,-0.574312,-0.752844,0.961154
7,-0.045933,0.355505,0.349007,-1.964868
3,0.27875,-0.567334,0.669363,0.17239


In [28]:
# DataFrame.append is deprecated and was removed in pandas 2.0; concatenate a
# one-row frame built from the Series instead.
# ignore_index=True drops the row label 3 and renumbers the result from 0.
pd.concat([df, s.to_frame().T], ignore_index=True)

  df.append(s, ignore_index=True)


Unnamed: 0,A,B,C,D
0,0.774455,-0.342121,-0.096118,-0.240101
1,-1.820693,-0.557898,-0.197075,-1.463134
2,-0.950169,-0.163065,-0.411959,-0.61457
3,0.27875,-0.567334,0.669363,0.17239
4,0.233632,-1.412806,-0.904353,1.073539
5,0.541722,-0.116494,0.805444,1.510962
6,-1.613717,-0.574312,-0.752844,0.961154
7,-0.045933,0.355505,0.349007,-1.964868
8,0.27875,-0.567334,0.669363,0.17239


# 7. Grouping (그룹화)
> 그룹화는 다음 단계 중 하나 이상을 포함하는 과정을 가리킵니다.

- 몇몇 기준에 따라 여러 그룹으로 데이터를 분할 (splitting)
- 각 그룹에 독립적으로 함수를 적용 (applying)
- 결과물들을 하나의 데이터 구조로 결합 (combining)

In [29]:
# Toy data for groupby: two label columns (A, B) and two random numeric columns (C, D).
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

Unnamed: 0,A,B,C,D
0,foo,one,0.672229,0.882191
1,bar,one,0.87938,-0.299996
2,foo,two,-0.40244,-0.536585
3,bar,three,-0.03436,0.753506
4,foo,two,-1.951066,1.677801
5,bar,two,0.451634,0.138925
6,foo,one,0.453375,2.244439
7,foo,three,-0.781698,-0.131664


In [30]:
# Sum the numeric columns (C, D) for each value of A.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops the
# string column B silently; without it B would be aggregated as well.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.296653,0.592435
foo,-2.0096,4.136182


In [31]:
# Group by several columns at once and compute the per-group sums.
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.87938,-0.299996
bar,three,-0.03436,0.753506
bar,two,0.451634,0.138925
foo,one,1.125604,3.12663
foo,three,-0.781698,-0.131664
foo,two,-2.353506,1.141216


# 8. Reshaping (변형)
> Stack (스택)

- stack() 메소드는 데이터프레임 열들의 계층을 “압축”합니다.
- “Stack된” 데이터프레임 또는 (MultiIndex를 인덱스로 사용하는) Series인 경우, stack()의 역 연산은 unstack()이며, 기본적으로 마지막 계층을 unstack합니다.

In [44]:
# Pair each outer label with its inner label to get the (first, second) index tuples.
tuples = list(zip(
    ['bar1', 'bar1', 'bar2', 'bar2', 'bar3', 'bar3', 'bar4', 'bar4'],
    ['val1', 'val2', 'val3', 'val4', 'val5', 'val6', 'val7', 'val8'],
))
tuples

[('bar1', 'val1'),
 ('bar1', 'val2'),
 ('bar2', 'val3'),
 ('bar2', 'val4'),
 ('bar3', 'val5'),
 ('bar3', 'val6'),
 ('bar4', 'val7'),
 ('bar4', 'val8')]

In [45]:
# Turn the tuples into a two-level MultiIndex whose levels are named 'first' and 'second'.
index = pd.MultiIndex.from_tuples(tuples, names = ['first','second'])
index

MultiIndex([('bar1', 'val1'),
            ('bar1', 'val2'),
            ('bar2', 'val3'),
            ('bar2', 'val4'),
            ('bar3', 'val5'),
            ('bar3', 'val6'),
            ('bar4', 'val7'),
            ('bar4', 'val8')],
           names=['first', 'second'])

In [46]:
# 8 x 2 frame of random values with columns A and B, using the MultiIndex as row labels.
df = pd.DataFrame(np.random.randn(8,2), index = index, columns = ['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar1,val1,-1.47407,1.12501
bar1,val2,0.779974,-0.18737
bar2,val3,0.931927,-1.443723
bar2,val4,1.810525,-0.234949
bar3,val5,-0.499298,-0.432344
bar3,val6,-0.803701,2.688719
bar4,val7,-1.568793,0.052791
bar4,val8,-0.738812,0.273234


In [48]:
# Keep only the first four rows (the bar1 and bar2 groups).
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar1,val1,-1.47407,1.12501
bar1,val2,0.779974,-0.18737
bar2,val3,0.931927,-1.443723
bar2,val4,1.810525,-0.234949


In [49]:
# stack() moves the column labels (A, B) into a new innermost index level, giving a Series.
stacked = df2.stack()
stacked

first  second   
bar1   val1    A   -1.474070
               B    1.125010
       val2    A    0.779974
               B   -0.187370
bar2   val3    A    0.931927
               B   -1.443723
       val4    A    1.810525
               B   -0.234949
dtype: float64

In [50]:
# With no argument, unstack() pivots the innermost index level (A/B) back into columns.
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar1,val1,-1.47407,1.12501
bar1,val2,0.779974,-0.18737
bar2,val3,0.931927,-1.443723
bar2,val4,1.810525,-0.234949


In [51]:
stacked.unstack(0)   # pass a level position to choose which index level becomes columns; 0 is 'first'

Unnamed: 0_level_0,first,bar1,bar2
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
val1,A,-1.47407,
val1,B,1.12501,
val2,A,0.779974,
val2,B,-0.18737,
val3,A,,0.931927
val3,B,,-1.443723
val4,A,,1.810525
val4,B,,-0.234949


In [52]:
# unstack(1) pivots index level 1 ('second') into the columns.
stacked.unstack(1)

Unnamed: 0_level_0,second,val1,val2,val3,val4
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bar1,A,-1.47407,0.779974,,
bar1,B,1.12501,-0.18737,,
bar2,A,,,0.931927,1.810525
bar2,B,,,-1.443723,-0.234949


> Pivot Tables (피봇 테이블)

In [53]:
# 12-row sample for pivot tables: three label columns (A, B, C) and two random value columns (D, E).
df = pd.DataFrame(dict(
    A=['1st', '1st', '2nd', '3rd'] * 3,
    B=['A', 'B', 'C'] * 4,
    C=(['광주은행'] * 3 + ['전북은행'] * 3) * 2,
    D=np.random.randn(12),
    E=np.random.randn(12),
))
df

Unnamed: 0,A,B,C,D,E
0,1st,A,광주은행,-1.163126,0.399966
1,1st,B,광주은행,0.105966,0.344362
2,2nd,C,광주은행,0.485866,-0.433713
3,3rd,A,전북은행,0.220632,0.32962
4,1st,B,전북은행,-1.318433,-0.007611
5,1st,C,전북은행,0.021918,-0.387756
6,2nd,A,광주은행,1.206611,-0.539277
7,3rd,B,광주은행,-1.899877,1.80046
8,1st,C,광주은행,0.427795,-0.665345
9,1st,A,전북은행,-0.243899,-0.581432


In [54]:
# -------------------------------------------------------------------------------------------
# How to build a pivot table: rows are (A, B) pairs, columns are the values of C,
# and each cell holds the D value for that combination (NaN when there is none).
pd.pivot_table(df, values = 'D', index = ['A','B'], columns = ['C'])
# -------------------------------------------------------------------------------------------

Unnamed: 0_level_0,C,광주은행,전북은행
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
1st,A,-1.163126,-0.243899
1st,B,0.105966,-1.318433
1st,C,0.427795,0.021918
2nd,A,1.206611,
2nd,B,,-0.587564
2nd,C,0.485866,
3rd,A,,0.220632
3rd,B,-1.899877,
3rd,C,,0.676814


In [55]:
# Same pivot with B and C swapped: rows are (A, C) pairs, columns are the values of B.
pd.pivot_table(df, values = 'D', index = ['A','C'], columns = ['B'])

Unnamed: 0_level_0,B,A,B,C
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1st,광주은행,-1.163126,0.105966,0.427795
1st,전북은행,-0.243899,-1.318433,0.021918
2nd,광주은행,1.206611,,0.485866
2nd,전북은행,,-0.587564,
3rd,광주은행,,-1.899877,
3rd,전북은행,0.220632,,0.676814


# 9. Time Series (시계열)
- Pandas는 빈도 변환 (예시 : 초 단위 데이터를 5분 단위 데이터로 변환) 과정에서 리샘플링 연산을 수행하기 위한 간단하고, 강력하며, 효율적인 기능을 제공합니다.
- 이는 재무 (금융) 응용에서 매우 일반적이지만 이에 국한되지는 않습니다.
- 시계열 부분을 참고하세요.

In [56]:
# 100 timestamps at one-second spacing, starting at 2022-05-30 00:00:00.
# The lowercase alias 's' is used because the uppercase 'S' alias is
# deprecated in pandas 2.2 and later.
rng = pd.date_range('5/30/2022', periods=100, freq='s')
rng

DatetimeIndex(['2022-05-30 00:00:00', '2022-05-30 00:00:01',
               '2022-05-30 00:00:02', '2022-05-30 00:00:03',
               '2022-05-30 00:00:04', '2022-05-30 00:00:05',
               '2022-05-30 00:00:06', '2022-05-30 00:00:07',
               '2022-05-30 00:00:08', '2022-05-30 00:00:09',
               '2022-05-30 00:00:10', '2022-05-30 00:00:11',
               '2022-05-30 00:00:12', '2022-05-30 00:00:13',
               '2022-05-30 00:00:14', '2022-05-30 00:00:15',
               '2022-05-30 00:00:16', '2022-05-30 00:00:17',
               '2022-05-30 00:00:18', '2022-05-30 00:00:19',
               '2022-05-30 00:00:20', '2022-05-30 00:00:21',
               '2022-05-30 00:00:22', '2022-05-30 00:00:23',
               '2022-05-30 00:00:24', '2022-05-30 00:00:25',
               '2022-05-30 00:00:26', '2022-05-30 00:00:27',
               '2022-05-30 00:00:28', '2022-05-30 00:00:29',
               '2022-05-30 00:00:30', '2022-05-30 00:00:31',
               '2022-05-

In [58]:
# One random integer in [0, 500) for each timestamp in rng.
ts = pd.Series(np.random.randint(0,500, len(rng)), index = rng)
ts

2022-05-30 00:00:00    208
2022-05-30 00:00:01     29
2022-05-30 00:00:02     40
2022-05-30 00:00:03    366
2022-05-30 00:00:04    452
                      ... 
2022-05-30 00:01:35    212
2022-05-30 00:01:36    335
2022-05-30 00:01:37     77
2022-05-30 00:01:38    177
2022-05-30 00:01:39    142
Freq: S, Length: 100, dtype: int32

In [60]:
# Five consecutive calendar days starting at midnight on 2022-06-01.
rng = pd.date_range(start='6/1/2022 00:00', periods=5, freq='D')
rng

DatetimeIndex(['2022-06-01', '2022-06-02', '2022-06-03', '2022-06-04',
               '2022-06-05'],
              dtype='datetime64[ns]', freq='D')

In [61]:
# Attach the UTC time zone to the naive index; the clock readings stay the same (+00:00).
ts_utc = ts.tz_localize('UTC')
ts_utc

2022-05-30 00:00:00+00:00    208
2022-05-30 00:00:01+00:00     29
2022-05-30 00:00:02+00:00     40
2022-05-30 00:00:03+00:00    366
2022-05-30 00:00:04+00:00    452
                            ... 
2022-05-30 00:01:35+00:00    212
2022-05-30 00:01:36+00:00    335
2022-05-30 00:01:37+00:00     77
2022-05-30 00:01:38+00:00    177
2022-05-30 00:01:39+00:00    142
Freq: S, Length: 100, dtype: int32

In [62]:
# Build quarterly periods from 2012Q1 through 2022Q4 (fiscal year ending in November).
prng = pd.period_range(start='2012Q1', end='2022Q4', freq='Q-NOV')
prng

PeriodIndex(['2012Q1', '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2',
             '2013Q3', '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4',
             '2015Q1', '2015Q2', '2015Q3', '2015Q4', '2016Q1', '2016Q2',
             '2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4',
             '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2',
             '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
             '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2',
             '2022Q3', '2022Q4'],
            dtype='period[Q-NOV]')

In [63]:
# One standard-normal value per quarter, indexed by the quarterly periods in prng.
ts = pd.Series(np.random.randn(len(prng)), prng)
ts

2012Q1    1.118930
2012Q2    0.174644
2012Q3   -2.133065
2012Q4    0.282291
2013Q1    0.766231
2013Q2    1.366011
2013Q3    0.255132
2013Q4    0.555292
2014Q1    0.934709
2014Q2   -1.391521
2014Q3    2.419987
2014Q4   -2.711380
2015Q1   -0.556733
2015Q2    0.739663
2015Q3    0.363120
2015Q4   -0.908365
2016Q1    0.403723
2016Q2    2.090726
2016Q3   -1.355577
2016Q4   -0.763579
2017Q1    0.803687
2017Q2   -0.233688
2017Q3    0.891252
2017Q4    1.730880
2018Q1    0.273404
2018Q2    0.308909
2018Q3    0.435336
2018Q4    0.084522
2019Q1   -0.141372
2019Q2    0.326688
2019Q3   -0.005677
2019Q4   -0.458595
2020Q1   -0.017002
2020Q2    0.538721
2020Q3   -0.105123
2020Q4    0.043269
2021Q1   -1.045308
2021Q2    0.121093
2021Q3    0.254331
2021Q4   -0.689470
2022Q1    0.243207
2022Q2   -1.228458
2022Q3   -0.582923
2022Q4    0.199556
Freq: Q-NOV, dtype: float64

In [64]:
# Show the quarterly PeriodIndex again for reference.
prng

PeriodIndex(['2012Q1', '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2',
             '2013Q3', '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4',
             '2015Q1', '2015Q2', '2015Q3', '2015Q4', '2016Q1', '2016Q2',
             '2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4',
             '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2',
             '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
             '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2',
             '2022Q3', '2022Q4'],
            dtype='period[Q-NOV]')

In [65]:
# Relabel each quarter as 09:00 on the first day of the month after the quarter ends:
#   asfreq('M', 'e')  -> last month of the quarter
#   + 1               -> the following month
#   asfreq('H', 's')  -> first hour of that month
#   + 9               -> nine hours later, i.e. 09:00
ts.index = (prng.asfreq('M','e') + 1).asfreq('H', 's') + 9
ts.index

PeriodIndex(['2012-03-01 09:00', '2012-06-01 09:00', '2012-09-01 09:00',
             '2012-12-01 09:00', '2013-03-01 09:00', '2013-06-01 09:00',
             '2013-09-01 09:00', '2013-12-01 09:00', '2014-03-01 09:00',
             '2014-06-01 09:00', '2014-09-01 09:00', '2014-12-01 09:00',
             '2015-03-01 09:00', '2015-06-01 09:00', '2015-09-01 09:00',
             '2015-12-01 09:00', '2016-03-01 09:00', '2016-06-01 09:00',
             '2016-09-01 09:00', '2016-12-01 09:00', '2017-03-01 09:00',
             '2017-06-01 09:00', '2017-09-01 09:00', '2017-12-01 09:00',
             '2018-03-01 09:00', '2018-06-01 09:00', '2018-09-01 09:00',
             '2018-12-01 09:00', '2019-03-01 09:00', '2019-06-01 09:00',
             '2019-09-01 09:00', '2019-12-01 09:00', '2020-03-01 09:00',
             '2020-06-01 09:00', '2020-09-01 09:00', '2020-12-01 09:00',
             '2021-03-01 09:00', '2021-06-01 09:00', '2021-09-01 09:00',
             '2021-12-01 09:00', '2022-03-01 09:00'

In [66]:
# Preview the first five rows with their new hourly period labels.
ts.head()

2012-03-01 09:00    1.118930
2012-06-01 09:00    0.174644
2012-09-01 09:00   -2.133065
2012-12-01 09:00    0.282291
2013-03-01 09:00    0.766231
Freq: H, dtype: float64

# 10. Categoricals (범주화)
- Pandas는 데이터프레임 내에 범주형 데이터를 포함할 수 있습니다.
- 범주형 소개 와 API 문서 부분을 참조하세요.

In [71]:
# Six records with a raw letter grade each; the grades are turned into categories below.
df = pd.DataFrame(dict(
    id=[1, 2, 3, 4, 5, 6],
    raw_grade=['a', 'b', 'b', 'a', 'd', 'e'],
))
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,d
5,6,e


In [72]:
# Add a categorical copy of raw_grade as the new column 'grade'.
df['grade'] = df['raw_grade'].astype('category')
df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,d,d
5,6,e,e


In [73]:
# Inspect the new column: its dtype is category, with four categories.
df['grade']

0    a
1    b
2    b
3    a
4    d
5    e
Name: grade, dtype: category
Categories (4, object): ['a', 'b', 'd', 'e']

In [74]:
# Relabel the four categories in order: a, b, d, e -> very good, good, bad, very bad.
# Assigning to `.cat.categories` is deprecated and was removed in pandas 2.0;
# rename_categories returns the relabelled column, which is assigned back.
df['grade'] = df['grade'].cat.rename_categories(['very good','good','bad','very bad'])
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,d,bad
5,6,e,very bad


In [75]:
# Reorder the categories from worst to best and add the so-far unused 'medium'.
# set_categories returns a new column, so the result is assigned back.
df['grade'] = df['grade'].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,d,bad
5,6,e,very bad


In [76]:
# Inspect the column again: five categories now, in the order given to set_categories.
df['grade']

0    very good
1         good
2         good
3    very good
4          bad
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [77]:
# Sorting a categorical column follows the category order (very bad ... very good), not alphabetical order.
df.sort_values(by = 'grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
4,5,d,bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good


In [78]:
# Count rows per grade. observed=False keeps categories that have no rows
# (here 'medium' -> 0). It is spelled out because the groupby default for
# categorical keys is changing to observed=True in newer pandas.
df.groupby('grade', observed=False).size()

grade
very bad     1
bad          1
medium       0
good         2
very good    2
dtype: int64