# Industry 4.0 의 중심, BigData

<div align='right'><font size=2 color='gray'>Data Processing Based Python @ <font color='blue'><a href='https://www.facebook.com/jskim.kr'>FB / jskim.kr</a></font>, <a href='mailto:bigpycraft@gmail.com'>김진수</a></font></div>
<hr>

# Pandas Advanced 

> ### INDEX
> - Merge (병합)
> - Grouping (그룹화)
> - Reshaping (변형)
> - Time Series (시계열)
> - Categoricals (범주화)


In [1]:
import numpy as np
import pandas as pd

## 6. Merge (병합)

> Concat (연결)
> - pandas 는 Series, 데이터프레임, Panel (최신 pandas 에서는 제거됨) 객체를 손쉽게 결합할 수 있는 다양한 기능을 제공합니다. 결합 (join) / 병합 (merge) 형태의 연산에서는 인덱스에 대한 여러 가지 집합 논리와 관계 대수 기능을 사용할 수 있습니다.

In [2]:
# Build a 10x4 frame of standard-normal samples with default integer labels.
df = pd.DataFrame(data=np.random.standard_normal(size=(10, 4)))
df

Unnamed: 0,0,1,2,3
0,0.53243,-1.021865,-0.211462,-0.405714
1,0.016611,-0.995084,-0.43823,0.175143
2,1.419739,0.208713,-0.275591,0.078142
3,-0.973241,-0.731984,0.439926,-0.035265
4,0.768878,1.442249,1.167291,0.972574
5,-0.161379,-0.992797,-0.078373,-0.382872
6,-0.440381,-0.904445,-0.306625,-0.25941
7,0.58485,0.590291,-0.489082,-0.044299
8,0.162276,-0.104564,0.15875,-0.763762
9,-1.211034,1.745078,-0.01764,-0.39078


In [3]:
# Split the frame into three consecutive row chunks:
# the first three rows, the next four, and everything after that.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0  0.532430 -1.021865 -0.211462 -0.405714
 1  0.016611 -0.995084 -0.438230  0.175143
 2  1.419739  0.208713 -0.275591  0.078142,
           0         1         2         3
 3 -0.973241 -0.731984  0.439926 -0.035265
 4  0.768878  1.442249  1.167291  0.972574
 5 -0.161379 -0.992797 -0.078373 -0.382872
 6 -0.440381 -0.904445 -0.306625 -0.259410,
           0         1         2         3
 7  0.584850  0.590291 -0.489082 -0.044299
 8  0.162276 -0.104564  0.158750 -0.763762
 9 -1.211034  1.745078 -0.017640 -0.390780]

In [4]:
# Stack the chunks back together row-wise (axis 0 is the default).
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,0.53243,-1.021865,-0.211462,-0.405714
1,0.016611,-0.995084,-0.43823,0.175143
2,1.419739,0.208713,-0.275591,0.078142
3,-0.973241,-0.731984,0.439926,-0.035265
4,0.768878,1.442249,1.167291,0.972574
5,-0.161379,-0.992797,-0.078373,-0.382872
6,-0.440381,-0.904445,-0.306625,-0.25941
7,0.58485,0.590291,-0.489082,-0.044299
8,0.162276,-0.104564,0.15875,-0.763762
9,-1.211034,1.745078,-0.01764,-0.39078


> Join (결합)
> - SQL 방식으로 병합합니다. 
> - 데이터베이스 스타일 결합 부분을 참고하세요.

##### CASE1

In [5]:
# Left-hand table for CASE1: both rows carry the same join key 'foo'.
left = pd.DataFrame(
    {
        'key': ['foo'] * 2,
        'lval': [1, 2],
    }
)
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [6]:
# Right-hand table for CASE1: both rows carry the same join key 'foo'.
right = pd.DataFrame(
    {
        'key': ['foo'] * 2,
        'rval': [4, 5],
    }
)
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [7]:
# Inner-join the two tables on 'key'; duplicate keys on both sides yield
# every left/right row combination.
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


##### CASE2

In [8]:
# Left-hand table for CASE2: each row has a distinct join key.
left = pd.DataFrame(
    {
        'key': ['foo', 'bar'],
        'lval': [1, 2],
    }
)
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [9]:
# Right-hand table for CASE2: each row has a distinct join key.
right = pd.DataFrame(
    {
        'key': ['foo', 'bar'],
        'rval': [4, 5],
    }
)
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [10]:
# Inner-join the two tables on 'key'; with unique keys each row matches once.
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


> Append (추가)
> - 데이터프레임에 행을 추가합니다. 
> - Appending 부분을 참조

In [11]:
# Build an 8x4 frame of standard-normal samples with columns A-D.
df = pd.DataFrame(
    data=np.random.standard_normal(size=(8, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,2.241368,1.247555,0.297152,0.189754
1,-0.25759,0.116652,1.776055,-0.262084
2,0.997986,0.883636,1.856861,1.528528
3,0.352967,1.106009,0.76479,-0.853713
4,1.877853,0.285144,1.484966,-1.3685
5,-0.183434,0.003366,1.021911,-1.16626
6,-1.545378,-1.582241,-0.34199,-0.310718
7,0.255094,-1.556955,0.631254,-0.918404


In [12]:
# Pull out the row at position 3 as a Series indexed by the column labels.
s = df.iloc[3, :]
s

A    0.352967
B    1.106009
C    0.764790
D   -0.853713
Name: 3, dtype: float64

In [13]:
# Append the row `s` to the end of `df` and renumber the index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# row is added with pd.concat instead. `s` is first turned into a one-row
# frame so that its labels line up with df's columns.
pd.concat([df, s.to_frame().T], ignore_index=True)

  df.append(s, ignore_index=True)


Unnamed: 0,A,B,C,D
0,2.241368,1.247555,0.297152,0.189754
1,-0.25759,0.116652,1.776055,-0.262084
2,0.997986,0.883636,1.856861,1.528528
3,0.352967,1.106009,0.76479,-0.853713
4,1.877853,0.285144,1.484966,-1.3685
5,-0.183434,0.003366,1.021911,-1.16626
6,-1.545378,-1.582241,-0.34199,-0.310718
7,0.255094,-1.556955,0.631254,-0.918404
8,0.352967,1.106009,0.76479,-0.853713


## 7. Grouping (그룹화)

> 그룹화는 다음 단계 중 하나 이상을 포함하는 과정을 가리킵니다.
> - 몇몇 기준에 따라 여러 그룹으로 데이터를 분할 (splitting)
> - 각 그룹에 독립적으로 함수를 적용 (applying)
> - 결과물들을 하나의 데이터 구조로 결합 (combining)

In [14]:
# Sample frame for grouping: two label columns (A, B) and two columns of
# standard-normal samples (C, D).
df = pd.DataFrame(
    dict(
        A='foo bar foo bar foo bar foo foo'.split(),
        B='one one two three two two one three'.split(),
        C=np.random.standard_normal(size=8),
        D=np.random.standard_normal(size=8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.310513,0.484804
1,bar,one,0.761116,0.755089
2,foo,two,1.377937,0.182627
3,bar,three,0.029388,1.195826
4,foo,two,0.634413,0.315054
5,bar,two,0.368382,-0.59391
6,foo,one,-0.318653,-0.847164
7,foo,three,-1.465674,1.887051


In [15]:
# Group the frame by column A and sum each group.
# numeric_only=True restricts the sum to the numeric columns (C, D); without
# it, pandas 2.x also "sums" the string column B by concatenating its values
# instead of dropping it as older versions did.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.158886,1.357005
foo,0.538535,2.022371


In [16]:
# Grouping by several columns produces a hierarchical (multi-level) index.
df.groupby(by=['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.761116,0.755089
bar,three,0.029388,1.195826
bar,two,0.368382,-0.59391
foo,one,-0.00814,-0.36236
foo,three,-1.465674,1.887051
foo,two,2.012349,0.497681


## 8. Reshaping (변형)

> Stack (스택)
> - stack() 메소드는 데이터프레임 열들의 계층을 “압축”합니다.
> - “Stack된” 데이터프레임 또는 (MultiIndex를 인덱스로 사용하는) Series인 경우, stack()의 역 연산은 unstack()이며, 기본적으로 마지막 계층을 unstack합니다.

In [17]:
# Pair every first-level label with both second-level labels, in order.
tuples = [
    (first, second)
    for first in ('bar', 'baz', 'foo', 'qux')
    for second in ('one', 'two')
]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [18]:
# Turn the label pairs into a two-level index with named levels.
index = pd.MultiIndex.from_tuples(
    tuples,
    names=['first', 'second'],
)
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [19]:
# Frame of standard-normal samples: 8 rows labelled by `index`, columns A and B.
df = pd.DataFrame(
    data=np.random.standard_normal(size=(8, 2)),
    index=index,
    columns=list('AB'),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.274982,-1.631127
bar,two,0.719825,-0.645236
baz,one,0.486078,1.359231
baz,two,-0.636085,-0.536705
foo,one,-2.320144,0.56096
foo,two,0.927683,-1.24995
qux,one,-0.330941,-0.271011
qux,two,-0.222199,0.527607


In [20]:
# Keep only the first four rows for the stack/unstack examples.
df2 = df.iloc[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.274982,-1.631127
bar,two,0.719825,-0.645236
baz,one,0.486078,1.359231
baz,two,-0.636085,-0.536705


In [21]:
# Move the innermost column level into the row index.
stacked = df2.stack(level=-1)
stacked

first  second   
bar    one     A    0.274982
               B   -1.631127
       two     A    0.719825
               B   -0.645236
baz    one     A    0.486078
               B    1.359231
       two     A   -0.636085
               B   -0.536705
dtype: float64

In [22]:
# Undo the stack: pivot the last index level back into columns.
stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.274982,-1.631127
bar,two,0.719825,-0.645236
baz,one,0.486078,1.359231
baz,two,-0.636085,-0.536705


In [23]:
# Pivot index level 1 into columns instead of the last level.
stacked.unstack(level=1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.274982,0.719825
bar,B,-1.631127,-0.645236
baz,A,0.486078,-0.636085
baz,B,1.359231,-0.536705


In [24]:
# Pivot index level 0 into columns.
stacked.unstack(level=0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.274982,0.486078
one,B,-1.631127,1.359231
two,A,0.719825,-0.636085
two,B,-0.645236,-0.536705


> Pivot Tables (피봇 테이블)

In [25]:
# Sample frame for the pivot-table example: three label columns built by
# repeating short patterns to 12 rows, plus two columns of random values.
df = pd.DataFrame(
    dict(
        A=['one', 'one', 'two', 'three'] * 3,
        B=list('ABC') * 4,
        C=(['foo'] * 3 + ['bar'] * 3) * 2,
        D=np.random.standard_normal(size=12),
        E=np.random.standard_normal(size=12),
    )
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.330764,-0.34152
1,one,B,foo,-0.459271,-1.520345
2,two,C,foo,2.22471,-1.591963
3,three,A,bar,3.094003,-0.89966
4,one,B,bar,0.536076,-1.361001
5,one,C,bar,-0.471899,-1.624881
6,two,A,foo,-0.606844,-0.250344
7,three,B,foo,-0.454704,-1.263489
8,one,C,foo,-1.105477,-0.762186
9,one,A,bar,0.93216,1.858974


In [26]:
# Pivot D into a table with (A, B) as rows and the values of C as columns.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.93216,0.330764
one,B,0.536076,-0.459271
one,C,-0.471899,-1.105477
three,A,3.094003,
three,B,,-0.454704
three,C,1.098933,
two,A,,-0.606844
two,B,0.122197,
two,C,,2.22471


## 9. Time Series (시계열)

> - Pandas는 빈도 변환 (예시 : 초 단위 데이터를 5분 단위 데이터로 변환) 중에 수행하는 리샘플링 연산을 위한 간단하고, 강력하며, 효율적인 함수를 제공합니다. 
> - 이는 재무 (금융) 응용에서 매우 일반적이지만 이에 국한되지는 않습니다. 
> - 시계열 부분을 참고하세요.

In [27]:
# One timestamp per second for 100 seconds, starting at 2022-01-01 00:00:00.
# The lowercase 's' alias is used because the uppercase 'S' spelling is
# deprecated as of pandas 2.2; 's' is accepted by older versions as well.
rng = pd.date_range('1/1/2022', periods=100, freq='s')
rng

DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 00:00:01',
               '2022-01-01 00:00:02', '2022-01-01 00:00:03',
               '2022-01-01 00:00:04', '2022-01-01 00:00:05',
               '2022-01-01 00:00:06', '2022-01-01 00:00:07',
               '2022-01-01 00:00:08', '2022-01-01 00:00:09',
               '2022-01-01 00:00:10', '2022-01-01 00:00:11',
               '2022-01-01 00:00:12', '2022-01-01 00:00:13',
               '2022-01-01 00:00:14', '2022-01-01 00:00:15',
               '2022-01-01 00:00:16', '2022-01-01 00:00:17',
               '2022-01-01 00:00:18', '2022-01-01 00:00:19',
               '2022-01-01 00:00:20', '2022-01-01 00:00:21',
               '2022-01-01 00:00:22', '2022-01-01 00:00:23',
               '2022-01-01 00:00:24', '2022-01-01 00:00:25',
               '2022-01-01 00:00:26', '2022-01-01 00:00:27',
               '2022-01-01 00:00:28', '2022-01-01 00:00:29',
               '2022-01-01 00:00:30', '2022-01-01 00:00:31',
               '2022-01-

In [28]:
# Random integers in [0, 500), one per timestamp in `rng`.
ts = pd.Series(
    data=np.random.randint(low=0, high=500, size=len(rng)),
    index=rng,
)
ts

2022-01-01 00:00:00    127
2022-01-01 00:00:01    180
2022-01-01 00:00:02     90
2022-01-01 00:00:03    337
2022-01-01 00:00:04     47
                      ... 
2022-01-01 00:01:35    342
2022-01-01 00:01:36    488
2022-01-01 00:01:37     32
2022-01-01 00:01:38     74
2022-01-01 00:01:39     10
Freq: S, Length: 100, dtype: int32

In [29]:
# Time zone representation: start from five consecutive daily timestamps.
rng = pd.date_range(start='6/1/2022 00:00', periods=5, freq='D')
rng

DatetimeIndex(['2022-06-01', '2022-06-02', '2022-06-03', '2022-06-04',
               '2022-06-05'],
              dtype='datetime64[ns]', freq='D')

In [30]:
# One standard-normal sample per timestamp in `rng`.
ts = pd.Series(
    data=np.random.standard_normal(size=len(rng)),
    index=rng,
)
ts

2022-06-01    1.191766
2022-06-02    0.170214
2022-06-03   -0.932116
2022-06-04   -0.831222
2022-06-05   -1.355124
Freq: D, dtype: float64

In [31]:
# Attach the UTC time zone to the index of the series.
ts_utc = ts.tz_localize(tz='UTC')
ts_utc

2022-06-01 00:00:00+00:00    1.191766
2022-06-02 00:00:00+00:00    0.170214
2022-06-03 00:00:00+00:00   -0.932116
2022-06-04 00:00:00+00:00   -0.831222
2022-06-05 00:00:00+00:00   -1.355124
Freq: D, dtype: float64

In [32]:
# Convert to another time zone.
ts_utc.tz_convert(tz='US/Eastern')

2022-05-31 20:00:00-04:00    1.191766
2022-06-01 20:00:00-04:00    0.170214
2022-06-02 20:00:00-04:00   -0.932116
2022-06-03 20:00:00-04:00   -0.831222
2022-06-04 20:00:00-04:00   -1.355124
Freq: D, dtype: float64

In [33]:
# Convert between timestamp and period representations.
# Five month-end timestamps starting from January 2022. An explicit MonthEnd
# offset replaces the 'M' string alias, which pandas 2.2 deprecated for date
# offsets (renamed to 'ME'); the offset object works across versions.
rng = pd.date_range('1/1/2022', periods=5, freq=pd.offsets.MonthEnd())
rng

DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31'],
              dtype='datetime64[ns]', freq='M')

In [34]:
# One standard-normal sample per timestamp in `rng`.
ts = pd.Series(
    data=np.random.standard_normal(size=len(rng)),
    index=rng,
)
ts

2022-01-31    0.952553
2022-02-28   -0.751300
2022-03-31   -0.021290
2022-04-30    0.649988
2022-05-31   -0.767996
Freq: M, dtype: float64

In [35]:
# Re-express the timestamp index as periods; with freq=None the period
# frequency is taken from the index itself.
ps = ts.to_period(freq=None)
ps

2022-01    0.952553
2022-02   -0.751300
2022-03   -0.021290
2022-04    0.649988
2022-05   -0.767996
Freq: M, dtype: float64

In [36]:
# Convert periods back to timestamps, anchored at the start of each period.
ps.to_timestamp(how='start')

2022-01-01    0.952553
2022-02-01   -0.751300
2022-03-01   -0.021290
2022-04-01    0.649988
2022-05-01   -0.767996
Freq: MS, dtype: float64

> - 기간 ↔ 시간 변환은 편리한 산술 기능들을 사용할 수 있도록 만들어줍니다. 
> - 다음 예제에서, 우리는 11월에 회계연도가 끝나는 분기별 빈도를 분기말 다음 달 1일 오전 9시로 변환합니다.


In [37]:
# Quarterly periods from 2012Q1 through 2022Q4, with the year ending in November.
prng = pd.period_range(start='2012Q1', end='2022Q4', freq='Q-NOV')
prng

PeriodIndex(['2012Q1', '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2',
             '2013Q3', '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4',
             '2015Q1', '2015Q2', '2015Q3', '2015Q4', '2016Q1', '2016Q2',
             '2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4',
             '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2',
             '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
             '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2',
             '2022Q3', '2022Q4'],
            dtype='period[Q-NOV]')

In [38]:
# One standard-normal sample per quarter in `prng`.
ts = pd.Series(
    data=np.random.standard_normal(size=len(prng)),
    index=prng,
)
ts

2012Q1   -0.635535
2012Q2    0.538316
2012Q3    0.275236
2012Q4    0.540712
2013Q1   -0.180140
2013Q2   -0.669974
2013Q3    0.477079
2013Q4    0.236019
2014Q1   -0.002925
2014Q2    0.953223
2014Q3   -1.750801
2014Q4    0.066891
2015Q1    0.581959
2015Q2   -0.880031
2015Q3   -0.950649
2015Q4    0.257874
2016Q1    0.323881
2016Q2    0.800493
2016Q3    0.285600
2016Q4    0.426142
2017Q1   -2.316591
2017Q2   -1.199751
2017Q3    0.148711
2017Q4    1.414687
2018Q1    0.156217
2018Q2   -0.343269
2018Q3   -1.478434
2018Q4   -2.887448
2019Q1   -0.934674
2019Q2   -0.969834
2019Q3    0.924461
2019Q4   -2.680063
2020Q1   -1.517489
2020Q2   -0.590448
2020Q3    1.207867
2020Q4   -0.500739
2021Q1   -0.452366
2021Q2   -0.277287
2021Q3   -0.297344
2021Q4   -0.674332
2022Q1    0.279586
2022Q2    0.427561
2022Q3    0.746600
2022Q4   -2.763961
Freq: Q-NOV, dtype: float64

In [39]:
# Re-index the series at 09:00 on the first day of the month after each quarter:
#   1. asfreq('M', 'e')  -> the last month of each quarter
#   2. + 1               -> the following month
#   3. asfreq('h', 's')  -> the first hour of that month
#   4. + 9               -> nine hours later, i.e. 09:00
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated as of pandas 2.2.
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
ts.index

PeriodIndex(['2012-03-01 09:00', '2012-06-01 09:00', '2012-09-01 09:00',
             '2012-12-01 09:00', '2013-03-01 09:00', '2013-06-01 09:00',
             '2013-09-01 09:00', '2013-12-01 09:00', '2014-03-01 09:00',
             '2014-06-01 09:00', '2014-09-01 09:00', '2014-12-01 09:00',
             '2015-03-01 09:00', '2015-06-01 09:00', '2015-09-01 09:00',
             '2015-12-01 09:00', '2016-03-01 09:00', '2016-06-01 09:00',
             '2016-09-01 09:00', '2016-12-01 09:00', '2017-03-01 09:00',
             '2017-06-01 09:00', '2017-09-01 09:00', '2017-12-01 09:00',
             '2018-03-01 09:00', '2018-06-01 09:00', '2018-09-01 09:00',
             '2018-12-01 09:00', '2019-03-01 09:00', '2019-06-01 09:00',
             '2019-09-01 09:00', '2019-12-01 09:00', '2020-03-01 09:00',
             '2020-06-01 09:00', '2020-09-01 09:00', '2020-12-01 09:00',
             '2021-03-01 09:00', '2021-06-01 09:00', '2021-09-01 09:00',
             '2021-12-01 09:00', '2022-03-01 09:00'

In [40]:
# Show the first five entries of the re-indexed series.
ts.head(n=5)

2012-03-01 09:00   -0.635535
2012-06-01 09:00    0.538316
2012-09-01 09:00    0.275236
2012-12-01 09:00    0.540712
2013-03-01 09:00   -0.180140
Freq: H, dtype: float64

## 10. Categoricals (범주화)

> - Pandas는 데이터프레임 내에 범주형 데이터를 포함할 수 있습니다. 
> - 범주형 소개 와 API 문서 부분을 참조하세요.

In [41]:
# Six records, each with an id and a raw letter grade.
df = pd.DataFrame(
    {
        "id": list(range(1, 7)),
        "raw_grade": list("abbaae"),
    }
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [42]:
# Add a categorical copy of the raw grades as a new "grade" column.
df["grade"] = df["raw_grade"].astype(dtype="category")
df

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [43]:
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [44]:
# Give the categories more meaningful names; the new names are applied to the
# existing categories in order. Assigning to `.cat.categories` directly is no
# longer supported as of pandas 2.0, so rename_categories is used and the
# result is written back to the column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [45]:
# Replace the category set with the full five-step scale, in this order;
# "bad" and "medium" are added as categories with no rows.
df["grade"] = df["grade"].cat.set_categories(
    new_categories=["very bad", "bad", "medium", "good", "very good"],
)
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [46]:
# Inspect the categorical column and its updated categories.
df.loc[:, "grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [47]:
# Sort rows by grade; a categorical column sorts by category order,
# not alphabetically.
df.sort_values("grade", ascending=True)

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [48]:
# Grouping by a categorical column also shows the empty categories.
# observed=False is passed explicitly: it keeps categories that have no rows
# in the result, and newer pandas releases warn that the implicit default
# for `observed` is changing.
df.groupby("grade", observed=False).size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

<hr>
<marquee><font size=3 color='brown'>The BigpyCraft find the information to design valuable society with Technology & Craft.</font></marquee>
<div align='right'><font size=2 color='gray'> &lt; The End &gt; </font></div>