# Python for Data Analysis - part12

##### Python의 numpy, pandas 등을 정리하였으며 파이썬 라이브러리를 활용한 데이터분석(2판)을 참고하여 작성하였습니다.
##### 해당 자료는 python 3.6 기반으로 작성되었습니다.

## 12.고급 Pandas

### 12.1 Categorical Data

#### 12.1.1 개발 배경과 동기

In [1]:
import pandas as pd
import numpy as np

# A small Series of repeated fruit names, used to motivate categoricals.
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)
print(values)
print('--------------------------------------')

# Distinct values present in the Series.
print(pd.unique(values))
print('--------------------------------------')

# Frequency of each distinct value. The Series method is used rather than
# the top-level pd.value_counts(), which newer pandas releases deprecate.
print(values.value_counts())
print('--------------------------------------')

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object
--------------------------------------
['apple' 'orange']
--------------------------------------
apple     6
orange    2
dtype: int64
--------------------------------------


#### - 데이터웨어하우스의 경우 구별되는 값을 담고 있는 차원 테이블과 그 테이블을 참조하는 정수키를 사용하는 것이 일반적

In [2]:
# Dimension-table style encoding: `dim` holds the distinct labels and
# `values` holds integer keys that point into it.
dim = pd.Series(['apple', 'orange'])
values = pd.Series([0, 1, 0, 0] * 2)

# Show the integer keys, then the lookup table, then the keys mapped back
# to the original strings stored in the Series via take().
for shown in (values, dim, dim.take(values)):
    print(shown)
    print('--------------------------------------')

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64
--------------------------------------
0     apple
1    orange
dtype: object
--------------------------------------
0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object
--------------------------------------


#### 12.1.2 Pandas의 Categorical 

In [8]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)

# Column data for the demo frame. The dict is evaluated top to bottom, so
# the randint draw happens before the uniform draw.
basket_columns = {
    'fruit': fruits,
    'basket_id': np.arange(N),
    'count': np.random.randint(3, 15, size=N),
    'weight': np.random.uniform(0, 4, size=N),
}
df = pd.DataFrame(basket_columns,
                  columns=['basket_id', 'fruit', 'count', 'weight'])
print(df)
print('--------------------------------------')

# Convert the string column to the category dtype.
fruit_cat = df['fruit'].astype('category')
print(fruit_cat)
print('--------------------------------------')

# The underlying array object, followed by its categories and integer codes.
c = fruit_cat.values
for detail in (type(c), c.categories, c.codes):
    print(detail)
    print('--------------------------------------')

# Store the converted column back into the frame.
df['fruit'] = df['fruit'].astype('category')
print(df['fruit'])
print('--------------------------------------')

   basket_id   fruit  count    weight
0          0   apple      7  1.244498
1          1  orange     13  3.871715
2          2   apple      5  2.784968
3          3   apple     12  2.099764
4          4   apple     11  1.853214
5          5  orange     12  2.202083
6          6   apple      3  0.816188
7          7   apple     11  1.175845
--------------------------------------
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']
--------------------------------------
<class 'pandas.core.arrays.categorical.Categorical'>
--------------------------------------
Index(['apple', 'orange'], dtype='object')
--------------------------------------
[0 1 0 0 0 1 0 0]
--------------------------------------
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']
--------------------------------------

In [11]:
# Build a Categorical directly from a sequence of labels.
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
print(my_categories)
print('--------------------------------------')

# Alternatively, pair an explicit category list with integer codes.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes, categories)
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)

# Print the unordered result, the one built with ordered=True, and the
# unordered one converted through as_ordered().
for shown in (my_cats_2, ordered_cat, my_cats_2.as_ordered()):
    print(shown)
    print('--------------------------------------')

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']
--------------------------------------
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']
--------------------------------------
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']
--------------------------------------
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']
--------------------------------------


#### 12.1.3 Categorical 연산

In [15]:
# Fixed seed so the random sample (and everything printed below) is repeatable.
np.random.seed(12345)
draws = np.random.randn(1000)
print(draws[:5])
print('--------------------------------------')

# qcut splits the data into quantile-based bins (4 bins -> quartiles).
bins = pd.qcut(draws, 4)
print(bins)
print('--------------------------------------')

# Same split, but with readable labels instead of interval objects.
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print(bins)
print('--------------------------------------')

# Summarise the draws per quartile. `observed=False` is passed explicitly:
# grouping by a categorical otherwise relies on a default that newer pandas
# releases warn about and are changing.
bins = pd.Series(bins, name='quartile')
results = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(['count', 'min', 'max'])
    .reset_index()
)
print(results)
print('--------------------------------------')

[-0.20470766  0.47894334 -0.51943872 -0.5557303   1.96578057]
--------------------------------------
[(-0.684, -0.0101], (-0.0101, 0.63], (-0.684, -0.0101], (-0.684, -0.0101], (0.63, 3.928], ..., (-0.0101, 0.63], (-0.684, -0.0101], (-2.9499999999999997, -0.684], (-0.0101, 0.63], (0.63, 3.928]]
Length: 1000
Categories (4, interval[float64]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]
--------------------------------------
['Q2', 'Q3', 'Q2', 'Q2', 'Q4', ..., 'Q3', 'Q2', 'Q1', 'Q3', 'Q4']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
--------------------------------------
  quartile  count       min       max
0       Q1    250 -2.949343 -0.685484
1       Q2    250 -0.683066 -0.010115
2       Q3    250 -0.010032  0.628894
3       Q4    250  0.634238  3.927528
--------------------------------------


#### categorical을 이용한 성능 개선

In [16]:
# Ten million rows: large enough for the memory difference to be obvious.
N = 10_000_000
draws = pd.Series(np.random.randn(N))

# The same four labels repeated, stored as plain values and as a category.
base_labels = ['foo', 'bar', 'baz', 'qux']
labels = pd.Series(base_labels * (N // 4))
categories = labels.astype('category')

# Print the memory footprint of each representation in turn.
for series in (labels, categories):
    print(series.memory_usage())
    print('--------------------------------------')

80000128
--------------------------------------
10000320
--------------------------------------


In [17]:
# IPython %time magic (not plain Python): reports the CPU and wall time of one
# conversion of `labels` to the category dtype; the result is discarded in `_`.
%time _ = labels.astype('category')

CPU times: user 302 ms, sys: 5.53 ms, total: 308 ms
Wall time: 306 ms


#### 12.1.4 Categorical 메서드

In [22]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
print(cat_s)
print('--------------------------------------')

# The .cat accessor exposes the integer codes and the category index.
for attribute in (cat_s.cat.codes, cat_s.cat.categories):
    print(attribute)
    print('--------------------------------------')

# Widen the category set to include 'e', a label that never occurs in the data.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
print(cat_s2)
print('--------------------------------------')

# Compare value counts before and after widening the category set.
for counted in (cat_s, cat_s2):
    print(counted.value_counts())
    print('--------------------------------------')

# Keep only rows labelled 'a' or 'b'.
keep_mask = cat_s.isin(['a', 'b'])
cat_s3 = cat_s[keep_mask]
print(cat_s3)
print('--------------------------------------')

# Drop the categories that no longer appear in the filtered data.
print(cat_s3.cat.remove_unused_categories())
print('--------------------------------------')

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
--------------------------------------
0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8
--------------------------------------
Index(['a', 'b', 'c', 'd'], dtype='object')
--------------------------------------
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
--------------------------------------
d    2
c    2
b    2
a    2
dtype: int64
--------------------------------------
d    2
c    2
b    2
a    2
e    0
dtype: int64
--------------------------------------
0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
--------------------------------------
0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']
--------------------------------------


#### - categorical 메서드 : add_categories(카테고리 추가), as_ordered(순서 부여), as_unordered(순서 제거), remove_categories(카테고리 제거), remove_unused_categories(사용되지 않는 카테고리 제거), rename_categories(이름 변경), reorder_categories(카테고리 순서 재지정), set_categories(새로운 카테고리 집합으로 교체)

#### 더미값 생성(one-hot encoding) - get_dummies()

In [23]:
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')

# One-hot encode the categorical. The dtype is pinned so the indicator
# columns are numeric 0/1 on every pandas version; newer releases would
# otherwise return booleans.
dummies = pd.get_dummies(cat_s, dtype='uint8')
print(dummies)
print('--------------------------------------')

   a  b  c  d
0  1  0  0  0
1  0  1  0  0
2  0  0  1  0
3  0  0  0  1
4  1  0  0  0
5  0  1  0  0
6  0  0  1  0
7  0  0  0  1
--------------------------------------


### 12.2 고급 GroupBy 사용
#### 12.2.1 그룹 변환과 GroupBy 객체 풀어내기

In [25]:
# Twelve float values cycling through three group keys.
df = pd.DataFrame({
    'key': ['a', 'b', 'c'] * 4,
    'value': np.arange(12.),
})
print(df)
print('--------------------------------------')

# Group the 'value' column by 'key' and show the per-group mean.
g = df.groupby('key')['value']
print(g.mean())
print('--------------------------------------')

# Each entry is handed to transform() in turn: a lambda computing the group
# mean, the 'mean' string alias, a doubling lambda, and a descending rank.
transform_args = [
    lambda x: x.mean(),
    'mean',
    lambda x: x * 2,
    lambda x: x.rank(ascending=False),
]
for transform_arg in transform_args:
    print(g.transform(transform_arg))
    print('--------------------------------------')

   key  value
0    a    0.0
1    b    1.0
2    c    2.0
3    a    3.0
4    b    4.0
5    c    5.0
6    a    6.0
7    b    7.0
8    c    8.0
9    a    9.0
10   b   10.0
11   c   11.0
--------------------------------------
key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64
--------------------------------------
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
--------------------------------------
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
--------------------------------------
0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64
--------------------------------------
0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64
--------------------------------------

In [26]:
def normalize(x):
    """Return *x* centred on its mean and divided by its standard deviation.

    Both statistics come from ``x.mean()`` and ``x.std()``, so *x* must
    provide those methods and support elementwise arithmetic.
    """
    centered = x - x.mean()
    return centered / x.std()

# Run normalize on every group, first through transform() and then apply().
for per_group in (g.transform(normalize), g.apply(normalize)):
    print(per_group)
    print('--------------------------------------')

# Group mean spread back onto the original rows via the 'mean' string alias.
group_mean = g.transform('mean')
print(group_mean)
print('--------------------------------------')

# The same standardisation written with the string aliases instead of a
# user-defined function.
group_std = g.transform('std')
normalized = (df['value'] - g.transform('mean')) / group_std
print(normalized)
print('--------------------------------------')

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64
--------------------------------------
0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64
--------------------------------------
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
--------------------------------------
0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64
--------------------------------------


#### 12.2.2 시계열 그룹 리샘플링

In [29]:
# Fifteen one-minute timestamps with a running counter.
N = 15
times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)
df = pd.DataFrame({'time': times,
                   'values': np.arange(N)})
print(df)
print('--------------------------------------')

# Index by time and count the rows in each 5-minute bucket.
print(df.set_index('time').resample('5min').count())
print('--------------------------------------')

# Same timestamps, each repeated three times, with a key column cycling
# through 'a', 'b', 'c'.
df2 = pd.DataFrame({'time': times.repeat(3),
                    'key': np.tile(['a', 'b', 'c'], N),
                    'value': np.arange(N * 3)})
print(df2[:7])
print('--------------------------------------')

# Resample per key: group by 'key' together with a 5-minute time grouper.
# pd.Grouper(freq=...) is used because pd.TimeGrouper is not available in
# current pandas releases.
time_key = pd.Grouper(freq='5min')
resampled = df2.set_index('time').groupby(['key', time_key]).sum()
print(resampled)
print('--------------------------------------')

# Flatten the (key, time) index back into ordinary columns.
print(resampled.reset_index())
print('--------------------------------------')

                  time  values
0  2017-05-20 00:00:00       0
1  2017-05-20 00:01:00       1
2  2017-05-20 00:02:00       2
3  2017-05-20 00:03:00       3
4  2017-05-20 00:04:00       4
5  2017-05-20 00:05:00       5
6  2017-05-20 00:06:00       6
7  2017-05-20 00:07:00       7
8  2017-05-20 00:08:00       8
9  2017-05-20 00:09:00       9
10 2017-05-20 00:10:00      10
11 2017-05-20 00:11:00      11
12 2017-05-20 00:12:00      12
13 2017-05-20 00:13:00      13
14 2017-05-20 00:14:00      14
--------------------------------------
                     values
time                       
2017-05-20 00:00:00       5
2017-05-20 00:05:00       5
2017-05-20 00:10:00       5
--------------------------------------
                 time key  value
0 2017-05-20 00:00:00   a      0
1 2017-05-20 00:00:00   b      1
2 2017-05-20 00:00:00   c      2
3 2017-05-20 00:01:00   a      3
4 2017-05-20 00:01:00   b      4
5 2017-05-20 00:01:00   c      5
6 2017-05-20 00:02:00   a      6
--------------------------------------

### 12.3 메서드 연결 기법

In [30]:
# df.assign() - 컬럼에 값을 대입하는 함수, 객체 변경하는 대신 값 대입이 완료된 새로운 df 반환
# df2 = df.copy()
# df2['k'] = v

### 동일

# df2 = df.assign(k=v)

#### 12.3.1 pipe 메서드

In [None]:
# a = f(df, arg1 = v1)
# b = g(a, v2, arg3 = v3)
# c = h(b, arg4 = v4)

### 동일

# result = (df.pipe(f, arg1=v1).pipe(g, v2, arg3=v3).pipe(h, arg4=v4))