# 그룹연산과 변형

In [1]:
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [94]:
# 실습 예제1
df = pd.DataFrame({
    'city': ['부산', '부산', '부산', '부산', '서울', '서울', '서울'],
    'fruits': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})

In [3]:
# 실습 예제2
tips = pd.read_csv('examples/tips.csv')
tips = tips.sample(10)
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size
68,20.23,2.01,No,Sat,Dinner,2
175,32.9,3.11,Yes,Sun,Dinner,2
77,27.2,4.0,No,Thur,Lunch,4
187,30.46,2.0,Yes,Sun,Dinner,5
227,20.45,3.0,No,Sat,Dinner,4
89,21.16,3.0,No,Thur,Lunch,2
198,13.0,2.0,Yes,Thur,Lunch,2
148,9.78,1.73,No,Thur,Lunch,2
159,16.49,2.0,No,Sun,Dinner,4
197,43.11,5.0,Yes,Thur,Lunch,4


## 그룹별 연산과 변형
* groupby()
* transform() : 스칼라 값이나 같은 크기를 가지는 배열을 반환하는 함수
* apply() : pandas 객체나 스칼라 값을 반환하는 함수
* apply : 분리 - 적용 - 병합
* 그룹 색인 생략하기
* 변위치 분석과 버킷 분석
* 그룹에 국한된 값으로 누락값 채우기

### add_prefix()

In [9]:
tip_means = tips.groupby('time').mean().add_prefix('mean_')
tip_means

Unnamed: 0_level_0,mean_total_bill,mean_tip,mean_size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,24.106,2.424,3.4
Lunch,22.85,3.146,2.8


### groupby()

In [95]:
df

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,orange,200,2
2,부산,banana,250,3
3,부산,banana,300,4
4,서울,apple,150,5
5,서울,apple,200,6
6,서울,banana,400,7


In [96]:
df.groupby('city').mean()

Unnamed: 0_level_0,price,quantity
city,Unnamed: 1_level_1,Unnamed: 2_level_1
부산,212.5,2.5
서울,250.0,6.0


In [97]:
df.groupby(['city','fruits']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,price,quantity
city,fruits,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,apple,100.0,1.0
부산,banana,275.0,3.5
부산,orange,200.0,2.0
서울,apple,175.0,5.5
서울,banana,400.0,7.0


In [100]:
# 'as_index=False' - groupby()의 결과값을 인덱스가 아닌 컬럼으로 반환
df.groupby(['city','fruits'], as_index=False).mean()

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100.0,1.0
1,부산,banana,275.0,3.5
2,부산,orange,200.0,2.0
3,서울,apple,175.0,5.5
4,서울,banana,400.0,7.0


In [103]:
# groupby() 내의 데이터 확인
df.groupby(['city','fruits']).get_group(('부산','banana'))

Unnamed: 0,city,fruits,price,quantity
2,부산,banana,250,3
3,부산,banana,300,4


In [104]:
# 그룹별 갯수를 반환
df.groupby(['city','fruits']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,price,quantity
city,fruits,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,apple,1,1
부산,banana,2,2
부산,orange,1,1
서울,apple,2,2
서울,banana,1,1


### groupby().agg(), groupby().aggregate()

In [106]:
df.groupby(['city']).agg(np.mean)

Unnamed: 0_level_0,price,quantity
city,Unnamed: 1_level_1,Unnamed: 2_level_1
부산,212.5,2.5
서울,250.0,6.0


In [107]:
def my_mean(s):
    return np.mean(s)
df.groupby('city').agg([my_mean, np.min, 'max'])

Unnamed: 0_level_0,price,price,price,quantity,quantity,quantity
Unnamed: 0_level_1,my_mean,amin,max,my_mean,amin,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
부산,212.5,100,300,2.5,1,4
서울,250.0,150,400,6.0,5,7


In [109]:
def my_mean(s):
    return np.mean(s)
df.groupby('city').agg({'price':my_mean, 'quantity':np.min})

Unnamed: 0_level_0,price,quantity
city,Unnamed: 1_level_1,Unnamed: 2_level_1
부산,212.5,1
서울,250.0,5


In [110]:
df

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,orange,200,2
2,부산,banana,250,3
3,부산,banana,300,4
4,서울,apple,150,5
5,서울,apple,200,6
6,서울,banana,400,7


### groupby().filter()

In [117]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])

def filter_func(x):
    return x['data2'].std() > 4

display('df', "df.groupby('key').std()", "df.groupby('key').filter(filter_func)")


Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


### groupby().transform()
transform()은 들어온 메서드에 대해 각 원소를 살리고 그 안에 연산결과를 채운다.

In [129]:
tips.groupby('day').transform(np.mean)

Unnamed: 0,total_bill,tip,size,pct
68,20.34,2.505,3.0,12.302833
175,26.616667,2.37,3.666667,9.382479
77,22.85,3.146,2.8,14.711118
187,26.616667,2.37,3.666667,9.382479
227,20.34,2.505,3.0,12.302833
89,22.85,3.146,2.8,14.711118
198,22.85,3.146,2.8,14.711118
148,22.85,3.146,2.8,14.711118
159,26.616667,2.37,3.666667,9.382479
197,22.85,3.146,2.8,14.711118


In [119]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])

display('df',"df.groupby('key').mean()",'df.groupby("key").transform(lambda x: x - x.mean())')

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,4.0
B,2.5,3.5
C,3.5,6.0

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [120]:
df.groupby('key').mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,4.0
B,2.5,3.5
C,3.5,6.0


In [123]:
display('df',"df.groupby('key').mean()")

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,4.0
B,2.5,3.5
C,3.5,6.0


In [124]:
df.groupby('key')['data2'].transform(np.mean)

0    4.0
1    3.5
2    6.0
3    4.0
4    3.5
5    6.0
Name: data2, dtype: float64

### groupby().apply()
apply() 함수는 그룹별로 정리된 결과값으로 출력된다.

In [12]:
# tips.groupby('day').transform(np.mean) # transform()과 apply() 비교
tips.groupby('day').apply(np.mean)

Unnamed: 0_level_0,total_bill,tip,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sat,20.34,2.505,3.0
Sun,26.616667,2.37,3.666667
Thur,22.85,3.146,2.8


In [16]:
tips['pct'] = tips['tip']/tips['total_bill']*100
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
68,20.23,2.01,No,Sat,Dinner,2,9.935739
175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
77,27.2,4.0,No,Thur,Lunch,4,14.705882
187,30.46,2.0,Yes,Sun,Dinner,5,6.565988
227,20.45,3.0,No,Sat,Dinner,4,14.669927
89,21.16,3.0,No,Thur,Lunch,2,14.177694
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615
148,9.78,1.73,No,Thur,Lunch,2,17.689162
159,16.49,2.0,No,Sun,Dinner,4,12.128563
197,43.11,5.0,Yes,Thur,Lunch,4,11.598237


In [20]:
def top(df, n=3, column='pct'):
    return df.sort_values(by=column)[-n:]

top(tips, n=3)

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
77,27.2,4.0,No,Thur,Lunch,4,14.705882
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615
148,9.78,1.73,No,Thur,Lunch,2,17.689162


In [18]:
tips.sort_values(by='pct')[-6:]

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
159,16.49,2.0,No,Sun,Dinner,4,12.128563
89,21.16,3.0,No,Thur,Lunch,2,14.177694
227,20.45,3.0,No,Sat,Dinner,4,14.669927
77,27.2,4.0,No,Thur,Lunch,4,14.705882
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615
148,9.78,1.73,No,Thur,Lunch,2,17.689162


In [21]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,227,20.45,3.0,No,Sat,Dinner,4,14.669927
No,77,27.2,4.0,No,Thur,Lunch,4,14.705882
No,148,9.78,1.73,No,Thur,Lunch,2,17.689162
Yes,175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
Yes,197,43.11,5.0,Yes,Thur,Lunch,4,11.598237
Yes,198,13.0,2.0,Yes,Thur,Lunch,2,15.384615


In [24]:
tips.groupby('smoker').apply(top, n=3, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,227,20.45,3.0,No,Sat,Dinner,4,14.669927
No,89,21.16,3.0,No,Thur,Lunch,2,14.177694
No,77,27.2,4.0,No,Thur,Lunch,4,14.705882
Yes,187,30.46,2.0,Yes,Sun,Dinner,5,6.565988
Yes,175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
Yes,197,43.11,5.0,Yes,Thur,Lunch,4,11.598237


In [2]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])

def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x

display('df', "df.groupby('key').apply(norm_by_data2)")

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0,key,data1,data2
0,A,0.0,5
1,B,0.142857,0
2,C,0.166667,3
3,A,0.375,3
4,B,0.571429,7
5,C,0.416667,9


In [7]:
df = pd.DataFrame({
    'city': ['부산', '부산', '부산', '부산', '서울', '서울', '서울'],
    'fruits': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
result = df.groupby(['city','fruits'], as_index=False).apply(lambda x: (x.price * x.quantity).sum())
display('df','result')

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,orange,200,2
2,부산,banana,250,3
3,부산,banana,300,4
4,서울,apple,150,5
5,서울,apple,200,6
6,서울,banana,400,7

Unnamed: 0,city,fruits,NaN
0,부산,apple,100
1,부산,banana,1950
2,부산,orange,400
3,서울,apple,1950
4,서울,banana,2800


In [13]:
result1 = df.groupby(['city','fruits'], as_index=False).sum()
result = df.groupby(['city','fruits'], as_index=False).sum().pivot('city','fruits')
display('df','result1','result')

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,orange,200,2
2,부산,banana,250,3
3,부산,banana,300,4
4,서울,apple,150,5
5,서울,apple,200,6
6,서울,banana,400,7

Unnamed: 0,city,fruits,price,quantity
0,부산,apple,100,1
1,부산,banana,550,7
2,부산,orange,200,2
3,서울,apple,350,11
4,서울,banana,400,7

Unnamed: 0_level_0,price,price,price,quantity,quantity,quantity
fruits,apple,banana,orange,apple,banana,orange
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
부산,100.0,550.0,200.0,1.0,7.0,2.0
서울,350.0,400.0,,11.0,7.0,


In [14]:
# 별도의 그룹 리스트(L)를 이용하여 DataFrame 조회 
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])

L = [0, 1, 0, 1, 2, 0]
display('df', 'df.groupby(L).sum()')

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [17]:
# DataFrame의 인덱스를 dict와 결합하여 새롭게 그룹화하여 조회
df2 = df.set_index('key')
mapping = {'A':'vowel', 'B':'constant', 'C':'constant'}
df3 = df2.groupby(mapping).sum()
display('df2', 'df3')

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0,data1,data2
constant,12,19
vowel,3,8


In [19]:
df3 = df2.groupby(str.lower).mean()
display('df2','df3')

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


In [23]:
mapping = {'A':'vowel', 'B':'constant', 'C':'constant'}
df3 = df2.groupby([str.lower, mapping]).mean()
display('df2','df3')

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0,Unnamed: 1,data1,data2
a,vowel,1.5,4.0
b,constant,2.5,3.5
c,constant,3.5,6.0


In [32]:
import seaborn as sns
planets = sns.load_dataset('planets')

decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'
result = planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)
display('planets','result')

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


## 그룹함수 적용의 색인 없애기
groupby() + apply() 연산의 결과값을 보면 index와 결과값안의 데이터가 중복된다. index의 출력여부를 group_keys로 설정한다.

In [25]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
227,20.45,3.0,No,Sat,Dinner,4,14.669927
77,27.2,4.0,No,Thur,Lunch,4,14.705882
148,9.78,1.73,No,Thur,Lunch,2,17.689162
175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
197,43.11,5.0,Yes,Thur,Lunch,4,11.598237
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615


In [27]:
tips.groupby('smoker', group_keys =False).apply(top).describe()

Unnamed: 0,total_bill,tip,size,pct
count,6.0,6.0,6.0,6.0
mean,24.406667,3.14,3.0,13.916785
std,12.565514,1.225349,1.095445,2.928388
min,9.78,1.73,2.0,9.452888
25%,14.8625,2.25,2.0,12.366159
50%,23.825,3.055,3.0,14.687905
75%,31.475,3.7775,4.0,15.214932
max,43.11,5.0,4.0,17.689162


## 변위치 분석과 버킷 분석
cut(), qcut() 메서드를 이용하여 선택한 크기만큼 혹은 표본 변위치에 따라 데이터 분리하고 groupby()와 조합하여 데이터 묶음에 대한 변위치 분석이나 버킷 분석

cut(), qcut() 계열의 매서드는 기본적으로 설정된 컬럼에 대한 sorting(정렬)을 진행한다.

### cut()
지정된 컬럼의 최소값과 최대값을 등분위로 나누는 메소드

In [30]:
factor_cut = pd.cut(tips['tip'], 4)
factor_cut

68     (1.727, 2.548]
175    (2.548, 3.365]
77     (3.365, 4.182]
187    (1.727, 2.548]
227    (2.548, 3.365]
89     (2.548, 3.365]
198    (1.727, 2.548]
148    (1.727, 2.548]
159    (1.727, 2.548]
197      (4.182, 5.0]
Name: tip, dtype: category
Categories (4, interval[float64]): [(1.727, 2.548] < (2.548, 3.365] < (3.365, 4.182] < (4.182, 5.0]]

In [32]:
tips.groupby(factor_cut).agg(['size','mean'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size,pct,pct
Unnamed: 0_level_1,size,mean,size,mean,size,mean,size,mean
tip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(1.727, 2.548]",5,17.992,5,1.948,5,3.0,5,12.340813
"(2.548, 3.365]",3,24.836667,3,3.036667,3,2.666667,3,12.766836
"(3.365, 4.182]",1,27.2,1,4.0,1,4.0,1,14.705882
"(4.182, 5.0]",1,43.11,1,5.0,1,4.0,1,11.598237


### qcut()
주어진 데이터의 갯수를 n등분하는 메서드

In [52]:
factor_qcut = pd.qcut(tips.total_bill, 5, labels=False)
factor_qcut
factor_qcut.value_counts()

0    2
1    2
2    2
3    2
4    2
Name: total_bill, dtype: int64

In [56]:
factor_qcut = pd.qcut(tips.total_bill, 5, labels=False)
factor_qcut
# factor_qcut.value_counts()

68     1
175    4
77     3
187    3
227    2
89     2
198    0
148    0
159    1
197    4
Name: total_bill, dtype: int64

In [57]:
tips.groupby(factor_qcut).agg(['size','mean'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size,pct,pct
Unnamed: 0_level_1,size,mean,size,mean,size,mean,size,mean
total_bill,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2,11.39,2,1.865,2,2.0,2,16.536888
1,2,18.36,2,2.005,2,3.0,2,11.032151
2,2,20.805,2,3.0,2,3.0,2,14.42381
3,2,28.83,2,3.0,2,4.5,2,10.635935
4,2,38.005,2,4.055,2,3.0,2,10.525562


In [59]:
# tits.tip 에 같은 값이 있는 경우 qcut()의 동일한 개수를 반환하지 않는다.
factor_qcut = pd.qcut(tips.tip, 5, labels=False)
tips.groupby(factor_qcut).agg(['size','mean'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size,pct,pct
Unnamed: 0_level_1,size,mean,size,mean,size,mean,size,mean
tip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,4,17.4325,4,1.9325,4,3.25,4,12.942082
2,3,20.613333,3,2.67,3,2.666667,3,12.927786
3,1,32.9,1,3.11,1,2.0,1,9.452888
4,2,35.155,2,4.5,2,4.0,2,13.15206


## 그룹에 국한된 값으로 누락값 채우기

### fillna()

In [60]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    0.788156
2         NaN
3    1.075705
4         NaN
5   -0.420205
dtype: float64

In [61]:
# s의 평균으로 Null 값을 대체
s.fillna(s.mean())

0    0.481218
1    0.788156
2    0.481218
3    1.075705
4    0.481218
5   -0.420205
dtype: float64

In [65]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.686776
New York     -1.067945
Vermont            NaN
Florida      -0.222128
Oregon       -1.047117
Nevada             NaN
California   -0.758808
Idaho              NaN
dtype: float64

In [66]:
data.groupby(group_key).mean()

East   -0.658950
West   -0.902963
dtype: float64

In [74]:
fill_mean = lambda x: x.fillna(x.mean())
data.groupby(group_key).apply(fill_mean)
# data.groupby(group_key).transform(fill_mean) # 위와 동일

Ohio         -0.686776
New York     -1.067945
Vermont      -0.658950
Florida      -0.222128
Oregon       -1.047117
Nevada       -0.902963
California   -0.758808
Idaho        -0.902963
dtype: float64

In [75]:
data.groupby(group_key).transform(np.mean)

Ohio         -0.658950
New York     -0.658950
Vermont      -0.658950
Florida      -0.658950
Oregon       -0.902963
Nevada       -0.902963
California   -0.902963
Idaho        -0.902963
dtype: float64

In [76]:
# 그룹에 따라 정의된 값 채우기
fill_values = {'East':0.5, 'West':-1}
fill_func = lambda x: x.fillna(fill_values[x.name])
data.groupby(group_key).apply(fill_func)

Ohio         -0.686776
New York     -1.067945
Vermont       0.500000
Florida      -0.222128
Oregon       -1.047117
Nevada       -1.000000
California   -0.758808
Idaho        -1.000000
dtype: float64

## 피벗테이블과 교차 일람표


### 교차 일람표 - pivot_table()

In [77]:
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
68,20.23,2.01,No,Sat,Dinner,2,9.935739
175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
77,27.2,4.0,No,Thur,Lunch,4,14.705882
187,30.46,2.0,Yes,Sun,Dinner,5,6.565988
227,20.45,3.0,No,Sat,Dinner,4,14.669927
89,21.16,3.0,No,Thur,Lunch,2,14.177694
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615
148,9.78,1.73,No,Thur,Lunch,2,17.689162
159,16.49,2.0,No,Sun,Dinner,4,12.128563
197,43.11,5.0,Yes,Thur,Lunch,4,11.598237


In [78]:
# tips.groupby(['day','smoker']).mean()  # 아래와 동일
tips.pivot_table(index=['day','smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,size,tip,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sat,No,12.302833,3.0,2.505,20.34
Sun,No,12.128563,4.0,2.0,16.49
Sun,Yes,8.009438,3.5,2.555,31.68
Thur,No,15.524246,2.666667,2.91,19.38
Thur,Yes,13.491426,3.0,3.5,28.055


In [84]:
tips.pivot_table(['tip','size'], index=['day','smoker'], columns='time')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip,tip
Unnamed: 0_level_1,time,Dinner,Lunch,Dinner,Lunch
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Sat,No,3.0,,2.505,
Sun,No,4.0,,2.0,
Sun,Yes,3.5,,2.555,
Thur,No,,2.666667,,2.91
Thur,Yes,,3.0,,3.5


In [86]:
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,pct
68,20.23,2.01,No,Sat,Dinner,2,9.935739
175,32.9,3.11,Yes,Sun,Dinner,2,9.452888
77,27.2,4.0,No,Thur,Lunch,4,14.705882
187,30.46,2.0,Yes,Sun,Dinner,5,6.565988
227,20.45,3.0,No,Sat,Dinner,4,14.669927
89,21.16,3.0,No,Thur,Lunch,2,14.177694
198,13.0,2.0,Yes,Thur,Lunch,2,15.384615
148,9.78,1.73,No,Thur,Lunch,2,17.689162
159,16.49,2.0,No,Sun,Dinner,4,12.128563
197,43.11,5.0,Yes,Thur,Lunch,4,11.598237


In [89]:
tips.pivot_table(['tip','size'], index=['day','smoker'], columns='time', margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip,tip,tip
Unnamed: 0_level_1,time,Dinner,Lunch,All,Dinner,Lunch,All
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Sat,No,3.0,,3.0,2.505,,2.505
Sun,No,4.0,,4.0,2.0,,2.0
Sun,Yes,3.5,,3.5,2.555,,2.555
Thur,No,,2.666667,2.666667,,2.91,2.91
Thur,Yes,,3.0,3.0,,3.5,3.5
All,,3.4,2.8,3.1,2.424,3.146,2.785


In [90]:
tips.pivot_table(['tip','size'], index=['day','smoker'], columns='time', margins=True,fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip,tip,tip
Unnamed: 0_level_1,time,Dinner,Lunch,All,Dinner,Lunch,All
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Sat,No,3.0,0.0,3.0,2.505,0.0,2.505
Sun,No,4.0,0.0,4.0,2.0,0.0,2.0
Sun,Yes,3.5,0.0,3.5,2.555,0.0,2.555
Thur,No,0.0,2.666667,2.666667,0.0,2.91,2.91
Thur,Yes,0.0,3.0,3.0,0.0,3.5,3.5
All,,3.4,2.8,3.1,2.424,3.146,2.785


In [91]:
tips.pivot_table(['tip','size'], index=['day','smoker'], columns='time', margins=True,fill_value=0, aggfunc='sum')


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip,tip,tip
Unnamed: 0_level_1,time,Dinner,Lunch,All,Dinner,Lunch,All
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Sat,No,6,0,6,5.01,0.0,5.01
Sun,No,4,0,4,2.0,0.0,2.0
Sun,Yes,7,0,7,5.11,0.0,5.11
Thur,No,0,8,8,0.0,8.73,8.73
Thur,Yes,0,6,6,0.0,7.0,7.0
All,,17,14,31,12.12,15.73,27.85


### 교차 일람표 - crosstab()
그룹 빈도를 개산하기 위한 피벗 테이블의 특수한 경우

In [93]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Sat,2,0,2
Dinner,Sun,1,2,3
Lunch,Thur,3,2,5
All,,6,4,10


### merge()

In [41]:
 file_path='examples/drinks.csv'
 drinks = pd.read_csv(file_path)
 drinks['continent'] = drinks['continent'].fillna('OT')

 result = drinks.groupby('continent').mean()['wine_servings']
 df = result.to_frame().reset_index()
 df = df.rename(columns={'wine_servings':'wine_servings_cont_avg'})
 df = pd.merge(drinks, df, on='continent', how='outer')
 display('drinks','df')

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA
189,Vietnam,111,2,1,2.0,AS
190,Yemen,6,0,0,0.1,AS
191,Zambia,32,19,4,2.5,AF

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,wine_servings_cont_avg
0,Afghanistan,0,0,0,0.0,AS,9.068182
1,Bahrain,42,63,7,2.0,AS,9.068182
2,Bangladesh,0,0,0,0.0,AS,9.068182
3,Bhutan,23,0,0,0.4,AS,9.068182
4,Brunei,31,2,1,0.6,AS,9.068182
...,...,...,...,...,...,...,...
188,Samoa,105,18,24,2.6,OC,35.625000
189,Solomon Islands,56,11,1,1.2,OC,35.625000
190,Tonga,36,21,5,1.1,OC,35.625000
191,Tuvalu,6,41,9,1.0,OC,35.625000


In [39]:
# 위의 전체 과정을 한줄로 바꿀수 있음
drinks['wine_servings_cont_avg'] = drinks.groupby('continent')['wine_servings'].transform(np.mean)
drinks

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,wine_servings_cont_avg
0,Afghanistan,0,0,0,0.0,AS,9.068182
1,Albania,89,132,54,4.9,EU,142.222222
2,Algeria,25,0,14,0.7,AF,16.264151
3,Andorra,245,138,312,12.4,EU,142.222222
4,Angola,217,57,45,5.9,AF,16.264151
...,...,...,...,...,...,...,...
188,Venezuela,333,100,3,7.7,SA,62.416667
189,Vietnam,111,2,1,2.0,AS,9.068182
190,Yemen,6,0,0,0.1,AS,9.068182
191,Zambia,32,19,4,2.5,AF,16.264151
