## 10章 groupby による分割-適用-結合

### 集約

In [2]:
import pandas as pd
# Load the tab-separated Gapminder file and preview its first rows.
df = pd.read_csv(
    "./data/gapminder.tsv",
    sep="\t",
)
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314
1,Afghanistan,Asia,1957.0,30.332,9240934.0,820.85303
2,Afghanistan,Asia,1962.0,31.997,10267083.0,853.10071
3,Afghanistan,Asia,1967.0,34.02,11537966.0,836.197138
4,Afghanistan,Asia,1972.0,36.088,13079460.0,739.981106


In [4]:
# Aggregate by year: mean of the lifeExp column within each year group.
(
    df
    .groupby("year")["lifeExp"]
    .mean()
)

year
1952.0    49.057620
1957.0    51.507401
1962.0    53.609249
1967.0    55.678290
1972.0    57.647386
1977.0    59.570157
1982.0    61.533197
1987.0    63.212613
1992.0    64.160338
1997.0    65.014676
2002.0    65.694923
2007.0    67.007423
Name: lifeExp, dtype: float64

In [7]:
# Several summary statistics at once: describe() the lifeExp column
# separately for each continent group.
(
    df
    .groupby("continent")["lifeExp"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


### 集約関数

In [9]:
import numpy as np
# Mean lifeExp per continent via agg(). The reducer is given by its pandas
# name ("mean") rather than as np.mean: recent pandas versions emit a
# FutureWarning when a NumPy reducer is passed to agg(), and recommend the
# string alias to keep the current behaviour.
# Only the last expression of a cell is displayed, so this first result is
# computed but not shown.
df.groupby("continent")["lifeExp"].agg("mean")
# Same as above: aggregate() is the long-form spelling of agg().
df.groupby("continent")["lifeExp"].aggregate("mean")

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

### 自作関数の利用

In [10]:
def my_mean(val):
    """Return the arithmetic mean of the values in ``val``.

    ``val`` must support ``len()`` and iteration (for example a list or a
    pandas Series). The values are added one by one, starting from 0, and
    the total is divided by the number of elements.
    """
    count = len(val)
    # Named ``total`` so the builtin ``sum`` is not shadowed.
    total = 0
    for item in val:
        total = total + item
    average = total / count
    return average

In [11]:
# Apply the hand-written my_mean to the lifeExp values of each continent.
(
    df
    .groupby("continent")["lifeExp"]
    .agg(my_mean)
)

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [13]:
def my_mean_diff(values, diff_value):
    """Return the mean of ``values`` minus ``diff_value``.

    ``values`` must support ``len()`` and iteration; ``diff_value`` is the
    reference value subtracted from the computed mean.
    """
    count = len(values)
    # Accumulate manually, starting from 0, then divide by the element count.
    total = 0
    for item in values:
        total = total + item
    average = total / count
    difference = average - diff_value
    return difference

In [14]:
# Overall mean of lifeExp across all rows. Series.mean() is used directly:
# passing np.mean to Series.agg() triggers a FutureWarning in recent pandas
# versions and yields the same number.
g_mean = df["lifeExp"].mean()
print(g_mean)
# Extra keyword arguments given to agg() are forwarded to the function, so
# each year's values are passed to my_mean_diff together with
# diff_value=g_mean.
df.groupby("year")["lifeExp"].agg(my_mean_diff, diff_value=g_mean)

59.47443936619713


year
1952.0   -10.416820
1957.0    -7.967038
1962.0    -5.865190
1967.0    -3.796150
1972.0    -1.827053
1977.0     0.095718
1982.0     2.058758
1987.0     3.738173
1992.0     4.685899
1997.0     5.540237
2002.0     6.220483
2007.0     7.532983
Name: lifeExp, dtype: float64

### 複数の関数を同時に計算

In [16]:
# Compute several statistics per year in one call by passing a list to agg().
# "mean" and "std" are given as pandas string aliases instead of np.mean /
# np.std: recent pandas versions warn (FutureWarning) that NumPy reducers
# passed to agg() will stop being redirected to the pandas implementations,
# which for np.std would change the result (NumPy's default is ddof=0, the
# pandas "std" uses ddof=1). The resulting column labels are unchanged
# (count_nonzero, mean, std).
df.groupby("year")["lifeExp"].agg([np.count_nonzero, "mean", "std"])

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952.0,142.0,49.05762,12.225956
1957.0,142.0,51.507401,12.231286
1962.0,142.0,53.609249,12.097245
1967.0,142.0,55.67829,11.718858
1972.0,142.0,57.647386,11.381953
1977.0,142.0,59.570157,11.227229
1982.0,142.0,61.533197,10.770618
1987.0,142.0,63.212613,10.556285
1992.0,142.0,64.160338,11.22738
1997.0,142.0,65.014676,11.559439


### 変換
集約（agg）と異なり、変換は入力の各要素に対して一対一に対応するデータを返す。

In [22]:
# Standard score (z-score)
def my_zscore(x):
    """Return ``x`` centred on its mean and divided by its standard deviation.

    ``x`` must provide ``mean()`` and ``std()`` methods and support
    element-wise subtraction and division (for example a pandas Series).
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread