# pandasライブラリ活用入門 

## 第1章 DataFrameの基礎

In [2]:
from pathlib import Path

import pandas as pd

In [10]:
# Load the tab-separated Gapminder file into a DataFrame.
df = pd.read_csv(
    "./data/gapminder.tsv",
    sep="\t",
)

In [11]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314
1,Afghanistan,Asia,1957.0,30.332,9240934.0,820.85303
2,Afghanistan,Asia,1962.0,31.997,10267083.0,853.10071
3,Afghanistan,Asia,1967.0,34.02,11537966.0,836.197138
4,Afghanistan,Asia,1972.0,36.088,13079460.0,739.981106


### 行を絞り込む

In [13]:
# .loc selects by index label; the row labelled 0 comes back as a Series.
first_row = df.loc[0]
first_row

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop          8.42533e+06
gdpPercap        779.445
Name: 0, dtype: object

### グループごとの平均値

In [14]:
# Average life expectancy for each year.
life_exp_by_year = df.groupby("year")["lifeExp"]
life_exp_by_year.mean()

year
1952.0    49.057620
1957.0    51.507401
1962.0    53.609249
1967.0    55.678290
1972.0    57.647386
1977.0    59.570157
1982.0    61.533197
1987.0    63.212613
1992.0    64.160338
1997.0    65.014676
2002.0    65.694923
2007.0    67.007423
Name: lifeExp, dtype: float64

In [18]:
# Group by several keys at once and average more than one column.
group_keys = ["year", "continent"]
value_columns = ["lifeExp", "gdpPercap"]
gp = df.groupby(group_keys)[value_columns].mean()
gp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952.0,Africa,39.1355,1252.572466
1952.0,Americas,53.27984,4079.062552
1952.0,Asia,46.314394,5195.484004
1952.0,Europe,64.4085,5661.057435
1952.0,Oceania,69.255,10298.08565


In [20]:
# reset_index() turns the index levels of the grouped result back into
# ordinary columns, which flattens the hierarchical index.
flat = gp.reset_index(drop=False)
flat.head()

Unnamed: 0,year,continent,lifeExp,gdpPercap
0,1952.0,Africa,39.1355,1252.572466
1,1952.0,Americas,53.27984,4079.062552
2,1952.0,Asia,46.314394,5195.484004
3,1952.0,Europe,64.4085,5661.057435
4,1952.0,Oceania,69.255,10298.08565


### グループごとの度数、頻度

In [31]:
# Frequencies of the values in a Series, computed per continent.
countries_by_continent = df.groupby('continent')["country"]
# Number of distinct countries in each continent (duplicates collapsed).
print(countries_by_continent.nunique())
# Number of rows for each country within each continent (duplicates counted).
print(countries_by_continent.value_counts())

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64
continent  country                 
Africa     Algeria                     12
           Angola                      12
           Benin                       12
           Botswana                    12
           Burkina Faso                12
           Burundi                     12
           Cameroon                    12
           Central African Republic    12
           Chad                        12
           Comoros                     12
           Congo, Dem. Rep.            12
           Congo, Rep.                 12
           Cote d'Ivoire               12
           Djibouti                    12
           Egypt                       12
           Equatorial Guinea           12
           Eritrea                     12
           Ethiopia                    12
           Gabon                       12
           Gambia                      12
           

## 第2章 pandasのデータ構造

In [34]:
# Build a Series from a list; the mixed str/int values are held with dtype object.
s = pd.Series(data=["banana", 42])
s

0    banana
1        42
dtype: object

In [36]:
# Build a DataFrame from a dict that maps column names to lists of values.
df = pd.DataFrame(
    data={
        "name": ["aaa", "bbb"],
        "year": [1, 2],
    }
)
df

Unnamed: 0,name,year
0,aaa,1
1,bbb,2


## pickle(ピクル)
Pythonではデータをpickleで永続化できる。データをシリアライズし、バイナリフォーマットで保存する。pickleデータの読み出しには後方互換性がある。pickleは一連の計算処理の中間ファイルや、Python環境のみに閉じる場合に有効なファイル形式である。

In [38]:
# Serialize the DataFrame to a binary pickle file.
# to_pickle() does not create missing parent directories, so make sure the
# output directory exists first; otherwise this cell fails on a fresh checkout.
Path("./output").mkdir(parents=True, exist_ok=True)
df.to_pickle("./output/test.pickle")

In [39]:
# Load the DataFrame back from the pickle file and display it.
# NOTE: only unpickle files from trusted sources; loading a pickle can
# execute arbitrary code.
df = pd.read_pickle(filepath_or_buffer="./output/test.pickle")
df

Unnamed: 0,name,year
0,aaa,1
1,bbb,2
