# Numpy と Pandas を用いた演算

In [1]:
# 数値計算やデータフレーム操作に関するライブラリをインポートする
import numpy as np
import pandas as pd

## まずは Pandas の基本操作から

In [2]:
# Sample frame: four labelled rows (i1..i4) by six labelled columns (a..f).
df = pd.DataFrame(
    data=[
        [1, 4, 7, 10, 13, 16],
        [2, 5, 8, 11, 14, 17],
        [3, 6, 9, 12, 15, 18],
        [21, 24, 27, 20, 23, 26],
    ],
    index=['i1', 'i2', 'i3', 'i4'],
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)

In [3]:
df

Unnamed: 0,a,b,c,d,e,f
i1,1,4,7,10,13,16
i2,2,5,8,11,14,17
i3,3,6,9,12,15,18
i4,21,24,27,20,23,26


In [4]:
# Select a row by its index label (.ix was removed in pandas 1.0; .loc is the label-based indexer).
df.loc['i1']

a     1
b     4
c     7
d    10
e    13
f    16
Name: i1, dtype: int64

In [5]:
# Select a row by its integer position (.ix was removed in pandas 1.0; .iloc is the position-based indexer).
df.iloc[1]

a     2
b     5
c     8
d    11
e    14
f    17
Name: i2, dtype: int64

In [6]:
# Select rows by integer position with a slice (.iloc replaces the removed .ix).
df.iloc[1:]  # everything from row 1 onwards (zero-based)

Unnamed: 0,a,b,c,d,e,f
i2,2,5,8,11,14,17
i3,3,6,9,12,15,18
i4,21,24,27,20,23,26


In [7]:
# Select rows by integer position with a slice (.iloc replaces the removed .ix).
df.iloc[:1]  # everything before row 1 (zero-based)

Unnamed: 0,a,b,c,d,e,f
i1,1,4,7,10,13,16


In [8]:
# Select rows by integer position with a slice (.iloc replaces the removed .ix).
df.iloc[1:3]  # from row 1 up to, but not including, row 3 (zero-based)

Unnamed: 0,a,b,c,d,e,f
i2,2,5,8,11,14,17
i3,3,6,9,12,15,18


In [9]:
# The first argument selects the row and the second selects the column (both by label).
df.loc['i3', 'b']

6

In [10]:
# A bare ":" means "select everything" along that axis.
df.loc[:, 'a']

i1     1
i2     2
i3     3
i4    21
Name: a, dtype: int64

In [11]:
# Several rows/columns can be selected at once, and they need not be contiguous.
# .loc is label-based, so the row positions are translated to labels through df.index.
df.loc[df.index[[1, 3]], ['b', 'd']]

Unnamed: 0,b,d
i2,5,11
i4,24,20


In [12]:
# Column access is done by passing the column name in square brackets.
df['a']

i1     1
i2     2
i3     3
i4    21
Name: a, dtype: int64

In [13]:
# Get the column's values as a NumPy array.
df['a'].values

array([ 1,  2,  3, 21])

In [14]:
# Indexing the selected column again with an index label returns the single value.
df['a']['i3']

3

In [15]:
# Treat the DataFrame as a table and read a value explicitly by (row position, column position).
df.iloc[2,3]

12

In [16]:
# Reads the same cell (row position 2, column position 3) through the positional scalar accessor;
# .ix was removed in pandas 1.0.
df.iat[2, 3]

12

In [17]:
# Get a single row by position.
df.iloc[2]

a     3
b     6
c     9
d    12
e    15
f    18
Name: i3, dtype: int64

In [18]:
# Get several columns by position (all rows, column positions 2 and 3).
df.iloc[:, 2:4]

Unnamed: 0,c,d
i1,7,10
i2,8,11
i3,9,12
i4,27,20


## Numpy で生成した乱数を Pandas で使う

In [19]:
# 4x5 frame of random integers drawn from [0, 10).
df1 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(4, 5)))

In [20]:
df1

Unnamed: 0,0,1,2,3,4
0,1,2,9,4,4
1,2,9,8,5,4
2,8,5,0,1,2
3,9,1,4,1,9


In [21]:
# Keep only the elements that satisfy the condition (the others show up as NaN).
df1[df1>1]

Unnamed: 0,0,1,2,3,4
0,,2.0,9.0,4.0,4
1,2.0,9.0,8.0,5.0,4
2,8.0,5.0,,,2
3,9.0,,4.0,,9


In [22]:
# Assign -1 to every element that satisfies the condition.
df1[df1>5] = -1

In [23]:
df1

Unnamed: 0,0,1,2,3,4
0,1,2,-1,4,4
1,2,-1,-1,5,4
2,-1,5,0,1,2
3,-1,1,4,1,-1


## 欠損値を含むデータを取り扱う

In [24]:
# Build random data containing missing values (NaN):
# draw integers from [0, 10), then blank out every cell that is not strictly positive.
df2 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(8, 7)))
df2 = df2.where(df2 > 0)

In [25]:
df2

Unnamed: 0,0,1,2,3,4,5,6
0,8,4.0,1,8.0,5.0,1.0,2
1,1,,9,4.0,2.0,,3
2,8,4.0,9,3.0,5.0,2.0,2
3,5,7.0,6,,6.0,2.0,8
4,6,6.0,5,1.0,4.0,2.0,4
5,4,3.0,9,8.0,,4.0,7
6,3,,1,9.0,,9.0,7
7,3,7.0,6,6.0,2.0,2.0,2


In [26]:
# Drop the rows that contain NaN.
df2.dropna()

Unnamed: 0,0,1,2,3,4,5,6
0,8,4,1,8,5,1,2
2,8,4,9,3,5,2,2
4,6,6,5,1,4,2,4
7,3,7,6,6,2,2,2


In [27]:
# Drop the columns that contain NaN.
df2.dropna(axis=1)

Unnamed: 0,0,2,6
0,8,1,2
1,1,9,3
2,8,9,2
3,5,6,8
4,6,5,4
5,4,9,7
6,3,1,7
7,3,6,2


In [28]:
# Fill NaN with a constant value.
df2.fillna(-1)

Unnamed: 0,0,1,2,3,4,5,6
0,8,4,1,8,5,1,2
1,1,-1,9,4,2,-1,3
2,8,4,9,3,5,2,2
3,5,7,6,-1,6,2,8
4,6,6,5,1,4,2,4
5,4,3,9,8,-1,4,7
6,3,-1,1,9,-1,9,7
7,3,7,6,6,2,2,2


In [29]:
# Fill each NaN with the previous valid value in the same column
# (fillna(method='pad') is deprecated; ffill() is the direct replacement).
df2.ffill()

Unnamed: 0,0,1,2,3,4,5,6
0,8,4,1,8,5,1,2
1,1,4,9,4,2,1,3
2,8,4,9,3,5,2,2
3,5,7,6,3,6,2,8
4,6,6,5,1,4,2,4
5,4,3,9,8,4,4,7
6,3,3,1,9,4,9,7
7,3,7,6,6,2,2,2


In [30]:
# Fill each NaN with the next valid value in the same column
# (fillna(method='bfill') is deprecated; bfill() is the direct replacement).
df2.bfill()

Unnamed: 0,0,1,2,3,4,5,6
0,8,4,1,8,5,1,2
1,1,4,9,4,2,2,3
2,8,4,9,3,5,2,2
3,5,7,6,1,6,2,8
4,6,6,5,1,4,2,4
5,4,3,9,8,2,4,7
6,3,7,1,9,2,9,7
7,3,7,6,6,2,2,2


In [31]:
# Fill each missing value by linear interpolation between its neighbours in the same column.
df2.interpolate()

Unnamed: 0,0,1,2,3,4,5,6
0,8,4,1,8,5.0,1.0,2
1,1,4,9,4,2.0,1.5,3
2,8,4,9,3,5.0,2.0,2
3,5,7,6,2,6.0,2.0,8
4,6,6,5,1,4.0,2.0,4
5,4,3,9,8,3.333333,4.0,7
6,3,5,1,9,2.666667,9.0,7
7,3,7,6,6,2.0,2.0,2


## 重複のあるデータを取り扱う

In [32]:
# 10x4 frame of random 0/1 values, so that duplicate rows are likely.
df3 = pd.DataFrame(data=np.random.randint(low=0, high=2, size=(10, 4)))

In [33]:
df3

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,1,1,0,0
2,0,1,0,1
3,1,0,0,0
4,0,1,0,0
5,1,1,1,1
6,0,0,0,1
7,0,0,0,0
8,1,0,0,0
9,1,0,1,0


In [34]:
# Check which rows are duplicates of an earlier row.
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
9    False
dtype: bool

In [35]:
# The duplicate check can be restricted to chosen columns (here the column labelled 0).
df3.duplicated(subset=[0])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [36]:
# Remove duplicate rows.
df3.drop_duplicates()

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,1,1,0,0
2,0,1,0,1
3,1,0,0,0
4,0,1,0,0
5,1,1,1,1
6,0,0,0,1
9,1,0,1,0


In [37]:
# Remove rows whose value in the chosen column (label 0) has already been seen.
df3.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,1,1,0,0


In [38]:
# Remove duplicates in the chosen column but keep the last occurrence
# (the old take_last=True keyword was removed; keep='last' replaces it).
df3.drop_duplicates(subset=[0], keep='last')

Unnamed: 0,0,1,2,3
7,0,0,0,0
9,1,0,1,0


## 行列演算を行う

In [39]:
# Two independent 2x2 matrices of random integers drawn from [0, 10).
A = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(2, 2)))
B = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(2, 2)))

In [40]:
A

Unnamed: 0,0,1
0,4,4
1,3,6


In [41]:
B

Unnamed: 0,0,1
0,6,3
1,1,5


In [42]:
# Transpose of the matrix A.
A.T

Unnamed: 0,0,1
0,4,3
1,4,6


In [43]:
# Transpose of the matrix B.
B.T

Unnamed: 0,0,1
0,6,1
1,3,5


In [44]:
# Element-wise sum of the two matrices.
A + B

Unnamed: 0,0,1
0,10,7
1,4,11


In [45]:
# Element-wise product of the two matrices (this is NOT the matrix product).
A * B

Unnamed: 0,0,1
0,24,12
1,3,30


In [46]:
# Use DataFrame.dot for the matrix product. The column labels of the caller
# must match the index labels of the argument for the product to be taken.
A.dot(B)

Unnamed: 0,0,1
0,28,32
1,24,39


In [47]:
# Matrix product with the operands in the opposite order (B times A).
B.dot(A)

Unnamed: 0,0,1
0,33,42
1,19,34


## 簡単な統計量

In [48]:
# 5x10 frame of random integers drawn from [0, 10), used for the summary statistics.
df4 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(5, 10)))

In [49]:
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,7,5,5,9,4,8,6,4,4,5
1,8,6,0,9,9,5,0,9,6,3
2,1,8,2,0,1,5,7,1,2,4
3,1,3,3,5,2,2,6,6,0,2
4,1,8,3,2,1,8,6,1,7,0


In [50]:
# Show the basic descriptive statistics of each column.
df4.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,3.6,6.0,2.6,5.0,3.4,5.6,5.0,4.2,3.8,2.8
std,3.577709,2.12132,1.81659,4.062019,3.361547,2.50998,2.828427,3.420526,2.863564,1.923538
min,1.0,3.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0
25%,1.0,5.0,2.0,2.0,1.0,5.0,6.0,1.0,2.0,2.0
50%,1.0,6.0,3.0,5.0,2.0,5.0,6.0,4.0,4.0,3.0
75%,7.0,8.0,3.0,9.0,4.0,8.0,6.0,6.0,6.0,4.0
max,8.0,8.0,5.0,9.0,9.0,8.0,7.0,9.0,7.0,5.0


In [51]:
# Sum of each column.
df4.sum()

0    18
1    30
2    13
3    25
4    17
5    28
6    25
7    21
8    19
9    14
dtype: int64

In [52]:
# Mean of each column.
df4.mean()

0    3.6
1    6.0
2    2.6
3    5.0
4    3.4
5    5.6
6    5.0
7    4.2
8    3.8
9    2.8
dtype: float64

In [53]:
# Unbiased variance of each column (pandas' default, divisor N - 1).
df4.var()

0    12.8
1     4.5
2     3.3
3    16.5
4    11.3
5     6.3
6     8.0
7    11.7
8     8.2
9     3.7
dtype: float64

In [54]:
# Variance of each column with divisor N (not the unbiased estimator).
# ddof is an integer offset ("delta degrees of freedom"), so pass 0 rather than False.
df4.var(ddof=0)

0    10.24
1     3.60
2     2.64
3    13.20
4     9.04
5     5.04
6     6.40
7     9.36
8     6.56
9     2.96
dtype: float64

In [55]:
# Standard deviation of each column based on the unbiased variance (pandas' default, divisor N - 1).
df4.std()

0    3.577709
1    2.121320
2    1.816590
3    4.062019
4    3.361547
5    2.509980
6    2.828427
7    3.420526
8    2.863564
9    1.923538
dtype: float64

In [56]:
# Standard deviation of each column with divisor N (not based on the unbiased variance).
# ddof is an integer offset ("delta degrees of freedom"), so pass 0 rather than False.
df4.std(ddof=0)

0    3.200000
1    1.897367
2    1.624808
3    3.633180
4    3.006659
5    2.244994
6    2.529822
7    3.059412
8    2.561250
9    1.720465
dtype: float64

In [57]:
# Sum of each row.
df4.sum(axis = 1)

0    57
1    55
2    31
3    30
4    37
dtype: int64

In [58]:
# Mean of each row.
df4.mean(axis = 1)

0    5.7
1    5.5
2    3.1
3    3.0
4    3.7
dtype: float64

## 行列の正規化（標準化）

正規化 (normalize) とは、異なる基準のデータを一定の基準にしたがって変形し利用しやすくすることです。

In [59]:
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,7,5,5,9,4,8,6,4,4,5
1,8,6,0,9,9,5,0,9,6,3
2,1,8,2,0,1,5,7,1,2,4
3,1,3,3,5,2,2,6,6,0,2
4,1,8,3,2,1,8,6,1,7,0


In [60]:
# Normalization usually means transforming the values so that each column has
# mean 0 and variance (and standard deviation) 1.
# DataFrame arithmetic broadcasts the per-column statistics across rows, so no
# per-column lambda is needed. A constant column gives 0/0 = NaN, which is mapped to 0.
((df4 - df4.mean()) / df4.std()).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.950329,-0.471405,1.321157,0.984732,0.178489,0.956183,0.353553,-0.058471,0.069843,1.143726
1,1.229837,0.0,-1.431253,0.984732,1.6659,-0.239046,-1.767767,1.403293,0.768273,0.103975
2,-0.726722,0.942809,-0.330289,-1.230915,-0.713957,-0.239046,0.707107,-0.935529,-0.628587,0.62385
3,-0.726722,-1.414214,0.220193,0.0,-0.416475,-1.434274,0.353553,0.526235,-1.327018,-0.4159
4,-0.726722,0.942809,0.220193,-0.738549,-0.713957,0.956183,0.353553,-0.935529,1.117488,-1.455651


In [61]:
# Min-max normalization: rescale each column so that its maximum becomes 1 and its minimum 0.
# The per-column min/max Series broadcast across rows. A constant column gives 0/0 = NaN,
# which is mapped to 0.
((df4 - df4.min()) / (df4.max() - df4.min())).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.857143,0.4,1.0,1.0,0.375,1.0,0.857143,0.375,0.571429,1.0
1,1.0,0.6,0.0,1.0,1.0,0.5,0.0,1.0,0.857143,0.6
2,0.0,1.0,0.4,0.0,0.0,0.5,1.0,0.0,0.285714,0.8
3,0.0,0.0,0.6,0.555556,0.125,0.0,0.857143,0.625,0.0,0.4
4,0.0,1.0,0.6,0.222222,0.0,1.0,0.857143,0.0,1.0,0.0


In [62]:
# Normalize each column so that its values sum to 1.
# The per-column totals broadcast across rows. An all-zero column gives 0/0 = NaN,
# which is mapped to 0.
(df4 / df4.sum()).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.388889,0.166667,0.384615,0.36,0.235294,0.285714,0.24,0.190476,0.210526,0.357143
1,0.444444,0.2,0.0,0.36,0.529412,0.178571,0.0,0.428571,0.315789,0.214286
2,0.055556,0.266667,0.153846,0.0,0.058824,0.178571,0.28,0.047619,0.105263,0.285714
3,0.055556,0.1,0.230769,0.2,0.117647,0.071429,0.24,0.285714,0.0,0.142857
4,0.055556,0.266667,0.230769,0.08,0.058824,0.285714,0.24,0.047619,0.368421,0.0


## 相関行列

相関行列とは、各変数の組み合わせごとの相関係数を並べた行列です。x と y の相関係数は y と x の相関係数に等しいため、相関行列は必ず対称行列になります。

In [63]:
# Start from a random 5x10 matrix of floats drawn uniformly from [0, 1).
df5 = pd.DataFrame(data=np.random.random_sample(size=(5, 10)))

In [64]:
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.226256,0.179995,0.501218,0.042712,0.518461,0.250401,0.538263,0.277074,0.38164,0.206244
1,0.058678,0.581003,0.778982,0.003519,0.347807,0.707787,0.521578,0.328937,0.394446,0.817811
2,0.626644,0.040506,0.117516,0.857788,0.701137,0.275158,0.807031,0.518195,0.98166,0.825709
3,0.204449,0.719918,0.911821,0.955502,0.270247,0.281635,0.148356,0.671398,0.441276,0.380514
4,0.08375,0.181657,0.208537,0.896279,0.70532,0.808194,0.221535,0.217418,0.090762,0.379003


In [65]:
# Correlation matrix between rows: np.corrcoef treats each row of its input as one variable.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray,
# and np.corrcoef accepts it directly, so no list conversion is needed.
pd.DataFrame(np.corrcoef(df5.dropna().values))

Unnamed: 0,0,1,2,3,4
0,1.0,0.331614,0.008597,-0.361545,-0.282089
1,0.331614,1.0,-0.438151,-0.057235,-0.123217
2,0.008597,-0.438151,1.0,-0.345468,0.090937
3,-0.361545,-0.057235,-0.345468,1.0,0.104524
4,-0.282089,-0.123217,0.090937,0.104524,1.0


In [66]:
# Correlation matrix between columns: rowvar=False tells np.corrcoef that each
# column (not each row) is one variable, which is equivalent to transposing first.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray.
pd.DataFrame(np.corrcoef(df5.dropna().values, rowvar=False))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,-0.546697,-0.530846,0.354762,0.446902,-0.654766,0.700749,0.461793,0.924027,0.370019
1,-0.546697,1.0,0.948513,-0.076822,-0.942433,0.081172,-0.584482,0.450873,-0.285252,-0.000772
2,-0.530846,0.948513,1.0,-0.306073,-0.994661,-0.094762,-0.472527,0.383875,-0.263352,-0.129336
3,0.354762,-0.076822,-0.306073,1.0,0.295043,-0.072207,-0.340681,0.507647,0.16083,-0.027219
4,0.446902,-0.942433,-0.994661,0.295043,1.0,0.155644,0.401186,-0.452233,0.163044,0.059461
5,-0.654766,0.081172,-0.094762,-0.072207,0.155644,1.0,-0.321037,-0.618987,-0.632376,0.19221
6,0.700749,-0.584482,-0.472527,-0.340681,0.401186,-0.321037,1.0,-0.069271,0.76752,0.587976
7,0.461793,0.450873,0.383875,0.507647,-0.452233,-0.618987,-0.069271,1.0,0.568691,0.194874
8,0.924027,-0.285252,-0.263352,0.16083,0.163044,-0.632376,0.76752,0.568691,1.0,0.598632
9,0.370019,-0.000772,-0.129336,-0.027219,0.059461,0.19221,0.587976,0.194874,0.598632,1.0
