# Numpy と Pandas を用いた演算

In [1]:
# 数値計算やデータフレーム操作に関するライブラリをインポートする
import numpy as np
import pandas as pd

## まずは Pandas の基本操作から

In [2]:
# Small 3x3 example frame: rows are labelled i1..i3, columns a..c.
df = pd.DataFrame(
    {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
    index=['i1', 'i2', 'i3'],
)

In [3]:
# Display the example frame.
df

Unnamed: 0,a,b,c
i1,1,4,7
i2,2,5,8
i3,3,6,9


In [4]:
# Select a single row by its index label.
# (.ix was removed from pandas; use .loc for labels and .iloc for positions.)
df.loc['i1']

a    1
b    4
c    7
Name: i1, dtype: int64

In [5]:
# Select a single row by integer position (position 1 is the second row).
df.iloc[1]

a    2
b    5
c    8
Name: i2, dtype: int64

In [6]:
# Passing a column label as the second argument returns the single value
# at that row/column intersection.
df.loc['i1', 'a']

1

In [7]:
# ':' selects everything along that axis (same convention as R and Scilab).
df.loc[:, 'a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [8]:
# Several rows/columns can be selected at once by passing lists.
# Rows are given by position and columns by label here, so the row positions
# are first translated to labels through df.index before using .loc.
df.loc[df.index[[1, 2]], ['b', 'c']]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [9]:
# Select rows 1-2 and columns 1-2 purely by integer position.
df.iloc[[1, 2], [1, 2]]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [10]:
# Column access is done with df[column_name].
df['a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [11]:
# Get the column's data as a NumPy array.
df['a'].values

array([1, 2, 3])

In [12]:
# Indexing the selected column by an index label yields a single value.
df['a']['i3']

3

In [13]:
# Treat the DataFrame as a table and fetch a value explicitly by
# (row position, column position).
df.iloc[0,0]

1

In [14]:
# Same value again, this time through the scalar positional accessor .iat
# (.ix was removed from pandas).
df.iat[0, 0]

1

In [15]:
# Get a row by integer position (position 2 is the third row, 'i3').
df.iloc[2]

a    3
b    6
c    9
Name: i3, dtype: int64

## Numpy で生成した乱数を Pandas で使う

In [16]:
# 4x5 frame of random integers drawn from 0..9.
df1 = pd.DataFrame(data=np.random.randint(0, 10, size=(4, 5)))

In [17]:
# Display the random frame.
df1

Unnamed: 0,0,1,2,3,4
0,8,2,2,6,1
1,3,9,6,3,7
2,3,6,2,0,5
3,3,0,4,4,5


In [18]:
# Keep only the entries that satisfy the condition; all others become NaN.
df1[df1>5]

Unnamed: 0,0,1,2,3,4
0,8.0,,,6.0,
1,,9.0,6.0,,7.0
2,,6.0,,,
3,,,,,


In [19]:
# Assign -1 to every entry that satisfies the condition (modifies df1 in place).
df1[df1>5] = -1

In [20]:
# Display df1 after the in-place replacement.
df1

Unnamed: 0,0,1,2,3,4
0,-1,2,2,-1,1
1,3,-1,-1,3,-1
2,3,-1,2,0,5
3,3,0,4,4,5


## 欠損値を含むデータを取り扱う

In [21]:
# Build random data containing missing values: draw integers from 0..9 and
# turn every entry that is not strictly positive (i.e. the zeros) into NaN.
df2 = pd.DataFrame(data=np.random.randint(0, 10, size=(8, 7)))
df2 = df2.where(df2 > 0)

In [22]:
# Display the frame with missing values.
df2

Unnamed: 0,0,1,2,3,4,5,6
0,9,2.0,2.0,1,2,2,3.0
1,8,,2.0,2,6,1,8.0
2,3,4.0,,4,5,9,
3,3,5.0,3.0,1,6,3,4.0
4,8,5.0,2.0,9,4,7,2.0
5,6,3.0,7.0,9,9,3,
6,6,1.0,6.0,3,5,3,2.0
7,4,8.0,3.0,6,3,7,2.0


In [23]:
# Drop every row that contains NaN (the result is displayed, df2 is not reassigned).
df2.dropna()

Unnamed: 0,0,1,2,3,4,5,6
0,9,2,2,1,2,2,3
3,3,5,3,1,6,3,4
4,8,5,2,9,4,7,2
6,6,1,6,3,5,3,2
7,4,8,3,6,3,7,2


In [24]:
# Drop every column that contains NaN.
df2.dropna(axis=1)

Unnamed: 0,0,3,4,5
0,9,1,2,2
1,8,2,6,1
2,3,4,5,9
3,3,1,6,3
4,8,9,4,7
5,6,9,9,3
6,6,3,5,3
7,4,6,3,7


In [25]:
# Replace NaN with a constant (-1 here).
df2.fillna(-1)

Unnamed: 0,0,1,2,3,4,5,6
0,9,2,2,1,2,2,3
1,8,-1,2,2,6,1,8
2,3,4,-1,4,5,9,-1
3,3,5,3,1,6,3,4
4,8,5,2,9,4,7,2
5,6,3,7,9,9,3,-1
6,6,1,6,3,5,3,2
7,4,8,3,6,3,7,2


In [26]:
# Fill each NaN with the previous valid value in the same column (forward fill).
# fillna(method='pad') is deprecated; ffill() is the dedicated method.
df2.ffill()

Unnamed: 0,0,1,2,3,4,5,6
0,9,2,2,1,2,2,3
1,8,2,2,2,6,1,8
2,3,4,2,4,5,9,8
3,3,5,3,1,6,3,4
4,8,5,2,9,4,7,2
5,6,3,7,9,9,3,2
6,6,1,6,3,5,3,2
7,4,8,3,6,3,7,2


In [27]:
# Fill each NaN with the next valid value in the same column (backward fill).
# fillna(method='bfill') is deprecated; bfill() is the dedicated method.
df2.bfill()

Unnamed: 0,0,1,2,3,4,5,6
0,9,2,2,1,2,2,3
1,8,4,2,2,6,1,8
2,3,4,3,4,5,9,4
3,3,5,3,1,6,3,4
4,8,5,2,9,4,7,2
5,6,3,7,9,9,3,2
6,6,1,6,3,5,3,2
7,4,8,3,6,3,7,2


In [28]:
# Fill each missing value by linear interpolation between the neighbouring
# values in the same column (DataFrame.interpolate works column-wise by default).
df2.interpolate()

Unnamed: 0,0,1,2,3,4,5,6
0,9,2,2.0,1,2,2,3
1,8,3,2.0,2,6,1,8
2,3,4,2.5,4,5,9,6
3,3,5,3.0,1,6,3,4
4,8,5,2.0,9,4,7,2
5,6,3,7.0,9,9,3,2
6,6,1,6.0,3,5,3,2
7,4,8,3.0,6,3,7,2


## 重複のあるデータを取り扱う

In [29]:
# 10x4 frame of random 0/1 values, so duplicated rows are likely.
df3 = pd.DataFrame(data=np.random.randint(0, 2, size=(10, 4)))

In [30]:
# Display the 0/1 frame.
df3

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,0,0,1,1
2,0,0,1,0
3,1,1,0,1
4,1,1,1,1
5,0,0,0,1
6,0,1,1,1
7,1,1,0,1
8,0,1,0,0
9,0,1,0,1


In [31]:
# Flag rows that duplicate an earlier row (True marks the repeated occurrence).
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8    False
9    False
dtype: bool

In [32]:
# The duplicate check can be restricted to selected columns (here column 0).
df3.duplicated(subset=[0])

0    False
1     True
2     True
3    False
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [33]:
# Remove duplicated rows, keeping the first occurrence of each.
df3.drop_duplicates()

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,0,0,1,1
2,0,0,1,0
3,1,1,0,1
4,1,1,1,1
5,0,0,0,1
8,0,1,0,0
9,0,1,0,1


In [34]:
# Remove rows whose value in the given column (column 0) was already seen.
df3.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2,3
0,0,1,1,1
3,1,1,0,1


In [35]:
# Remove duplicates on the given column (column 0), keeping the last occurrence.
# The old take_last=True flag was removed from pandas; keep='last' replaces it.
df3.drop_duplicates(subset=[0], keep='last')

Unnamed: 0,0,1,2,3
7,1,1,0,1
9,0,1,0,1


## 行列演算を行う

In [36]:
# Two random 2x2 integer matrices (entries 0..9) for the matrix examples.
# A is drawn first, then B, so the random stream is consumed in the same order.
A, B = (pd.DataFrame(np.random.randint(0, 10, size=(2, 2))) for _ in range(2))

In [37]:
# Display matrix A.
A

Unnamed: 0,0,1
0,5,1
1,5,6


In [38]:
# Display matrix B.
B

Unnamed: 0,0,1
0,8,6
1,8,2


In [39]:
# Transpose of A.
A.T

Unnamed: 0,0,1
0,5,5
1,1,6


In [40]:
# Transpose of B.
B.T

Unnamed: 0,0,1
0,8,8
1,6,2


In [41]:
# Element-wise sum of the two matrices.
A + B

Unnamed: 0,0,1
0,13,7
1,13,8


In [42]:
# Element-wise product of the two matrices (this is NOT the matrix product).
A * B

Unnamed: 0,0,1
0,40,6
1,40,12


In [43]:
# Use DataFrame.dot for the true matrix product. The column labels of the
# left operand must match the index labels of the argument.
A.dot(B)

Unnamed: 0,0,1
0,48,32
1,88,42


In [44]:
# Matrix product with the operands in the opposite order (B times A).
B.dot(A)

Unnamed: 0,0,1
0,70,44
1,50,20


## 簡単な統計量

In [45]:
# 5x10 frame of random integers drawn from 0..9 for the statistics examples.
df4 = pd.DataFrame(data=np.random.randint(0, 10, size=(5, 10)))

In [46]:
# Display the frame used for the statistics examples.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4,3,7,0,7,2,4,8,6,8
1,8,5,0,2,6,0,6,6,4,4
2,7,4,1,7,0,5,4,0,1,7
3,0,0,6,6,9,1,0,8,2,9
4,1,0,3,7,0,2,9,0,2,9


In [47]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
df4.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5,5.0
mean,4.0,2.4,3.4,4.4,4.4,2.0,4.6,4.4,3,7.4
std,3.535534,2.302173,3.04959,3.209361,4.159327,1.870829,3.286335,4.09878,2,2.073644
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.0
25%,1.0,0.0,1.0,2.0,0.0,1.0,4.0,0.0,2,7.0
50%,4.0,3.0,3.0,6.0,6.0,2.0,4.0,6.0,2,8.0
75%,7.0,4.0,6.0,7.0,7.0,2.0,6.0,8.0,4,9.0
max,8.0,5.0,7.0,7.0,9.0,5.0,9.0,8.0,6,9.0


In [48]:
# Sum of each column.
df4.sum()

0    20
1    12
2    17
3    22
4    22
5    10
6    23
7    22
8    15
9    37
dtype: int64

In [49]:
# Mean of each column.
df4.mean()

0    4.0
1    2.4
2    3.4
3    4.4
4    4.4
5    2.0
6    4.6
7    4.4
8    3.0
9    7.4
dtype: float64

In [50]:
# Unbiased variance of each column (divides by N-1).
df4.var()

0    12.5
1     5.3
2     9.3
3    10.3
4    17.3
5     3.5
6    10.8
7    16.8
8     4.0
9     4.3
dtype: float64

In [51]:
# Population variance of each column: ddof=0 divides by N instead of N-1.
# ddof is an integer "delta degrees of freedom", so pass 0 rather than False.
df4.var(ddof=0)

0    10.00
1     4.24
2     7.44
3     8.24
4    13.84
5     2.80
6     8.64
7    13.44
8     3.20
9     3.44
dtype: float64

In [52]:
# Standard deviation of each column based on the unbiased variance (divides by N-1).
df4.std()

0    3.535534
1    2.302173
2    3.049590
3    3.209361
4    4.159327
5    1.870829
6    3.286335
7    4.098780
8    2.000000
9    2.073644
dtype: float64

In [53]:
# Population standard deviation of each column: ddof=0 divides by N instead of N-1.
# ddof is an integer "delta degrees of freedom", so pass 0 rather than False.
df4.std(ddof=0)

0    3.162278
1    2.059126
2    2.727636
3    2.870540
4    3.720215
5    1.673320
6    2.939388
7    3.666061
8    1.788854
9    1.854724
dtype: float64

In [54]:
# Sum of each row.
df4.sum(axis = 1)

0    49
1    41
2    36
3    41
4    33
dtype: int64

In [55]:
# Mean of each row.
df4.mean(axis = 1)

0    4.9
1    4.1
2    3.6
3    4.1
4    3.3
dtype: float64

## 行列の正規化（標準化）

正規化 (normalize) とは、異なる基準のデータを一定の基準にしたがって変形し利用しやすくすることです。

In [56]:
# Display df4 again as the input of the normalization examples.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4,3,7,0,7,2,4,8,6,8
1,8,5,0,2,6,0,6,6,4,4
2,7,4,1,7,0,5,4,0,1,7
3,0,0,6,6,9,1,0,8,2,9
4,1,0,3,7,0,2,9,0,2,9


In [57]:
# Normalization usually means standardization: shift each column to mean 0
# and scale it so that its variance (and standard deviation) becomes 1.
# Any NaN in the result (e.g. 0/0 for a constant column) is replaced with 0.
((df4 - df4.mean()) / df4.std()).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.260623,1.180487,-1.370989,0.625101,0.0,-0.182574,0.87831,1.5,0.289346
1,1.131371,1.129368,-1.114904,-0.747812,0.384678,-1.069045,0.426006,0.39036,0.5,-1.639626
2,0.848528,0.694996,-0.786991,0.81013,-1.057863,1.603567,-0.182574,-1.07349,-1.0,-0.192897
3,-1.131371,-1.042493,0.852574,0.498542,1.105948,-0.534522,-1.399735,0.87831,-0.5,0.771589
4,-0.848528,-1.042493,-0.131165,0.81013,-1.057863,0.0,1.338877,-1.07349,-0.5,0.771589


In [58]:
# Min-max scaling: map each column's minimum to 0 and its maximum to 1.
# Any NaN in the result (e.g. 0/0 for a constant column) is replaced with 0.
((df4 - df4.min()) / (df4.max() - df4.min())).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.5,0.6,1.0,0.0,0.777778,0.4,0.444444,1.0,1.0,0.8
1,1.0,1.0,0.0,0.285714,0.666667,0.0,0.666667,0.75,0.6,0.0
2,0.875,0.8,0.142857,1.0,0.0,1.0,0.444444,0.0,0.0,0.6
3,0.0,0.0,0.857143,0.857143,1.0,0.2,0.0,1.0,0.2,1.0
4,0.125,0.0,0.428571,1.0,0.0,0.4,1.0,0.0,0.2,1.0


In [59]:
# Scale each column so that its entries sum to 1.
# Any NaN in the result (e.g. 0/0 for an all-zero column) is replaced with 0.
df4.div(df4.sum(), axis='columns').fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.2,0.25,0.411765,0.0,0.318182,0.2,0.173913,0.363636,0.4,0.216216
1,0.4,0.416667,0.0,0.090909,0.272727,0.0,0.26087,0.272727,0.266667,0.108108
2,0.35,0.333333,0.058824,0.318182,0.0,0.5,0.173913,0.0,0.066667,0.189189
3,0.0,0.0,0.352941,0.272727,0.409091,0.1,0.0,0.363636,0.133333,0.243243
4,0.05,0.0,0.176471,0.318182,0.0,0.2,0.391304,0.0,0.133333,0.243243


## 相関行列

相関行列とは、各変数間の相関係数を並べた行列です。相関係数は順序によらない（x と y の相関は y と x の相関に等しい）ため、相関行列は必ず対称行列になり、対角成分はすべて 1 になります。

In [60]:
# Start from a random 5x10 matrix with entries drawn uniformly from [0, 1).
df5 = pd.DataFrame(data=np.random.random_sample(size=(5, 10)))

In [61]:
# Display the random matrix.
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.336882,0.327188,0.758929,0.834595,0.516328,0.569779,0.177002,0.159889,0.711203,0.150895
1,0.999646,0.805735,0.933125,0.037694,0.430594,0.844623,0.605486,0.432771,0.650326,0.288624
2,0.174822,0.906186,0.424933,0.773751,0.62667,0.829352,0.027034,0.159202,0.909636,0.349619
3,0.255684,0.504232,0.650235,0.74569,0.03544,0.057191,0.321197,0.762619,0.565069,0.576957
4,0.383623,0.828361,0.002174,0.4651,0.956608,0.84573,0.18854,0.595549,0.587776,0.885108


In [62]:
# Correlation matrix between rows: np.corrcoef treats each row of its input
# as one variable. DataFrame.as_matrix() was removed from pandas, so the
# underlying array is taken with .values (no list conversion is needed).
pd.DataFrame(np.corrcoef(df5.dropna().values))

Unnamed: 0,0,1,2,3,4
0,1.0,-0.01758,0.63622,0.085246,-0.216958
1,-0.01758,1.0,-0.048774,-0.379378,-0.289402
2,0.63622,-0.048774,1.0,-0.059842,0.472255
3,0.085246,-0.379378,-0.059842,1.0,-0.34697
4,-0.216958,-0.289402,0.472255,-0.34697,1.0


In [63]:
# Correlation matrix between columns: transpose first so that each original
# column becomes a row, which np.corrcoef treats as one variable.
# DataFrame.as_matrix() was removed from pandas, so .values is used instead.
pd.DataFrame(np.corrcoef(df5.dropna().T.values))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.217427,0.506052,-0.935105,-0.031041,0.373131,0.89862,0.086275,-0.298609,-0.213883
1,0.217427,1.0,-0.429866,-0.491785,0.497539,0.671322,-0.005676,0.001015,0.321527,0.368532
2,0.506052,-0.429866,1.0,-0.208702,-0.701378,-0.267397,0.60581,-0.205548,0.026929,-0.816271
3,-0.935105,-0.491785,-0.208702,1.0,-0.141911,-0.472731,-0.817688,-0.247005,0.330518,-0.097396
4,-0.031041,0.497539,-0.701378,-0.141911,1.0,0.82344,-0.400399,-0.330133,0.231603,0.347429
5,0.373131,0.671322,-0.267397,-0.472731,0.82344,1.0,-0.059894,-0.528745,0.443991,-0.03952
6,0.89862,-0.005676,0.60581,-0.817688,-0.400399,-0.059894,1.0,0.410318,-0.556862,-0.123324
7,0.086275,0.001015,-0.205548,-0.247005,-0.330133,-0.528745,0.410318,1.0,-0.84296,0.723358
8,-0.298609,0.321527,0.026929,0.330518,0.231603,0.443991,-0.556862,-0.84296,1.0,-0.518372
9,-0.213883,0.368532,-0.816271,-0.097396,0.347429,-0.03952,-0.123324,0.723358,-0.518372,1.0
