# Numpy と Pandas を用いた演算

In [1]:
import numpy as np
import pandas as pd

## まずは Pandas の基本操作から

In [2]:
# Build a small 3x3 integer frame, column by column:
# rows are labelled i1..i3 and columns a..c.
df = pd.DataFrame(
    {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
    index=['i1', 'i2', 'i3'],
)

In [3]:
# Display the whole frame to check its contents.
df

Unnamed: 0,a,b,c
i1,1,4,7
i2,2,5,8
i3,3,6,9


In [4]:
# Select a single row by its index label.
# `.ix` was removed from pandas; `.loc` is the label-based indexer
# (use `.iloc` for position-based access).
df.loc['i1']

a    1
b    4
c    7
Name: i1, dtype: int64

In [5]:
# Select a single row by its integer position (0-based).
# `.ix` was removed from pandas; `.iloc` is the position-based indexer.
df.iloc[1]

a    2
b    5
c    8
Name: i2, dtype: int64

In [6]:
# Passing a column label as the second argument returns the single value
# at that (row, column) position instead of a whole row.
# `.loc` replaces the removed `.ix` indexer for label-based access.
df.loc['i1', 'a']

1

In [7]:
# A bare `:` selects everything along that axis (as in R or Scilab),
# so this returns column 'a' for all rows.
# `.loc` replaces the removed `.ix` indexer for label-based access.
df.loc[:, 'a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [8]:
# Several rows/columns can be selected at once by passing lists.
# Rows are given by position and columns by label here; since `.ix` was
# removed, translate the row positions to labels via `df.index` and use `.loc`.
df.loc[df.index[[1, 2]], ['b', 'c']]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [9]:
# Same selection using integer positions for both rows and columns.
# `.iloc` replaces the removed `.ix` indexer for position-based access.
df.iloc[[1, 2], [1, 2]]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [10]:
# Column access: index the frame with [column name]; the result is a Series.
df['a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [11]:
# Get the column's data as a NumPy array (the index labels are dropped).
df['a'].values

array([1, 2, 3])

In [12]:
# Indexing the column Series again with an index label yields a single value.
df['a']['i3']

3

In [13]:
# Treat the DataFrame as a plain table and fetch a value explicitly by
# (row position, column position).
df.iloc[0,0]

1

In [14]:
# Returns the same value through the scalar positional accessor `.iat`
# (`.ix` was removed from pandas and can no longer be used here).
df.iat[0, 0]

1

In [15]:
# Fetch a row by position: a single integer passed to .iloc selects the row
# at that position (here the third row, 'i3'), not a column.
df.iloc[2]

a    3
b    6
c    9
Name: i3, dtype: int64

## Numpy で生成した乱数を Pandas で使う

In [45]:
# Wrap a 4x5 array of random integers drawn from [0, 10) in a DataFrame.
df1 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(4, 5)))

In [46]:
# Display the random frame.
df1

Unnamed: 0,0,1,2,3,4
0,8,4,8,6,8
1,3,7,7,0,3
2,1,9,0,6,5
3,0,9,0,0,1


In [47]:
# Keep only the elements that satisfy the condition; the shape is preserved
# and elements that fail the condition show up as NaN.
df1[df1>5]

Unnamed: 0,0,1,2,3,4
0,8.0,,8.0,6.0,8.0
1,,7.0,7.0,,
2,,9.0,,6.0,
3,,9.0,,,


In [48]:
# Assign -1 to every element that satisfies the condition (modifies df1 in place).
df1[df1>5] = -1

In [49]:
# Display df1 again to confirm the masked assignment.
df1

Unnamed: 0,0,1,2,3,4
0,-1,4,-1,-1,-1
1,3,-1,-1,0,3
2,1,-1,0,-1,5
3,0,-1,0,0,1


## 欠損値を含むデータを取り扱う

In [73]:
# Build random data containing missing values: draw integers from [0, 10)
# and turn every element that is not strictly positive (i.e. the zeros) into NaN.
df2 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(8, 7)))
df2 = df2.where(df2 > 0)

In [74]:
# Display the frame with missing values.
df2

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5.0,4.0,3.0,6
1,2,6,4,,9.0,8.0,5
2,3,4,3,5.0,,8.0,6
3,5,7,9,,2.0,,7
4,6,2,7,4.0,6.0,2.0,8
5,3,6,7,1.0,6.0,4.0,1
6,3,5,9,2.0,7.0,7.0,6
7,4,5,5,1.0,7.0,8.0,4


In [75]:
# Drop every row that contains a NaN (returns a new frame; df2 is unchanged).
df2.dropna()

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5,4,3,6
4,6,2,7,4,6,2,8
5,3,6,7,1,6,4,1
6,3,5,9,2,7,7,6
7,4,5,5,1,7,8,4


In [76]:
# Drop every column that contains a NaN.
df2.dropna(axis=1)

Unnamed: 0,0,1,2,6
0,5,6,1,6
1,2,6,4,5
2,3,4,3,6
3,5,7,9,7
4,6,2,7,8
5,3,6,7,1
6,3,5,9,6
7,4,5,5,4


In [78]:
# Fill every NaN with a constant value (-1 here).
df2.fillna(-1)

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5,4,3,6
1,2,6,4,-1,9,8,5
2,3,4,3,5,-1,8,6
3,5,7,9,-1,2,-1,7
4,6,2,7,4,6,2,8
5,3,6,7,1,6,4,1
6,3,5,9,2,7,7,6
7,4,5,5,1,7,8,4


In [79]:
# Fill each NaN with the previous valid value in the same column (forward fill).
# `fillna(method='pad')` is deprecated in recent pandas; `ffill()` is the
# direct replacement.
df2.ffill()

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5,4,3,6
1,2,6,4,5,9,8,5
2,3,4,3,5,9,8,6
3,5,7,9,5,2,8,7
4,6,2,7,4,6,2,8
5,3,6,7,1,6,4,1
6,3,5,9,2,7,7,6
7,4,5,5,1,7,8,4


In [80]:
# Fill each NaN with the next valid value in the same column (backward fill).
# `fillna(method='bfill')` is deprecated in recent pandas; `bfill()` is the
# direct replacement.
df2.bfill()

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5,4,3,6
1,2,6,4,5,9,8,5
2,3,4,3,5,2,8,6
3,5,7,9,4,2,2,7
4,6,2,7,4,6,2,8
5,3,6,7,1,6,4,1
6,3,5,9,2,7,7,6
7,4,5,5,1,7,8,4


In [81]:
# Fill each missing value by linear interpolation between the values before
# and after it in the same column. DataFrame.interpolate() does this
# column-wise by default, so no per-column apply is needed.
df2.interpolate()

Unnamed: 0,0,1,2,3,4,5,6
0,5,6,1,5.0,4.0,3,6
1,2,6,4,5.0,9.0,8,5
2,3,4,3,5.0,5.5,8,6
3,5,7,9,4.5,2.0,5,7
4,6,2,7,4.0,6.0,2,8
5,3,6,7,1.0,6.0,4,1
6,3,5,9,2.0,7.0,7,6
7,4,5,5,1.0,7.0,8,4


## 重複のあるデータを取り扱う

In [83]:
# A 10x4 frame of random 0/1 values, so that duplicate rows are likely.
df3 = pd.DataFrame(data=np.random.randint(low=0, high=2, size=(10, 4)))

In [84]:
# Display the 0/1 frame.
df3

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,1,0
2,1,0,0,0
3,1,1,1,1
4,1,1,1,0
5,0,1,0,0
6,0,1,1,1
7,0,1,1,0
8,0,1,1,0
9,1,0,0,1


In [85]:
# Flag duplicated rows: True marks a row identical to an earlier one.
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [86]:
# The duplicate check can be restricted to given column(s); here only
# column 0 is compared.
df3.duplicated(0)

0    False
1     True
2    False
3     True
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [87]:
# Remove duplicated rows, keeping the first occurrence of each.
df3.drop_duplicates()

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,1,0
2,1,0,0,0
3,1,1,1,1
4,1,1,1,0
5,0,1,0,0
6,0,1,1,1
7,0,1,1,0
9,1,0,0,1


In [88]:
# Remove rows whose value in the given column (column 0) has already appeared;
# the first row for each distinct value is kept.
df3.drop_duplicates(0)

Unnamed: 0,0,1,2,3
0,0,0,0,0
2,1,0,0,0


In [89]:
# Remove duplicates in the given column (column 0) but keep the LAST
# occurrence of each value. The old `take_last=True` flag was removed from
# pandas; `keep='last'` is its replacement.
df3.drop_duplicates(subset=0, keep='last')

Unnamed: 0,0,1,2,3
8,0,1,1,0
9,1,0,0,1


## 行列演算を行う

In [91]:
# Two independent 2x2 frames of random integers in [0, 10), used as matrices below.
A = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(2, 2)))
B = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(2, 2)))

In [92]:
# Display matrix A.
A

Unnamed: 0,0,1
0,5,7
1,4,2


In [93]:
# Display matrix B.
B

Unnamed: 0,0,1
0,5,4
1,3,1


In [96]:
# Transpose of matrix A.
A.T

Unnamed: 0,0,1
0,5,4
1,7,2


In [97]:
# Transpose of matrix B.
B.T

Unnamed: 0,0,1
0,5,3
1,4,1


In [94]:
# Element-wise sum of the two frames.
A + B

Unnamed: 0,0,1
0,10,11
1,7,3


In [95]:
# `*` multiplies element by element; it is NOT the matrix product.
A * B

Unnamed: 0,0,1
0,25,28
1,12,2


In [98]:
# Use DataFrame.dot for the matrix product. For it to work, the column labels
# of the caller must match the index labels of the argument.
A.dot(B)

Unnamed: 0,0,1
0,46,27
1,26,18


In [99]:
# Matrix product in the opposite order; the result differs from A.dot(B) above.
B.dot(A)

Unnamed: 0,0,1
0,41,43
1,19,23


## 簡単な統計量

In [102]:
# A 5x10 frame of random integers in [0, 10) used for the statistics below.
df4 = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(5, 10)))

In [103]:
# Display the frame used for the statistics examples.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,2,3,5,7,5,4,9,0,3
1,9,8,3,5,3,5,5,1,0,1
2,7,0,7,2,2,8,5,1,5,2
3,5,6,6,4,1,9,8,4,8,9
4,6,7,4,7,5,4,6,9,0,0


In [111]:
# Show summary statistics per column (count, mean, std, min, quartiles, max).
df4.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,7.2,4.6,4.6,4.6,3.6,6.2,5.6,4.8,2.6,3.0
std,1.788854,3.435113,1.81659,1.81659,2.408319,2.167948,1.516575,4.024922,3.714835,3.535534
min,5.0,0.0,3.0,2.0,1.0,4.0,4.0,1.0,0.0,0.0
25%,6.0,2.0,3.0,4.0,2.0,5.0,5.0,1.0,0.0,1.0
50%,7.0,6.0,4.0,5.0,3.0,5.0,5.0,4.0,0.0,2.0
75%,9.0,7.0,6.0,5.0,5.0,8.0,6.0,9.0,5.0,3.0
max,9.0,8.0,7.0,7.0,7.0,9.0,8.0,9.0,8.0,9.0


In [106]:
# Sum of each column.
df4.sum()

0    36
1    23
2    23
3    23
4    18
5    31
6    28
7    24
8    13
9    15
dtype: int64

In [105]:
# Mean of each column.
df4.mean()

0    7.2
1    4.6
2    4.6
3    4.6
4    3.6
5    6.2
6    5.6
7    4.8
8    2.6
9    3.0
dtype: float64

In [107]:
# Unbiased variance of each column (divides by N - 1, the pandas default).
df4.var()

0     3.2
1    11.8
2     3.3
3     3.3
4     5.8
5     4.7
6     2.3
7    16.2
8    13.8
9    12.5
dtype: float64

In [108]:
# Population variance of each column (divides by N rather than N - 1).
# `ddof` is the integer "delta degrees of freedom"; pass 0 explicitly instead
# of relying on False being coerced to 0.
df4.var(ddof=0)

0     2.56
1     9.44
2     2.64
3     2.64
4     4.64
5     3.76
6     1.84
7    12.96
8    11.04
9    10.00
dtype: float64

In [109]:
# Standard deviation of each column based on the unbiased variance
# (divides by N - 1, the pandas default).
df4.std()

0    1.788854
1    3.435113
2    1.816590
3    1.816590
4    2.408319
5    2.167948
6    1.516575
7    4.024922
8    3.714835
9    3.535534
dtype: float64

In [110]:
# Population standard deviation of each column (divides by N rather than N - 1).
# `ddof` is the integer "delta degrees of freedom"; pass 0 explicitly instead
# of relying on False being coerced to 0.
df4.std(ddof=0)

0    1.600000
1    3.072458
2    1.624808
3    1.624808
4    2.154066
5    1.939072
6    1.356466
7    3.600000
8    3.322650
9    3.162278
dtype: float64

In [112]:
# Sum of each row (axis=1 aggregates across the columns).
df4.sum(axis = 1)

0    47
1    40
2    39
3    60
4    48
dtype: int64

In [113]:
# Mean of each row (axis=1 aggregates across the columns).
df4.mean(axis = 1)

0    4.7
1    4.0
2    3.9
3    6.0
4    4.8
dtype: float64

## 行列の正規化（標準化）

正規化 (normalize) とは、異なる基準のデータを一定の基準にしたがって変形し利用しやすくすることです。

In [115]:
# Display the original frame again for comparison with the normalised results.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,2,3,5,7,5,4,9,0,3
1,9,8,3,5,3,5,5,1,0,1
2,7,0,7,2,2,8,5,1,5,2
3,5,6,6,4,1,9,8,4,8,9
4,6,7,4,7,5,4,6,9,0,0


In [120]:
# Normalisation usually means transforming the values so that each column has
# mean 0 and variance (hence standard deviation) 1, i.e. a z-score.
# Subtract the per-column mean, divide by the per-column standard deviation,
# then replace any resulting NaN with 0.
df4.sub(df4.mean(), axis='columns').div(df4.std(), axis='columns').fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.006231,-0.756889,-0.880771,0.220193,1.411773,-0.553519,-1.055009,1.043498,-0.699896,0.0
1,1.006231,0.989778,-0.880771,0.220193,-0.249136,-0.553519,-0.395628,-0.944118,-0.699896,-0.565685
2,-0.111803,-1.339112,1.321157,-1.431253,-0.664364,0.830278,-0.395628,-0.944118,0.646058,-0.282843
3,-1.229837,0.407556,0.770675,-0.330289,-1.079591,1.291544,1.582513,-0.198762,1.453631,1.697056
4,-0.67082,0.698667,-0.330289,1.321157,0.581318,-1.014784,0.263752,1.043498,-0.699896,-0.848528


In [121]:
# Min-max normalisation: rescale each column so its maximum becomes 1 and its
# minimum becomes 0. Any resulting NaN is replaced with 0.
df4.sub(df4.min(), axis='columns').div(df4.max() - df4.min(), axis='columns').fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.25,0.0,0.6,1.0,0.2,0.0,1.0,0.0,0.333333
1,1.0,1.0,0.0,0.6,0.333333,0.2,0.25,0.0,0.0,0.111111
2,0.5,0.0,1.0,0.0,0.166667,0.8,0.25,0.0,0.625,0.222222
3,0.0,0.75,0.75,0.4,0.0,1.0,1.0,0.375,1.0,1.0
4,0.25,0.875,0.25,1.0,0.666667,0.0,0.5,1.0,0.0,0.0


In [122]:
# Normalise each column so that its values sum to 1 by dividing by the
# column total. Any resulting NaN is replaced with 0.
df4.div(df4.sum(), axis='columns').fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.25,0.086957,0.130435,0.217391,0.388889,0.16129,0.142857,0.375,0.0,0.2
1,0.25,0.347826,0.130435,0.217391,0.166667,0.16129,0.178571,0.041667,0.0,0.066667
2,0.194444,0.0,0.304348,0.086957,0.111111,0.258065,0.178571,0.041667,0.384615,0.133333
3,0.138889,0.26087,0.26087,0.173913,0.055556,0.290323,0.285714,0.166667,0.615385,0.6
4,0.166667,0.304348,0.173913,0.304348,0.277778,0.129032,0.214286,0.375,0.0,0.0
