# Numpy と Pandas を用いた演算

In [1]:
import numpy as np
import pandas as pd

## まずは Pandas の基本操作から

In [2]:
# Build a 3x3 demo frame: zipping the three column value lists yields the rows
# (1, 4, 7), (2, 5, 8), (3, 6, 9); rows are labelled i1..i3 and columns a..c.
df = pd.DataFrame(
    list(zip([1, 2, 3], [4, 5, 6], [7, 8, 9])),
    columns = list("abc"),
    index = ['i1','i2','i3'],
)

In [3]:
# Render df as the cell output to inspect its row labels (i1-i3) and columns (a-c).
df

Unnamed: 0,a,b,c
i1,1,4,7
i2,2,5,8
i3,3,6,9


In [4]:
# Extract one row by its index label. (.loc is label-based; use .iloc for
# integer positions. The old mixed .ix accessor no longer exists in pandas.)
df.loc['i1']

a    1
b    4
c    7
Name: i1, dtype: int64

In [5]:
# Extract one row by integer position (position 1 is the second row, i2).
df.iloc[1]

a    2
b    5
c    8
Name: i2, dtype: int64

In [6]:
# Passing a column label as the second argument returns the single value
# stored at that (row, column) position rather than a whole row.
df.loc['i1','a']

1

In [7]:
# `:` selects everything along that axis (same convention as R and Scilab),
# so this returns every row of column 'a'.
df.loc[:, 'a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [8]:
# Several rows and columns can be selected at once by passing lists.
# .loc is label-based, so the row positions [1, 2] are first turned into
# labels through df.index before being combined with the column labels.
df.loc[df.index[[1,2]], ['b','c']]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [9]:
# The same selection written purely with integer positions for rows and columns.
df.iloc[[1,2], [1,2]]

Unnamed: 0,b,c
i2,5,8
i3,6,9


In [10]:
# Column access uses plain square brackets with the column name; the result is a Series.
df['a']

i1    1
i2    2
i3    3
Name: a, dtype: int64

In [11]:
# Get the column's values as a NumPy array.
df['a'].values

array([1, 2, 3])

In [12]:
# Indexing the column Series again with a row label returns a single value.
df['a']['i3']

3

In [13]:
# Treat the DataFrame as a table and fetch a value explicitly by
# (row position, column position).
df.iloc[0,0]

1

In [14]:
# Same element again, this time through the scalar positional accessor .iat
# (used here in place of the removed .ix accessor).
df.iat[0,0]

1

In [15]:
# Get a row by integer position: a single integer passed to .iloc selects a
# row (here the third row, i3), not a column.
df.iloc[2]

a    3
b    6
c    9
Name: i3, dtype: int64

## Numpy で生成した乱数を Pandas で使う

In [16]:
# 4x5 frame of random integers drawn from 0-9 (upper bound 10 is exclusive).
# No seed is set in this cell, so the values change from run to run.
df1 = pd.DataFrame(np.random.randint(0, 10, size=(4, 5)))

In [17]:
# Show the random frame before any filtering is applied.
df1

Unnamed: 0,0,1,2,3,4
0,3,5,7,3,6
1,2,6,2,9,9
2,4,8,8,3,5
3,6,8,9,4,7


In [18]:
# Keep only the elements that satisfy the condition (> 5); every other
# position becomes NaN, which is why the result is shown as floats.
df1.where(df1 > 5)

Unnamed: 0,0,1,2,3,4
0,,,7.0,,6.0
1,,6.0,,9.0,9.0
2,,8.0,8.0,,
3,6.0,8.0,9.0,,7.0


In [19]:
# Assign -1 to every element that satisfies the condition (value > 5).
# NOTE: this overwrites df1 in place; the original random values are not kept.
df1[df1>5] = -1

In [20]:
# Show df1 again to confirm that every value greater than 5 is now -1.
df1

Unnamed: 0,0,1,2,3,4
0,3,5,-1,3,-1
1,2,-1,2,-1,-1
2,4,-1,-1,3,5
3,-1,-1,-1,4,-1


## 欠損値を含むデータを取り扱う

In [21]:
# Create random data that contains missing values (NaN): draw integers from
# 0-9, then blank out every entry that is not greater than 0.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(8, 7)))
df2 = df2.where(df2 > 0)

In [22]:
# Show the frame; positions that held 0 now appear as NaN.
df2

Unnamed: 0,0,1,2,3,4,5,6
0,9.0,7,3,4,8.0,8.0,9
1,5.0,3,1,9,4.0,6.0,2
2,5.0,3,2,5,8.0,6.0,9
3,7.0,1,9,7,6.0,3.0,6
4,3.0,2,5,3,5.0,6.0,5
5,,2,8,7,1.0,2.0,9
6,8.0,6,5,1,1.0,7.0,1
7,5.0,8,4,2,,,8


In [23]:
# Drop every row that contains at least one NaN (returns a new frame; df2 is unchanged).
df2.dropna(axis='index', how='any')

Unnamed: 0,0,1,2,3,4,5,6
0,9,7,3,4,8,8,9
1,5,3,1,9,4,6,2
2,5,3,2,5,8,6,9
3,7,1,9,7,6,3,6
4,3,2,5,3,5,6,5
6,8,6,5,1,1,7,1


In [24]:
# Drop every column that contains at least one NaN.
df2.dropna(axis='columns')

Unnamed: 0,1,2,3,6
0,7,3,4,9
1,3,1,9,2
2,3,2,5,9
3,1,9,7,6
4,2,5,3,5
5,2,8,7,9
6,6,5,1,1
7,8,4,2,8


In [25]:
# Replace every NaN with a fixed value (-1).
df2.fillna(value=-1)

Unnamed: 0,0,1,2,3,4,5,6
0,9,7,3,4,8,8,9
1,5,3,1,9,4,6,2
2,5,3,2,5,8,6,9
3,7,1,9,7,6,3,6
4,3,2,5,3,5,6,5
5,-1,2,8,7,1,2,9
6,8,6,5,1,1,7,1
7,5,8,4,2,-1,-1,8


In [26]:
# Fill each NaN with the previous valid value in the same column (forward fill).
# ffill() replaces the deprecated fillna(method='pad') spelling.
df2.ffill()

Unnamed: 0,0,1,2,3,4,5,6
0,9,7,3,4,8,8,9
1,5,3,1,9,4,6,2
2,5,3,2,5,8,6,9
3,7,1,9,7,6,3,6
4,3,2,5,3,5,6,5
5,3,2,8,7,1,2,9
6,8,6,5,1,1,7,1
7,5,8,4,2,1,7,8


In [27]:
# Fill each NaN with the next valid value in the same column (backward fill).
# A NaN in the last row has no following value and therefore stays NaN.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df2.bfill()

Unnamed: 0,0,1,2,3,4,5,6
0,9,7,3,4,8.0,8.0,9
1,5,3,1,9,4.0,6.0,2
2,5,3,2,5,8.0,6.0,9
3,7,1,9,7,6.0,3.0,6
4,3,2,5,3,5.0,6.0,5
5,8,2,8,7,1.0,2.0,9
6,8,6,5,1,1.0,7.0,1
7,5,8,4,2,,,8


In [28]:
# Fill each missing value by linear interpolation between the values before
# and after it in the same column.
df2.interpolate()

Unnamed: 0,0,1,2,3,4,5,6
0,9.0,7,3,4,8,8,9
1,5.0,3,1,9,4,6,2
2,5.0,3,2,5,8,6,9
3,7.0,1,9,7,6,3,6
4,3.0,2,5,3,5,6,5
5,5.5,2,8,7,1,2,9
6,8.0,6,5,1,1,7,1
7,5.0,8,4,2,1,7,8


## 重複のあるデータを取り扱う

In [29]:
# 10x4 frame of random 0/1 values; with so few distinct values, duplicated rows are likely.
df3 = pd.DataFrame(np.random.randint(0, 2, size=(10, 4)))

In [30]:
# Show the 0/1 frame used by the duplicate-handling examples.
df3

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,1,0,0,0
2,1,0,0,1
3,1,0,1,0
4,0,1,0,0
5,1,0,0,1
6,1,0,0,1
7,1,0,1,0
8,1,1,1,1
9,0,1,0,0


In [31]:
# Flag duplicated rows: a row is True when an identical row appeared earlier
# (the first occurrence itself is marked False).
df3.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8    False
9     True
dtype: bool

In [32]:
# The columns used for the duplicate check can be restricted; here only column 0 is compared.
df3.duplicated(subset=[0])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [33]:
# Remove duplicated rows, keeping the first occurrence of each distinct row.
df3.drop_duplicates(keep='first')

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,1,0,0,0
2,1,0,0,1
3,1,0,1,0
4,0,1,0,0
8,1,1,1,1


In [34]:
# Remove rows whose value in column 0 has already been seen (first occurrence kept).
df3.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2,3
0,0,1,1,1
1,1,0,0,0


In [35]:
# Remove duplicates judged on column 0 only, keeping the LAST occurrence of
# each value. keep='last' replaces the removed take_last=True keyword.
df3.drop_duplicates(subset=[0], keep='last')

Unnamed: 0,0,1,2,3
8,1,1,1,1
9,0,1,0,0


## 行列演算を行う

In [36]:
# Two random 2x2 integer matrices (entries 0-9) for the matrix-arithmetic examples.
A = pd.DataFrame(np.random.randint(0, 10, size=(2, 2)))
B = pd.DataFrame(np.random.randint(0, 10, size=(2, 2)))

In [37]:
# Show matrix A.
A

Unnamed: 0,0,1
0,2,9
1,8,4


In [38]:
# Show matrix B.
B

Unnamed: 0,0,1
0,3,2
1,2,2


In [39]:
# Transpose of A (rows and columns swapped).
A.transpose()

Unnamed: 0,0,1
0,2,8
1,9,4


In [40]:
# Transpose of B (rows and columns swapped).
B.transpose()

Unnamed: 0,0,1
0,3,2
1,2,2


In [41]:
# Element-wise sum of the two matrices.
A + B

Unnamed: 0,0,1
0,5,11
1,10,6


In [43]:
# Element-wise product of the two matrices (this is NOT the matrix product).
A * B

Unnamed: 0,0,1
0,6,18
1,16,8


In [44]:
# Use DataFrame.dot for the true matrix product. For the product to be taken,
# the column labels of the left operand must match the index labels of the argument.
A.dot(B)

Unnamed: 0,0,1
0,24,22
1,32,24


In [45]:
# Matrix product with the operands swapped; matrix multiplication is not
# commutative, so this generally differs from A.dot(B).
B.dot(A)

Unnamed: 0,0,1
0,22,35
1,20,26


## 簡単な統計量

In [46]:
# 5x10 frame of random integers from 0-9 used for the statistics examples.
df4 = pd.DataFrame(np.random.randint(0, 10, size=(5, 10)))

In [47]:
# Show the data that the summary statistics are computed from.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6,2,6,1,0,6,9,2,6,0
1,7,1,5,1,8,3,0,7,5,0
2,7,2,2,1,1,8,3,6,7,6
3,6,1,8,8,1,0,0,4,5,6
4,4,2,2,6,6,1,1,5,1,7


In [48]:
# Show basic summary statistics for each column
# (count, mean, std, min, quartiles, max).
df4.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,6.0,1.6,4.6,3.4,3.2,3.6,2.6,4.8,4.8,3.8
std,1.224745,0.547723,2.607681,3.361547,3.563706,3.361547,3.781534,1.923538,2.280351,3.49285
min,4.0,1.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0
25%,6.0,1.0,2.0,1.0,1.0,1.0,0.0,4.0,5.0,0.0
50%,6.0,2.0,5.0,1.0,1.0,3.0,1.0,5.0,5.0,6.0
75%,7.0,2.0,6.0,6.0,6.0,6.0,3.0,6.0,6.0,6.0
max,7.0,2.0,8.0,8.0,8.0,8.0,9.0,7.0,7.0,7.0


In [49]:
# Total of each column (sums down the rows).
df4.sum(axis='index')

0    30
1     8
2    23
3    17
4    16
5    18
6    13
7    24
8    24
9    19
dtype: int64

In [50]:
# Mean of each column.
df4.mean(axis='index')

0    6.0
1    1.6
2    4.6
3    3.4
4    3.2
5    3.6
6    2.6
7    4.8
8    4.8
9    3.8
dtype: float64

In [51]:
# Unbiased variance of each column (ddof=1 divides by N-1; this is the pandas default).
df4.var(ddof=1)

0     1.5
1     0.3
2     6.8
3    11.3
4    12.7
5    11.3
6    14.3
7     3.7
8     5.2
9    12.2
dtype: float64

In [52]:
# Biased (population-style) variance of each column: ddof=0 divides by N
# instead of N-1. ddof is an integer count, so pass 0 rather than False.
df4.var(ddof=0)

0     1.20
1     0.24
2     5.44
3     9.04
4    10.16
5     9.04
6    11.44
7     2.96
8     4.16
9     9.76
dtype: float64

In [53]:
# Standard deviation of each column based on the unbiased variance
# (ddof=1 divides by N-1; this is the pandas default).
df4.std(ddof=1)

0    1.224745
1    0.547723
2    2.607681
3    3.361547
4    3.563706
5    3.361547
6    3.781534
7    1.923538
8    2.280351
9    3.492850
dtype: float64

In [54]:
# Standard deviation of each column based on the biased variance: ddof=0
# divides by N. ddof is an integer count, so pass 0 rather than False.
df4.std(ddof=0)

0    1.095445
1    0.489898
2    2.332381
3    3.006659
4    3.187475
5    3.006659
6    3.382307
7    1.720465
8    2.039608
9    3.124100
dtype: float64

In [55]:
# Total of each row (sums across the columns).
df4.sum(axis='columns')

0    38
1    37
2    43
3    39
4    35
dtype: int64

In [56]:
# Mean of each row (averages across the columns).
df4.mean(axis='columns')

0    3.8
1    3.7
2    4.3
3    3.9
4    3.5
dtype: float64

## 行列の正規化（標準化）

正規化 (normalize) とは、異なる基準のデータを一定の基準にしたがって変形し利用しやすくすることです。

In [57]:
# Show df4 again as the reference input for the normalization examples.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6,2,6,1,0,6,9,2,6,0
1,7,1,5,1,8,3,0,7,5,0
2,7,2,2,1,1,8,3,6,7,6
3,6,1,8,8,1,0,0,4,5,6
4,4,2,2,6,6,1,1,5,1,7


In [58]:
# Normalization usually means transforming values so that each column has
# mean 0 and variance (and standard deviation) 1.
# A column with zero spread produces 0/0 = NaN, which is replaced by 0.
((df4 - df4.mean()) / df4.std()).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.730297,0.536875,-0.713957,-0.897942,0.713957,1.692435,-1.455651,0.526235,-1.087937
1,0.816497,-1.095445,0.153393,-0.713957,1.346912,-0.178489,-0.687552,1.143726,0.087706,-1.087937
2,0.816497,0.730297,-0.997054,-0.713957,-0.617335,1.308921,0.105777,0.62385,0.964764,0.629858
3,0.0,-1.095445,1.30384,1.368417,-0.617335,-1.070935,-0.687552,-0.4159,0.087706,0.629858
4,-1.632993,0.730297,-0.997054,0.773453,0.785699,-0.773453,-0.423109,0.103975,-1.66641,0.916157


In [59]:
# Min-max normalization: rescale each column so its maximum becomes 1 and its
# minimum becomes 0. A constant column produces 0/0 = NaN, which is replaced by 0.
((df4 - df4.min()) / (df4.max() - df4.min())).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.666667,1,0.666667,0.0,0.0,0.75,1.0,0.0,0.833333,0.0
1,1.0,0,0.5,0.0,1.0,0.375,0.0,1.0,0.666667,0.0
2,1.0,1,0.0,0.0,0.125,1.0,0.333333,0.8,1.0,0.857143
3,0.666667,0,1.0,1.0,0.125,0.0,0.0,0.4,0.666667,0.857143
4,0.0,1,0.0,0.714286,0.75,0.125,0.111111,0.6,0.0,1.0


In [60]:
# Normalize each column so that its values sum to 1 (divide by the column total).
(df4 / df4.sum()).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.2,0.25,0.26087,0.058824,0.0,0.333333,0.692308,0.083333,0.25,0.0
1,0.233333,0.125,0.217391,0.058824,0.5,0.166667,0.0,0.291667,0.208333,0.0
2,0.233333,0.25,0.086957,0.058824,0.0625,0.444444,0.230769,0.25,0.291667,0.315789
3,0.2,0.125,0.347826,0.470588,0.0625,0.0,0.0,0.166667,0.208333,0.315789
4,0.133333,0.25,0.086957,0.352941,0.375,0.055556,0.076923,0.208333,0.041667,0.368421


## 相関行列

相関行列とは、各変数間の相関係数を並べた行列です。x と y の相関係数は y と x の相関係数に等しいため、相関行列は必ず対称行列になります。

In [66]:
# Start with a random 5x10 matrix of floats drawn uniformly from [0, 1).
df5 = pd.DataFrame(np.random.random_sample(size=(5, 10)))

In [67]:
# Show the random matrix that the correlation examples use.
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.324367,0.295487,0.789002,0.670284,0.186813,0.687147,0.357131,0.517505,0.00766,0.408408
1,0.875114,0.881997,0.78892,0.463239,0.279901,0.386093,0.935882,0.199737,0.182531,0.518496
2,0.802815,0.040331,0.702398,0.332371,0.229991,0.000815,0.808325,0.871482,0.152839,0.830445
3,0.371105,0.667306,0.009406,0.724364,0.264124,0.185885,0.011729,0.869214,0.464575,0.781362
4,0.080434,0.906499,0.961661,0.959128,0.567927,0.499556,0.567537,0.78555,0.774333,0.939279


In [70]:
# Correlation matrix between rows: np.corrcoef treats each row of its input
# as one variable, so the 5-row frame gives a 5x5 matrix.
# .values replaces the removed DataFrame.as_matrix(); np.corrcoef accepts the
# array directly, so no list conversion is needed.
pd.DataFrame(np.corrcoef(df5.dropna().values))

Unnamed: 0,0,1,2,3,4
0,1.0,0.164606,0.188667,-0.117389,0.252974
1,0.164606,1.0,0.283199,-0.380199,-0.18794
2,0.188667,0.283199,1.0,0.043817,-0.102089
3,-0.117389,-0.380199,0.043817,1.0,0.37794
4,0.252974,-0.18794,-0.102089,0.37794,1.0


In [69]:
# Correlation matrix between columns: transpose first so that each original
# column becomes a row (one variable) for np.corrcoef, giving a 10x10 matrix.
# .values replaces the removed DataFrame.as_matrix(); np.corrcoef accepts the
# array directly, so no list conversion is needed.
pd.DataFrame(np.corrcoef(df5.dropna().T.values))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,-0.291882,-0.009232,-0.947088,-0.571585,-0.534347,0.640535,-0.418724,-0.624041,-0.293979
1,-0.291882,1.0,0.03557,0.583042,0.650636,0.340431,-0.019296,-0.351663,0.644769,0.12569
2,-0.009232,0.03557,1.0,0.024951,0.377078,0.467597,0.71626,-0.390279,-0.049337,-0.11036
3,-0.947088,0.583042,0.024951,1.0,0.713652,0.559717,-0.543443,0.245966,0.75509,0.3058
4,-0.571585,0.650636,0.377078,0.713652,1.0,0.194943,0.091842,0.211824,0.905057,0.660466
5,-0.534347,0.340431,0.467597,0.559717,0.194943,1.0,-0.127492,-0.496641,-0.023427,-0.536414
6,0.640535,-0.019296,0.71626,-0.543443,0.091842,-0.127492,1.0,-0.484546,-0.2364,-0.060959
7,-0.418724,-0.351663,-0.390279,0.245966,0.211824,-0.496641,-0.484546,1.0,0.45155,0.772215
8,-0.624041,0.644769,-0.049337,0.75509,0.905057,-0.023427,-0.2364,0.45155,1.0,0.789639
9,-0.293979,0.12569,-0.11036,0.3058,0.660466,-0.536414,-0.060959,0.772215,0.789639,1.0
