In [1]:
# Run the hello.py script from inside the notebook via IPython's %run magic.
%run hello.py

In [2]:
# Run prog.py with -h; presumably this requests its command-line help (no output is recorded for this cell).
%run prog.py -h

In [3]:
# Run prog.py with four integers and no flag; the recorded output is 4 (looks like the maximum — confirm against prog.py).
%run prog.py 1 2 3 4

4


In [4]:
# Same four integers plus --sum; the recorded output is 10, i.e. 1+2+3+4.
%run prog.py 1 2 3 4 --sum

10


In [5]:
# Three integers with --sum; the recorded output is 8, i.e. 1+2+5.
%run prog.py 1 2 5 --sum

8


In [6]:
# Run hello_argparse.py with no arguments, so whatever defaults the script defines are used.
%run hello_argparse.py

Hello, World!
RESULT : 102.4


In [7]:
# Override the script's options from the command line; with --display the recorded output also lists intermediate values
# (presumably one per iteration — confirm against hello_argparse.py).
%run hello_argparse.py  --display --alpha 0.3 --text "Hello, argparse!" --num_iters 5

Hello, argparse!
0.6
1.2
2.4
4.8
9.6
RESULT : 9.6


In [8]:
# Pass a non-ASCII (Japanese) --text value; without --display only the text and the final RESULT line are recorded.
%run hello_argparse.py  --alpha 1.0 --text "引数面白い" --num_iters 3

引数面白い
RESULT : 8.0


In [9]:
# From the official argparse documentation: https://docs.python.jp/3/library/argparse.html
# 'store_true' and 'store_false' are special cases of 'store_const' that store
# True and False respectively; their default values are False and True respectively.
# When the flag is given on the command line, store_true yields True and
# store_false yields False.
# When the flag is omitted, the default is the opposite value.
#
# argparse is imported explicitly: no visible earlier cell imports it, so without
# this line the cell depends on the name happening to be in the kernel already.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--foo', action='store_false')
parser.add_argument('--bar', action='store_false')
parser.add_argument('--baz', action='store_true')
parser.parse_args('--foo --bar'.split())

Namespace(bar=False, baz=False, foo=False)

## 【問題1】train_test_splitのスクラッチ
scikit-learnのtrain_test_splitを自作する

In [10]:
import numpy as np

# Toy data: 5 samples x 2 features, plus one integer label per sample.
X = np.arange(10).reshape(5, 2)
y = range(5)
print(X, y, sep="\n")
len(y)

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
range(0, 5)


5

### スクラッチ実装ファイルを呼び出し実行する


In [11]:
# Quick refresher on how to use ndarray (here: the low-level np.ndarray constructor).
# NOTE(review): no buffer is passed, so the displayed values are presumably uninitialised memory — confirm against the NumPy docs.
np.ndarray(shape=(1,1), dtype=float, order='F')

array([[0.]])

In [12]:
# Build a 2-element int array on top of an existing array's buffer, starting one item in.
np.ndarray((2,), buffer=np.array([1,2,3]),
           offset=np.int_().itemsize,
           dtype=int) # offset = 1*itemsize, i.e. skip first element

array([2, 3])

In [13]:
# Convert a flat Python list into a 1-D array and display it.
data = [1, 2, 3, 10]
array = np.asarray(data)
array

array([ 1,  2,  3, 10])

In [14]:
# A nested list (2 rows x 3 columns) becomes a 2-D array.
data2 = [
    [1, 2, 3],
    [4, 5, 6],
]
array2 = np.asarray(data2)
array2

array([[1, 2, 3],
       [4, 5, 6]])

In [15]:
# Number of dimensions (axes) of array2.
array2.ndim

2

In [16]:
# Size of array2 along each axis, as a tuple.
array2.shape

(2, 3)

In [17]:
# Show both inputs, then preview X with y inserted as a new column at index 2.
print(X, y, sep="\n")
np.insert(X, obj=2, values=y, axis=1)

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
range(0, 5)


array([[0, 1, 0],
       [2, 3, 1],
       [4, 5, 2],
       [6, 7, 3],
       [8, 9, 4]])

In [18]:
# Turn the labels into a column vector of shape (n_samples, 1) so they can be
# stacked next to X. Passing -1 lets NumPy infer the row count instead of
# hard-coding 5, so the cell keeps working if the sample count changes.
y = np.reshape(y, (-1, 1))
y

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [19]:
# Glue the label column onto the right-hand side of the feature columns.
Xy = np.hstack([X, y])
Xy

array([[0, 1, 0],
       [2, 3, 1],
       [4, 5, 2],
       [6, 7, 3],
       [8, 9, 4]])

In [20]:
# Same result via concatenate: join X and y along the column axis.
Xy = np.concatenate([X, y], axis=1)
Xy

array([[0, 1, 0],
       [2, 3, 1],
       [4, 5, 2],
       [6, 7, 3],
       [8, 9, 4]])

In [21]:
# Shuffle the rows of Xy in place; because features and label share a row, each sample keeps its label.
# NOTE(review): no seed is set before this call in the visible cells, so the row order will presumably differ between runs.
np.random.shuffle(Xy)
Xy

array([[6, 7, 3],
       [8, 9, 4],
       [0, 1, 0],
       [2, 3, 1],
       [4, 5, 2]])

In [22]:
# Index of the last column of Xy (the column the labels were appended as).
Xy.shape[1]-1

2

In [23]:
# Split the shuffled matrix back into features and labels. The split point is
# derived from the column count (the labels are the last column) rather than
# hard-coded as 2, so it stays correct if the number of features changes.
X, y = np.hsplit(Xy, [Xy.shape[1] - 1])

In [24]:
# Show the separated features and labels, one after the other.
print(X, y, sep="\n")

[[6 7]
 [8 9]
 [0 1]
 [2 3]
 [4 5]]
[[3]
 [4]
 [0]
 [1]
 [2]]


In [25]:
# Number of rows (samples) in X.
len(X)

5

In [26]:
# Cut the rows at the 80% mark: everything before it is training data,
# the remainder is test data.
X_train, X_test = np.split(X, [int(0.8 * len(X))], axis=0)
print(X_train, X_test, sep="\n")

[[6 7]
 [8 9]
 [0 1]
 [2 3]]
[[4 5]]


In [27]:
# Split the labels at the same 80% boundary that is used for X instead of the
# hard-coded row index 4, so features and labels stay aligned for any sample count.
y_train, y_test = np.vsplit(y, [int(len(y)*0.8)])
print(y_train)
print(y_test)

[[3]
 [4]
 [0]
 [1]]
[[2]]


In [28]:
from ml_scratch.utils.split import train_test_split

In [29]:
# Call train_test_split (imported from ml_scratch.utils.split in the preceding cell) with an 80% training share.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [34]:
# Display all four pieces of the split together as one tuple.
X_train, y_train, X_test, y_test

(array([[2, 3],
        [4, 5],
        [8, 9],
        [0, 1]]), array([[1],
        [2],
        [4],
        [0]]), array([[6, 7]]), array([[3]]))

### scikit-learn の当該メソッドを実行する
同一の仕様になるように実装するため、確認用

In [35]:
# Reference run with scikit-learn's implementation, for comparison with the scratch version.
# NOTE(review): this import rebinds the name train_test_split, so every later cell gets the
# sklearn function instead of the one imported from ml_scratch.utils.split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8)
X_train, y_train, X_test, y_test

(array([[6, 7],
        [2, 3],
        [8, 9],
        [0, 1]]), array([[3],
        [1],
        [4],
        [0]]), array([[4, 5]]), array([[2]]))

In [32]:
# Split a single array with shuffling disabled and no explicit size.
# NOTE(review): this cell's execution count (32) is lower than that of the sklearn import cell (35), so it is
# unclear which train_test_split produced the recorded output — confirm by re-running top to bottom.
train_test_split(y, shuffle=False)

[array([[3],
        [4],
        [0]]), array([[1],
        [2]])]

## 【問題2】分類パイプラインの作成
pyファイルで実行できる分類のパイプラインを作成
* ロジスティック回帰
* SVM
* 決定木

**アヤメデータセット**  
versicolorとvirginicaのみ

In [45]:
import pandas as pd
from sklearn.datasets import load_iris

# Load iris into a DataFrame: one column per feature, plus a 'target' column
# holding the species name looked up from the integer class codes.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target_names[iris.target]
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [47]:
# Keep only the rows whose species is not setosa.
df = df.loc[df["target"] != "setosa"]

**シンプルデータセット1**  
X, y

In [37]:
import numpy as np

# Simple dataset 1: two Gaussian blobs that share one covariance matrix.
# Samples from the first blob are labelled +1, samples from the second -1.
# The global seed is fixed so the generated data is reproducible.
np.random.seed(seed=0)
n_samples = 500
n_half = n_samples // 2
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]

# f0 / f1 are rebound here from the blob means to the sampled points.
f0 = np.random.multivariate_normal(f0, cov, n_half)
f1 = np.random.multivariate_normal(f1, cov, n_half)

X = np.concatenate((f0, f1))
# The np.int alias was deprecated in NumPy 1.20 and later removed, so using it
# raises AttributeError on current NumPy. The builtin int is the documented
# replacement and gives the same default integer dtype.
y = np.concatenate((np.ones(n_half), -np.ones(n_half))).astype(int)

# Shuffle samples and labels with one shared permutation so they stay aligned.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

**シンプルデータセット2**  
X, y

In [38]:
# Simple dataset 2: a hand-written table of 2-feature samples (X) with binary 0/1 labels (y).
# The labels are listed as a run of 0s followed by a run of 1s, i.e. the rows are grouped by class.
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## パイプライン作成
1. RandomForestの特徴重要度を使って特徴選択
1. PCAで累積寄与率pに次元削減
1. 任意のモデルで分類
1. グリッドサーチでパラメタチューニング
1. 交差検証して性能評価

In [48]:
# Step 1: feature selection driven by a random forest's feature importances.
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_selection import SelectFromModel

# The forest is only the importance estimator; SelectFromModel wraps it as the selector.
rfc = RFC(n_jobs=-1, n_estimators=100)
fs = SelectFromModel(estimator=rfc)

In [49]:
# Step 2: reduce dimensionality with PCA down to a cumulative explained-variance ratio p.
# The PCA is created with defaults here; n_components is supplied later through the grid-search parameters.
from sklearn.decomposition import PCA

pca = PCA()

In [50]:
# Step 3: classify with a model of choice — here an SVC created with default settings.
from sklearn.svm import SVC

svm = SVC()

In [51]:
# Chain the three steps (feature selection -> PCA -> SVM) into one estimator.
from sklearn.pipeline import Pipeline

# Pipeline expects a concrete list of (name, estimator) tuples. A bare zip() is
# a one-shot iterator in Python 3, which cannot be indexed or iterated more than
# once, so the steps are written out as a list instead.
estimators = [
    ("feature_selection", fs),
    ("pca", pca),
    ("svm", svm),
]
pl = Pipeline(estimators)

In [52]:
# Hyper-parameter tuning over every pipeline step.
from sklearn.model_selection import GridSearchCV

# Each key is "<pipeline step name>__<parameter name>", paired with the
# candidate values to try for that parameter.
parameters = dict(
    feature_selection__threshold=["mean", "median"],
    pca__n_components=[0.8, 0.5],
    svm__gamma=[0.001, 0.01, 0.05],
    svm__C=[1, 10],
)

clf = GridSearchCV(estimator=pl, param_grid=parameters)