# Sprint 機械学習スクラッチ入門

テキスト全体の意図は「今後の機械学習スクラッチ課題で作成するモデルを、scikit-learnを用いて一度動かしておきます。これまでの復習を兼ねたスクラッチ課題の準備です。」です。（以下の画像の箇所です）
今回の問題2は「scikit-learnを使ってロジスティック回帰、SVM、決定木のコードを作成する」という意味で、
問題3も「scikit-learnを使って線形回帰のコードを作成する」という意味です。
これ以降のSprintで、3種類の分類モデルと1種類の回帰モデルをスクラッチするため、それらと対比するためここでは機械学習ライブラリscikit-learnで実装していただく、というのが今回のSprintの狙いです。
問題2と問題3を局所的に読むと「スクラッチで実装するのかscikit-learnを使って実装するのか」わかりかねると思いました。

## 【問題1】train_test_splitのスクラッチ


スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。


[sklearn.model_selection.train_test_split — scikit-learn 0.21.3 documentation](https://scikit-learn.org/0.21/modules/generated/sklearn.model_selection.train_test_split.html)


なお、作成した関数がscikit-learnのtrain_test_splitと同じ動作をしているか必ず確認をするようにしましょう。



In [399]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [457]:
#自分で作ったがうまく動かなかった（Xとyを別々にpermutationしているため、特徴量とラベルの対応が崩れてしまう）

# def scratch_train_test_split(X, y, train_size=0.8, random_state = 0):
#     rng = np.random.RandomState(seed =random_state)
#     Xrng = rng.permutation(X)
#     yrng = rng.permutation(y)
    
#     a = round(Xrng.shape[0]*train_size)
#     X_train = Xrng[:a]
#     X_test = Xrng[a:]
    
#     b = round(yrng.shape[0]*train_size)
#     y_train = yrng[:b]
#     y_test = yrng[b:]
    
#     return X_train, X_test, y_train, y_test

In [452]:
#参考URL: https://qiita.com/kztsh/items/5afae48b2c2fd7dc20a5

def scratch_train_test_split(X, y, test_size=0.3, random_state=0):
    """Shuffle X and y with one shared permutation and split them in two.

    The test set is sized first as ``ceil(test_size * n_samples)`` and the
    training set receives the remaining rows, which is how scikit-learn's
    ``train_test_split`` sizes a fractional ``test_size``. Sizing the
    training set from ``floor((1 - test_size) * n_samples)`` instead is
    fragile: ``1 - 0.9`` is ``0.09999999999999998`` in floating point, so
    ten samples would produce an empty training set.

    Parameters
    ----------
    X : ndarray
        Samples; the first axis indexes samples.
    y : ndarray
        Targets with the same number of entries along the first axis as X.
    test_size : float, default 0.3
        Fraction of the samples placed in the test set, between 0 and 1.
    random_state : int or None, default 0
        Seed handed to ``np.random.RandomState`` for the shuffle.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Row subsets of X and y selected with the same shuffled indices.

    Raises
    ------
    ValueError
        If X and y differ in length or test_size lies outside [0, 1].
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            'X and y must have the same number of samples: {} != {}'.format(
                n_samples, len(y)))
    if not 0 <= test_size <= 1:
        raise ValueError(
            'test_size must be between 0 and 1, got {}'.format(test_size))

    # Size the test set directly and give the remainder to training, so the
    # two parts always add up to n_samples.
    n_test = int(np.ceil(test_size * n_samples))

    rng = np.random.RandomState(seed=random_state)
    # A shuffled copy of the row indices 0 .. n_samples - 1.
    permutation = rng.permutation(n_samples)

    # The first n_test shuffled indices form the test set, the rest training.
    ind_test = permutation[:n_test]
    ind_train = permutation[n_test:]

    # Index X and y with the same index arrays so rows and targets stay paired.
    X_train = X[ind_train]
    X_test = X[ind_test]
    y_train = y[ind_train]
    y_test = y[ind_test]

    return X_train, X_test, y_train, y_test

## 【問題2】 分類問題を解くコードの作成
ロジスティック回帰・SVM・決定木の3種類の手法で、3種類のデータセットを学習・推定するコードを作成してください。

## ロジスティック回帰

In [471]:
from sklearn.datasets import load_iris
# Load the iris data set that ships with scikit-learn.
iris = load_iris()

In [472]:
# Keep the rows from index 50 onward and the first three feature columns.
# The target is sliced with the same row range so X and y stay aligned.
X = iris.data[50:, :3]
y = iris.target[50:]

In [473]:
# Hold out 30% of the rows as a test set with the scratch splitter (seed 0).
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, test_size =0.3, random_state=0)

In [474]:
from sklearn.preprocessing import StandardScaler

In [475]:
# Fit the scaler on the training split only and standardize that split.
sc = StandardScaler()
X_std = sc.fit_transform(X_train)

# Apply the already-fitted scaler to the test split (transform only, no refit).
X_test_std = sc.transform(X_test)

In [476]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
# sgdc = make_pipeline(StandardScaler(),SGDClassifier(loss ='log'))
# sgdc.fit(X_train, y_train)
# Y_pred = sgdc.predict(X_test)

In [477]:
# SGD-trained linear classifier with the 'log' loss, fitted on the
# standardized training features and evaluated on the standardized test split.
# NOTE(review): no random_state is passed, so scores may differ between runs.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before upgrading.
sgdc = SGDClassifier(loss='log')
sgdc.fit(X_std, y_train)
Y_pred = sgdc.predict(X_test_std)

In [478]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


# Report the classification metrics for the held-out split, one per line.
print(
    'accuracy = {}'.format(accuracy_score(y_test, Y_pred)),
    'precision = {}'.format(precision_score(y_test, Y_pred)),
    'recall = {}'.format(recall_score(y_test, Y_pred)),
    'f1 score = {}'.format(f1_score(y_test, Y_pred)),
    'confusion matrix = \n{}'.format(confusion_matrix(y_test, Y_pred)),
    sep='\n',
)

accuracy = 0.9
precision = 0.9285714285714286
recall = 0.8666666666666667
f1 score = 0.896551724137931
confusion matrix = 
[[13  2]
 [ 1 14]]


## SVM

In [479]:
import numpy as np

# Synthetic two-class data: two Gaussian clouds that share one covariance
# matrix, labelled +1 and -1, then shuffled so the classes are interleaved.
np.random.seed(seed=0)
n_samples = 500
n_per_class = n_samples // 2
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]
# f0 / f1 start as the class means and are rebound to the drawn samples.
f0 = np.random.multivariate_normal(f0, cov, n_per_class)
f1 = np.random.multivariate_normal(f1, cov, n_per_class)
X = np.concatenate((f0, f1))
# Use the builtin int: the np.int alias is absent from recent NumPy releases,
# and where it still exists it refers to the same builtin type.
y = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)
# One shared permutation keeps each sample paired with its label.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [480]:
# Hold out 30% of the synthetic samples as a test set (scratch splitter, seed 0).
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, test_size =0.3, random_state=0)

In [481]:
from sklearn.svm import SVC

# Standardization is a pipeline step, so it is fitted on X_train inside fit()
# and re-applied to X_test inside predict(); no separate scaling is needed.
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm.fit(X_train, y_train)
Y_pred = svm.predict(X_test)

In [482]:
# Report the classification metrics for the held-out split, one per line.
print(
    'accuracy = {}'.format(accuracy_score(y_test, Y_pred)),
    'precision = {}'.format(precision_score(y_test, Y_pred)),
    'recall = {}'.format(recall_score(y_test, Y_pred)),
    'f1 score = {}'.format(f1_score(y_test, Y_pred)),
    'confusion matrix = \n{}'.format(confusion_matrix(y_test, Y_pred)),
    sep='\n',
)

accuracy = 1.0
precision = 1.0
recall = 1.0
f1 score = 1.0
confusion matrix = 
[[65  0]
 [ 0 85]]


## 決定木

In [492]:
# Hard-coded toy data for the decision tree: two-feature samples (two per
# source line) with binary labels, all the 0 labels first and then the 1s.
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [493]:
# Hold out 30% of the toy samples as a test set (scratch splitter, seed 0).
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, test_size =0.3, random_state=0)

In [494]:
from sklearn.tree import DecisionTreeClassifier

In [495]:
# sc = StandardScaler()
# sc.fit(X_train)
# X_std =sc.transform(X_train)

# #検証用のデータにtransformを行う
# X_test_std = sc.transform(X_test)

In [496]:
# Seed the tree so the fitted model, and the metrics printed from it, are
# reproducible. The seeded instance used to be overwritten by an unseeded
# one before fitting, which made the first assignment dead code.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
Y_pred = clf.predict(X_test)

In [497]:
# Report the classification metrics for the held-out split, one per line.
print(
    'accuracy = {}'.format(accuracy_score(y_test, Y_pred)),
    'precision = {}'.format(precision_score(y_test, Y_pred)),
    'recall = {}'.format(recall_score(y_test, Y_pred)),
    'f1 score = {}'.format(f1_score(y_test, Y_pred)),
    'confusion matrix = \n{}'.format(confusion_matrix(y_test, Y_pred)),
    sep='\n',
)

accuracy = 0.9166666666666666
precision = 0.875
recall = 1.0
f1 score = 0.9333333333333333
confusion matrix = 
[[4 1]
 [0 7]]


## 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [498]:
# Read the House Prices training table from train.csv in the working directory.
df_train = pd.read_csv('train.csv')

In [499]:
# Keep two explanatory columns plus SalePrice, which is used as the target below.
df = df_train.loc[:, ['GrLivArea','YearBuilt', 'SalePrice']]

In [500]:
# Natural log of every selected column, features and target alike, so the
# regression errors reported later are in log units rather than prices.
df1 = np.log(df)

## 線形回帰

In [501]:
# Columns 0-1 are the features and column 2 the target (kept two-dimensional);
# test_size is left at the splitter's default.
X_train, X_test, y_train, y_test = scratch_train_test_split(
    np.array(df1.iloc[:, [0, 1]]),
    np.array(df1.iloc[:, [2]]),
    random_state=0,
)

In [510]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression()

# Fit the linear regression on the training split.
lr.fit(X_train, y_train)

# Predict the target for the test split.
Y_pred = lr.predict(X_test)

In [511]:
#https://pythondatascience.plavox.info/scikit-learn/%E5%9B%9E%E5%B8%B0%E3%83%A2%E3%83%87%E3%83%AB%E3%81%AE%E8%A9%95%E4%BE%A1

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Report MAE, MSE, RMSE and R2 for the test-split predictions, one per line.
print(
    '平均絶対誤差(MAE) ={}'.format(mean_absolute_error(y_test, Y_pred)),
    '平均二乗誤差(MSE) = {}'.format(mean_squared_error(y_test, Y_pred)),
    '二乗平均平方根誤差 (RMSE) ={}'.format(np.sqrt(mean_squared_error(y_test, Y_pred))),
    '決定係数 (R2) ={}'.format(r2_score(y_test, Y_pred)),
    sep='\n',
)

平均絶対誤差(MAE) =0.15353180375096603
平均二乗誤差(MSE) = 0.044392523340521815
二乗平均平方根誤差 (RMSE) =0.21069533298229892
決定係数 (R2) =0.7123462478163527
