### 【問題1】train_test_split のスクラッチ

In [3]:
# サンプル
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import pandas as pd

iris_dataset = load_iris()
xx = pd.DataFrame(iris_dataset.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
y = pd.Series(iris_dataset.target, name="y")
df_train = x.join(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=0)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
61,5.9,3.0,4.2,1.5
92,5.8,2.6,4.0,1.2
112,6.8,3.0,5.5,2.1
2,4.7,3.2,1.3,0.2
141,6.9,3.1,5.1,2.3
...,...,...,...,...
9,4.9,3.1,1.5,0.1
103,6.3,2.9,5.6,1.8
67,5.8,2.7,4.1,1.0
117,7.7,3.8,6.7,2.2


In [67]:
import random

def scratch_train_test_split(x, y, train_size=0.8, shuffle=True):
    """
    検証データを分割する。
    Parameters
    ----------
    X : ndarray
      訓練データ (n_samples, n_features)
    y : ndarray
      正解値 (n_samples,)
    train_size : float
      何割をtrainとするか指定 (0 < train_size < 1)
    shuffle: bool
      結果をシャッフルして返すか
    Returns
    -------
    x_train : ndarray
      訓練データ (n_samples, n_features)
    x_test : ndarray
      検証データ (n_samples, n_features)
    y_train : ndarray
      訓練データの正解値 (n_samples,)
    y_test : ndarray
      検証データの正解値 (n_samples,)
    """
    if len(x) != len(y):
        raise
    if train_size < 0.0 or 1.0 < train_size:
        raise
        
    # シャッフル
    if shuffle:
        random.shuffle(x)
        random.shuffle(y)

    tgt_idx = int(len(x) * train_size) # TODO 切り捨て
    x_train = x[:tgt_idx]
    y_train = y[:tgt_idx]
    x_test = x[tgt_idx:]
    y_test = y[tgt_idx:]
    
    return x_train, x_test, y_train, y_test

x_train2, x_test2, y_train2, y_test2 = scratch_train_test_split(x.values, y.values, train_size=0.75)
display(x_train2)

array([[1710, 2003],
       [1262, 1976],
       [1262, 1976],
       ...,
       [1040, 1950],
       [1040, 1950],
       [1412, 1988]])

### 【問題2】 分類問題を解くコードの作成

In [None]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

iris_dataset = load_iris()
x = pd.DataFrame(iris_dataset.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
y = pd.Series(iris_dataset.target, name="y")
df_train = x.join(y)

x_train, x_test, y_train, y_test = scratch_train_test_split(x.values, y.values, train_size=0.75, shuffle=True)

knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(x_train, y_train) # 学習
knc_p = knc.predict(x_test) # 学習結果を元に予測

display(accuracy_score(y_test, knc_p))

In [75]:
import numpy as np
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
x = np.concatenate([f0, f1])
y = np.concatenate([
    np.full(n_samples // 2, 1),
    np.full(n_samples // 2, -1)
])

x_train, x_test, y_train, y_test = scratch_train_test_split(x, y, train_size=0.75, shuffle=True)

knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(x_train, y_train) # 学習
knc_p = knc.predict(x_test) # 学習結果を元に予測

display(accuracy_score(y_test, knc_p))

0.512

In [76]:
x = np.array([
    [-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
    [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
    [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
    [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
    [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
    [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
    [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
    [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
    [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
    [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
    [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
    [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
    [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
    [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
    [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
    [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
    [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
    [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
    [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
    [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ],
])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

x_train, x_test, y_train, y_test = scratch_train_test_split(x, y, train_size=0.75, shuffle=True)

knc = KNeighborsClassifier(n_neighbors=5)
knc.fit(x_train, y_train) # 学習
knc_p = knc.predict(x_test) # 学習結果を元に予測

display(accuracy_score(y_test, knc_p))

0.3

### 【問題3】 回帰問題を解くコードの作成

In [68]:
df_base = pd.read_csv("../pre_learning/week3/house-prices-advanced-regression-techniques/train.csv")
df = df_base.loc[:, ["GrLivArea", "YearBuilt", "SalePrice"]]
feature_names = ["GrLivArea", "YearBuilt"]
y_name = "SalePrice"
x = df_base.loc[:, feature_names]
y = df_base[y_name]

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

x_train, x_test, y_train, y_test = scratch_train_test_split(x.values, y.values, train_size=0.75, shuffle=True)

# 学習
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

# 評価: MSE
lr_mse = mean_absolute_error(y_test, y_pred)
display(lr_mse)

61029.00153250159