# 【問題1】train_test_splitのスクラッチ

In [1]:
import random
import math
import numpy as np

In [2]:
#作成した関数
def scratch_train_test_split(X, y, train_size=0.8, random_state=None, shuffle=True, stratify=None):

    #seedを固定
    np.random.seed(random_state)
    random.seed(random_state)
    
    #shuffleがTrueのときは各要素をシャッフル
    if shuffle is True:
        np.random.shuffle(X) 
        
        np.random.seed(random_state)   
        np.random.shuffle(y)
    else:
        pass
    
    np.random.seed(random_state)
    #指定されたデータ要素数を均等に
    if stratify is None:
        X_train = random.choices(X,k=math.ceil(len(X)*(train_size)))
        X_test = random.choices(X,k=math.ceil(len(X)*(1-train_size)))
        y_train =  random.choices(y,k=math.ceil(len(y)*(train_size)))
        y_test = random.choices(y,k=math.ceil(len(y)*(1-train_size)))
    
    elif stratify is X:
        w = np.ones(len(X)).tolist()
        X_train = random.choices(X,k=math.ceil(len(X)*(train_size)),weights=w) 
        X_test = random.choices(X,k=math.ceil(len(X)*(1-train_size)),weights=w)
        y_train =  random.choices(y,k=math.ceil(len(y)*(train_size)))
        y_test = random.choices(y,k=math.ceil(len(y)*(1-train_size)))
        
    elif stratify is y:
        w = np.ones(len(y)).tolist()
        X_train = random.choices(X,k=math.ceil(len(X)*(train_size)))
        X_test = random.choices(X,k=math.ceil(len(X)*(1-train_size)))
        y_train =  random.choices(y,k=math.ceil(len(y)*(train_size)),weights=w)
        y_test = random.choices(y,k=math.ceil(len(y)*(1-train_size)),weights=w)
    
    #random_choiceはリストで返すため、再度ndarray化
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [3]:
import numpy as np
#アイリスデータを使ってテストする
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

#作成した関数実行
X_train,X_test,y_train,y_test = scratch_train_test_split(X,y,random_state=42,stratify=y)

#結果
print("dataset:{}".format(len(X)))
print('X_train:{}'.format(len(X_train)))
print('y_test:{}'.format(len(y_test)))
print(type(y_train))

print(y_train)
np.unique(y_train,return_counts=True)

dataset:150
X_train:120
y_test:30
<class 'numpy.ndarray'>
[2 0 0 1 1 0 1 0 0 0 2 2 0 1 0 2 2 0 1 2 1 0 2 2 0 0 0 1 0 2 1 0 0 0 2 1 1
 0 0 2 0 1 0 1 1 1 0 2 2 1 1 1 2 2 2 1 2 1 0 0 0 2 0 0 2 2 1 2 2 0 1 2 0 1
 1 0 1 2 2 0 0 2 1 0 2 1 2 1 1 1 0 0 0 1 0 2 2 1 0 0 2 1 2 0 2 2 2 0 0 2 0
 2 1 2 1 0 2 1 2 0]


(array([0, 1, 2]), array([45, 35, 40]))

# 【問題2】 分類問題を解くコードの作成
## ①irisデータセット

In [5]:
#データセット
from sklearn.datasets import load_iris
dataset = load_iris()
import pandas as pd
x_df = pd.DataFrame(dataset.data,columns = ["sepal_length","sepal_width","petal_length","petal_width"])
y_df = pd.DataFrame(dataset.target,columns=["species"])
df = pd.concat([x_df,y_df],axis=1)
df = df[df["species"].isin([1,2])].reset_index(drop=True)

#特徴量と目的変数を分ける
X = df.iloc[:,0:4].values
y = df["species"].values

#訓練・検証データに分割
X_train,X_test,y_train,y_test = scratch_train_test_split(X,y,random_state=0,stratify=y)



#インスタンス作成
from sklearn import linear_model
SDG = linear_model.SGDClassifier(loss="log")
from sklearn.svm import SVC
SVC = SVC(kernel='linear')
from sklearn import tree
DTC = tree.DecisionTreeClassifier()

#インスタンスをリストに格納
way_lst = [SDG,SVC,DTC]

#ロジスティック・SVM・決定木の正解率（accurary_score）を算出
from sklearn.metrics import accuracy_score
for way in way_lst:
    way.fit(X_train, y_train)#学習
    way_pred = way.predict(X_test) #推定 
    print("正解率は{:.2f}".format(accuracy_score(y_test,way_pred)))#評価
    print(way_pred)

正解率は0.60
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
正解率は0.55
[2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1]
正解率は0.60
[2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 2 2 1 1]


## ②シンプルデータセット１

In [7]:
#データセット
import numpy as np
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))
X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

#訓練・検証データに分割
X_train,X_test,y_train,y_test = scratch_train_test_split(X,y,random_state=0,stratify=y)

#インスタンス化
from sklearn import linear_model
SDG = linear_model.SGDClassifier(loss="log")
from sklearn.svm import SVC
SVC = SVC(kernel='linear')
from sklearn import tree
DTC = tree.DecisionTreeClassifier()

way_lst = [SDG,SVC,DTC]

#ロジスティック・SVM・決定木の正解率（accurary_score）を算出
from sklearn.metrics import accuracy_score

for way in way_lst:
    way.fit(X_train, y_train)#学習
    way_pred = way.predict(X_test) #推定 
    print("正解率は{:.2f}".format(accuracy_score(y_test,way_pred)))#評価
    print(way_pred)

正解率は0.52
[ 1 -1 -1  1 -1  1 -1 -1 -1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1
 -1 -1  1 -1  1 -1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1 -1  1  1  1 -1 -1 -1 -1  1  1
 -1 -1 -1 -1]
正解率は0.50
[-1  1 -1 -1 -1 -1  1 -1 -1  1 -1 -1  1 -1 -1  1  1  1  1 -1  1  1 -1 -1
 -1 -1  1 -1 -1  1 -1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1  1  1
 -1  1 -1 -1 -1 -1 -1  1 -1  1  1 -1  1 -1 -1 -1  1 -1  1 -1 -1  1 -1  1
 -1  1 -1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1 -1  1  1 -1  1 -1  1  1 -1 -1
  1  1 -1 -1]
正解率は0.51
[ 1 -1  1  1 -1 -1 -1  1 -1  1 -1 -1  1  1 -1  1  1  1  1 -1 -1  1  1  1
 -1  1  1 -1  1  1  1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1  1 -1 -1 -1 -1  1  1  1  1 -1 -1 -1 -1  1  1 -1 -1 -1  1 -1
  1  1 -1  1  1 -1  1 -1 -1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  1  1  1 -1]


## ③シンプルデータセット２

In [12]:
#データセット
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#訓練・検証データに分割
X_train,X_test,y_train,y_test = scratch_train_test_split(X,y,random_state=0,stratify=y)

#インスタンス化
from sklearn import linear_model
SDG = linear_model.SGDClassifier(loss="log")
from sklearn.svm import SVC
SVC = SVC(kernel='linear')
from sklearn import tree
DTC = tree.DecisionTreeClassifier()

way_lst = [SDG,SVC,DTC]
print("＝＝＝＝正解ラベル＝＝＝＝")
print(y_test)

#ロジスティック・SVM・決定木の正解率（accurary_score）を算出
from sklearn.metrics import accuracy_score

for way in way_lst:
    way.fit(X_train, y_train)#学習
    way_pred = way.predict(X_test) #推定 
    print("正解率は{:.2f}".format(accuracy_score(y_test,way_pred)))#評価
    print(way_pred)

＝＝＝＝正解ラベル＝＝＝＝
[1 1 0 0 1 0 0 1]
正解率は0.62
[1 0 1 0 1 0 0 0]
正解率は0.50
[0 0 0 0 0 0 0 0]
正解率は0.88
[1 1 0 0 1 0 1 1]


# 【問題3】 回帰問題を解くコードの作成

In [7]:
import pandas as pd
data = pd.read_csv("/Users/nobu/Documents/データセット/house-prices-advanced-regression-techniques/train.csv")

X = data.loc[:,["GrLivArea","YearBuilt"]].values
y = data["SalePrice"].values

In [8]:
#訓練・検証データに分割
X_train,X_test,y_train,y_test = scratch_train_test_split(X,y,random_state=0,stratify=y)

from sklearn import linear_model
#インスタンスを作成
clf = linear_model.SGDRegressor()
#学習
clf.fit(X_train, y_train)
#推定
clf_train_pred = clf.predict(X_train)
#
clf_test_pred = clf.predict(X_test)

#評価
from sklearn.metrics import mean_squared_error
print("訓練データMSE：{}".format(mean_squared_error(y_train,clf_train_pred)))
print("検証データMSE：{}".format(mean_squared_error(y_test,clf_test_pred)))


訓練データMSE：9.989688701280117e+30
検証データMSE：1.0531913244807714e+31
