# 概要
ここでは、ボストンデータセットをもとに、線形回帰の実装を行います。

In [13]:
import numpy as np
import pandas as pd
from sklearn.utils import Bunch

try:
    # Only available in scikit-learn < 1.2.
    from sklearn.datasets import load_boston
except ImportError:
    # scikit-learn 1.2 removed load_boston(), so the import above fails on
    # current releases. Rebuild the same dataset from the original data file,
    # following the replacement snippet from scikit-learn's removal notice.
    _BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    _BOSTON_FEATURES = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ]

    def load_boston():
        """Return the Boston housing data as a Bunch.

        Drop-in stand-in for the removed ``sklearn.datasets.load_boston``:
        the result exposes ``data``, ``target`` and ``feature_names``.
        Requires network access to download the raw file.
        """
        raw = pd.read_csv(_BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
        values = raw.values
        # Each record is wrapped over two physical lines: the even rows hold
        # the first 11 features, the odd rows hold the last 2 features
        # followed by the target (MEDV).
        data = np.hstack([values[::2, :], values[1::2, :2]])
        target = values[1::2, 2]
        return Bunch(
            data=data,
            target=target,
            feature_names=np.array(_BOSTON_FEATURES),
        )


boston = load_boston()

# One column per feature, plus the target appended as the MEDV column.
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df["MEDV"] = boston.target

# Last expression of the cell: the notebook renders the DataFrame.
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [14]:
from sklearn.model_selection import train_test_split

# Use three features -- crime rate (CRIM), average number of rooms (RM) and
# share of low-income population (LSTAT) -- as the explanatory variables x,
# and the house price column (MEDV) as the target y.
x = df.loc[:, ["CRIM", "RM", "LSTAT"]]
y = df.loc[:, "MEDV"]

# Split the data into a training set (80%) and a test set (20%).
# random_state pins the shuffle so the split, and every score derived from
# it, is reproducible from run to run. Outputs recorded from an earlier
# unseeded run will therefore differ from a fresh run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, train_size=0.8, random_state=0
)

# Show the size of each split.
print(len(x_train), len(x_test), len(y_train), len(y_test))

404 102 404 102


この結果が意味していること：
データは全部で506個あり、その8割を訓練用データ（train_size=0.8）、2割をテスト用データ（test_size=0.2）に分割しています。506×0.8=404.8、506×0.2=101.2 と割り切れないため、訓練用データは切り捨てて404個、テスト用データは切り上げて102個になります。

## sklearnの線形モデルを使ってみる

In [16]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Predict the target for every row of the test split and show the result.
y_pred = model.predict(x_test)
print(y_pred)

# The printed array holds the predicted house price for each test sample.

[29.74646835 25.61111424 29.32156787 19.32836268 17.46250188 22.35521655
  8.53813937 36.92189989 40.39796888 36.18989986 21.73191941 25.77499212
 24.69087535 18.99093502 27.64092458 25.23250836 22.78140364  6.46362364
 -0.10146937 35.37703524 19.07030899 32.34981487 19.99835595 15.84278593
 35.13026605 19.44529873  9.02273371 21.06797485 22.24147182 20.39116648
 15.8754829  32.46665589 20.0444201  32.85679823 29.36726715 21.0085168
 19.41984031 22.60674499 25.2443936  29.60860955 38.36602021 25.09453589
 36.88773446 22.64698934 19.32862368 37.565555   23.69906184 27.7384194
 24.40328805 -0.277748   18.7878799  21.04181009 30.66671111 23.7078216
 17.58468993 34.22157708 18.18534925 26.20662525 18.6615279  28.66207013
 13.91598679 18.1150244  20.37643775 28.04162025 28.87791143 33.8475618
 26.16687836 25.886579   23.56432327 31.66155091 20.34261205  8.21838175
 27.33336929 20.33973086 27.84619355 17.56319773 15.42540507 18.05026376
 15.17095243 17.18916005 20.39292448 30.7974093  16.607

## 決定係数(Rの二乗)
決定係数（Rの二乗）は、線形回帰モデルの評価指標のひとつです。

この値が1に近いほど、機械学習で得られた予測と本当の答えがよく一致しており、0に近いほど一致していないことを表します。
決定係数は、LinearRegressionと同様に、ライブラリ（sklearn.metrics の r2_score）からインポートして使うことができます。

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Train the model again and predict on the test split, so this cell can be
# run without relying on the model object from the previous cell.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Coefficient of determination between the true and the predicted targets.
# As the last expression of the cell, its value is displayed by the notebook.
r2_score(y_true=y_test, y_pred=y_pred)

# In the recorded run the score was about 0.5, i.e. neither clearly good nor
# clearly bad. The score moves towards 1 as the predictions become more accurate.

0.532196819564556