# 線形回帰モデルとニューラルネットワークの比較

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): this call emits a warning stating that the scikit-learn
# maintainers strongly discourage use of this dataset because of an ethical
# problem, and suggesting alternatives such as the California housing dataset
# (fetch_california_housing) — consider migrating.
boston = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [None]:
# Print the description text attached to the loaded dataset object (its DESCR attribute).
print(boston.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame so each column carries its feature name.
df_data = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Explanatory variables: dimensions, then the first five rows.
print("説明変数")
print(df_data.shape)
display(df_data.head(5))

# Target variable: only the first ten values, to keep the output short.
print("目的変数")
display(boston.target[0:10])

In [None]:
# Append the target as a "MEDV" column so it can be compared with every feature.
df = pd.concat(
    [
        df_data,
        pd.DataFrame(boston.target, columns=["MEDV"]),
    ],
    axis=1,
)

# Pairwise correlation table, rounded to two decimals for readability.
display(df.corr().round(2))

# Scatter plot for every pair of columns (features and target).
pd.plotting.scatter_matrix(df, figsize=(12, 12))
plt.show()

### データセット

In [None]:
# Use a single feature (LSTAT) as the explanatory variable. Selecting with a
# list keeps it a one-column DataFrame, so .values is a 2-D (n_samples, 1) array.
X = df_data[["LSTAT"]].values
y = boston.target

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.3, random_state=1234
)

# Standardization: each scaler is fitted on the training split only and the
# same transform is then applied to the test split.
# Features and target get their own scaler objects so that the fitted feature
# scaler is not discarded and either transform can be reused or inverted later.
std_X = StandardScaler()
X_train = std_X.fit_transform(X_train)
X_test = std_X.transform(X_test)

# StandardScaler works on 2-D input, so the 1-D target is reshaped into a
# single column and flattened back to 1-D afterwards.
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = std_y.transform(y_test.reshape(-1, 1)).ravel()

# Backward-compatible alias: `std` has always ended up bound to the target scaler.
std = std_y

### 線形回帰でモデリングする

In [None]:
# Training: fit the linear regression model on the training split.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# Prediction on the test split, plus the error and the fitted parameters.
y_pred = reg.predict(X_test)
df_result = pd.DataFrame(y_pred, columns=["linearRegression"])
print("MSE=", mean_squared_error(y_test, y_pred))
print("切片=", reg.intercept_, "回帰係数=", reg.coef_)

# Plot of the prediction results.
# Both frames share the same x values; the "label" column lets seaborn colour
# the actual values (y_test) and the predictions (y_pred) differently.
df1, df2 = (
    pd.DataFrame(
        X_test, index=range(len(X_test)), columns=["LSTAT_std"]
    ).assign(y=values, label=name)
    for name, values in (("y_test", y_test), ("y_pred", y_pred))
)
df = pd.concat([df1, df2], axis=0)

sns.lmplot(data=df, x="LSTAT_std", y="y", hue="label", fit_reg=False)
plt.show()

### ニューラルネットワークでモデリングする

In [None]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD

In [None]:
from tensorflow.keras.utils import set_random_seed

# Training hyper-parameters.
batch_size = 32
epochs = 200

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable between runs
# (same value as the random_state used for the train/test split).
set_random_seed(1234)

# Network definition: one hidden layer with 10 linear units followed by a
# single linear output unit.
model = Sequential()
# The input width is taken from the training data rather than hard-coded.
model.add(Dense(10, activation='linear', input_shape=(X_train.shape[1],)))
# model.add(Dense(5, activation='linear'))
model.add(Dense(1, activation='linear'))  # this activation function must not be changed
model.summary()
model.compile(loss='mean_squared_error',
              optimizer=SGD(),
              metrics=['mse'])

# Training.
# NOTE: the test split is passed as validation data, so the val_* curves are
# computed on the same samples that are later used for the final prediction.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,     # number of epochs
                    verbose=1,         # print a log line per epoch
                    validation_data=(X_test, y_test))

# Plot the training history (one curve per recorded loss/metric).
pd.DataFrame(history.history).plot()
plt.show()


In [None]:
# Prediction. With a single-unit output layer Keras returns a 2-D
# (n_samples, 1) array; flatten it to 1-D so it has the same shape as y_test
# and as the linear-regression predictions (avoids accidental broadcasting).
y_pred = model.predict(X_test).ravel()
df_result["NN"] = y_pred
# Report the same error metric as the linear-regression cell for comparison.
print("MSE=", mean_squared_error(y_test, y_pred))

# Plot of the prediction results.
# Both frames share the same x values; the "label" column lets seaborn colour
# the actual values (y_test) and the predictions (y_pred) differently.
df1, df2 = (
    pd.DataFrame(
        X_test, index=range(len(X_test)), columns=["LSTAT_std"]
    ).assign(y=values, label=name)
    for name, values in (("y_test", y_test), ("y_pred", y_pred))
)
df = pd.concat([df1, df2], axis=0)

sns.lmplot(data=df, x="LSTAT_std", y="y", hue="label", fit_reg=False)
plt.show()


### [演習]
* 中間層を2層(ノード数は10と5)に変更してみましょう(出力層を含めると3層のネットワークになります)
* 中間層の活性化関数をsigmoidに変更してみましょう