# データの準備

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import japanize_matplotlib


In [4]:
# Read the cleansed California housing table and discard the 'Unnamed: 0'
# column in the same expression (presumably an index column written out when
# the CSV was saved -- verify against the file).
df = pd.read_csv('california_housing_cleansing.csv').drop(columns=['Unnamed: 0'])
# Preview the first rows to confirm which columns remain.
df.head()

Unnamed: 0,所得,築年数,地域人口,緯度,経度,住宅価格,部屋数/人,寝室数/人
0,8.3252,41.0,322.0,37.88,-122.23,4.526,2.732919,0.400621
1,8.3014,21.0,2401.0,37.86,-122.22,3.585,2.956685,0.460641
2,2.0804,42.0,1206.0,37.84,-122.26,2.267,2.118574,0.55141
3,2.125,50.0,697.0,37.85,-122.26,1.4,1.606887,0.406026
4,1.9911,50.0,990.0,37.84,-122.26,1.587,2.261616,0.459596


In [5]:
from sklearn.model_selection import train_test_split

# Target (y): the '住宅価格' (house price) column as a NumPy array.
y = df['住宅価格'].to_numpy()
# Features (X): every remaining column, as a NumPy array.
X = df.drop(columns=['住宅価格']).to_numpy()

# Hold out 30% of the rows as test data; random_state=0 fixes the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)


# 回帰手法の改善

In [6]:
# Import StandardScaler, the scikit-learn transformer used below to standardize the features
from sklearn.preprocessing import StandardScaler


In [7]:
# Create an unfitted StandardScaler with default settings; it is fitted on X_train in a later cell.
scaler = StandardScaler()

In [8]:
# Fit the standardization model: learn the per-feature statistics from the
# training data only (the test data is not used here).
scaler.fit(X_train)

In [9]:
# Apply the fitted standardization model to rescale the training features.
X_train_scaled = scaler.transform(X_train)

In [10]:
# Before scaling: wrap the raw training features in a DataFrame for display.
# Column labels are taken from df (minus the target column) rather than a
# hand-typed list, so they follow the same column order used to build X and
# keep the real feature names ('部屋数/人' and '寝室数/人' are per-person
# ratios, which the previous labels '部屋数' / '寝室数' misrepresented).
df_X_train = pd.DataFrame(X_train, columns=df.drop(columns=['住宅価格']).columns)
df_X_train.head()


Unnamed: 0,所得,築年数,地域人口,緯度,経度,部屋数,寝室数
0,3.75,29.0,1390.0,34.04,-117.98,1.056115,0.223022
1,3.1187,25.0,588.0,33.01,-117.07,3.605442,0.647959
2,4.6406,45.0,1180.0,33.9,-118.07,1.505085,0.299153
3,5.901,18.0,2882.0,34.33,-118.43,2.044067,0.319223
4,2.6167,42.0,1271.0,37.99,-122.34,1.204563,0.256491


In [11]:
# After scaling: wrap the standardized training features in a DataFrame.
# Column labels are taken from df (minus the target column) rather than a
# hand-typed list, so they follow the same column order used to build X and
# keep the real feature names ('部屋数/人' and '寝室数/人' are per-person
# ratios, which the previous labels '部屋数' / '寝室数' misrepresented).
df_X_train_scaled = pd.DataFrame(X_train_scaled, columns=df.drop(columns=['住宅価格']).columns)
df_X_train_scaled.head()


Unnamed: 0,所得,築年数,地域人口,緯度,経度,部屋数,寝室数
0,0.034653,0.178247,-0.075532,-0.723337,0.760166,-0.837592,-0.809192
1,-0.368102,-0.170455,-0.767887,-1.203976,1.220266,1.614325,1.182965
2,0.602835,1.573057,-0.256822,-0.788667,0.714662,-0.405778,-0.452281
3,1.406941,-0.780684,1.212489,-0.588012,0.532645,0.11261,-0.35819
4,-0.688367,1.31153,-0.178263,1.119887,-1.444266,-0.694816,-0.652284


In [12]:
# Sanity check: after standardization every column should show mean ~ 0 and std ~ 1.
# describe() reports the sample std (ddof=1) while StandardScaler divides by the
# population std (ddof=0), so std prints slightly above 1 rather than exactly 1.
df_X_train_scaled.describe()

Unnamed: 0,所得,築年数,地域人口,緯度,経度,部屋数,寝室数
count,13000.0,13000.0,13000.0,13000.0,13000.0,13000.0,13000.0
mean,4.533263e-15,7.651999000000001e-17,8.280556000000001e-17,-3.489093e-14,-8.354643e-14,1.134163e-14,-1.848504e-15
std,1.000038,1.000038,1.000038,1.000038,1.000038,1.000038,1.000038
min,-2.038837,-2.26267,-1.268592,-1.41863,-2.420081,-1.850904,-1.851601
25%,-0.7374752,-0.8678601,-0.5701945,-0.7793341,-1.034727,-0.4232155,-0.3998158
50%,-0.1394514,0.003896079,-0.229197,-0.6300096,0.5225325,-0.02454245,-0.1384252
75%,0.5805695,0.7884766,0.2622279,0.9752284,0.7652223,0.2989548,0.185558
max,7.211958,2.096111,29.52823,2.935112,2.615732,37.90064,39.16628


In [13]:
# Rescale the test features with the scaler that was fitted on X_train
# (transform only, no refit), so train and test share the same scaling.
X_test_scaled  = scaler.transform(X_test)
