# House Price Prediction

```Data/house_price.csv```には、住宅の価格(```SalePrice```)と住宅の関連情報が格納されています。

- このデータを用いて、住宅の価格(```SalePrice```)を予測するモデルを構築、評価しなさい。
- データ中身を確認しながら、適切な特徴量エンジニアリングやモデル選定ための考えもまとめてください。

In [44]:
import pandas as pd 
import numpy as ny


In [45]:
#データの読み込み&確認
df_house=pd.read_csv("Data/house_price.csv")
df_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [46]:
#コラムの型を確認
print(df_house.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [47]:
df = df_house.select_dtypes(["object"])
print(df.columns)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [48]:
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [49]:
#SalesPriseとの相関の強さを確認
corr = df_house.corr()
corr.sort_values('SalePrice', ascending=False)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
SalePrice,-0.021917,-0.084284,0.351799,0.263843,0.790982,-0.077856,0.522897,0.507101,0.477493,0.38642,...,0.324413,0.315856,-0.128578,0.044584,0.111447,0.092404,-0.02119,0.046432,-0.028923,1.0
OverallQual,-0.028365,0.032628,0.251646,0.105806,1.0,-0.091932,0.572323,0.550684,0.411876,0.239666,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
GrLivArea,0.008273,0.074853,0.402797,0.263116,0.593007,-0.079686,0.19901,0.287389,0.390857,0.208171,...,0.247433,0.330224,0.009113,0.020643,0.10151,0.170205,-0.002416,0.05024,-0.036526,0.708624
GarageCars,0.01657,-0.04011,0.285691,0.154871,0.600671,-0.185758,0.53785,0.420622,0.364204,0.224054,...,0.226342,0.213569,-0.151434,0.035765,0.050494,0.020934,-0.04308,0.040522,-0.039117,0.640409
GarageArea,0.017634,-0.098672,0.344997,0.180403,0.562022,-0.151521,0.478954,0.3716,0.373066,0.29697,...,0.224666,0.241435,-0.121777,0.035087,0.051412,0.061047,-0.0274,0.027974,-0.027378,0.623431
TotalBsmtSF,-0.015415,-0.238518,0.392075,0.260833,0.537808,-0.171098,0.391452,0.291066,0.363936,0.522396,...,0.232019,0.247264,-0.095478,0.037384,0.084489,0.126053,-0.018479,0.013196,-0.014969,0.613581
1stFlrSF,0.010496,-0.251758,0.457181,0.299475,0.476224,-0.144203,0.281986,0.240379,0.344501,0.445863,...,0.235459,0.211671,-0.065292,0.056104,0.088758,0.131525,-0.021096,0.031372,-0.013604,0.605852
FullBath,0.005587,0.131608,0.198769,0.126031,0.5506,-0.194149,0.468271,0.439046,0.276833,0.058543,...,0.187703,0.259977,-0.115093,0.035353,-0.008106,0.049604,-0.01429,0.055872,-0.019669,0.560664
TotRmsAbvGrd,0.027239,0.04038,0.352096,0.190015,0.427452,-0.057583,0.095589,0.19174,0.280682,0.044316,...,0.165984,0.234192,0.004151,-0.006683,0.059383,0.083757,0.024763,0.036907,-0.034516,0.533723
YearBuilt,-0.012713,0.02785,0.123349,0.014228,0.572323,-0.375983,1.0,0.592855,0.315707,0.249503,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897


まとめ
・目的：住宅の価格を予測するモデルの構築
・教師あり学習(特徴量・ラベルデータをモデル学習用の訓練データとモデル評価用のテストデータに分ける必要あり)
・タスク：重回帰分析
・モデルのクラスの選択
・目的変数：SalesPrice
・説明変数：SalesPriceと相関関係の大きい5つ
・特徴量エンジニアリング：①欠損値の処理②スケーリング③テキスト処理
・ラベルデータ：SalePrice
・欠損値の処理が必要

In [62]:
#欠損値の確認
df_house.isnull().sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
LotFrontage       259
GarageYrBlt        81
GarageCond         81
GarageType         81
GarageFinish       81
GarageQual         81
BsmtFinType2       38
BsmtExposure       38
BsmtQual           37
BsmtCond           37
BsmtFinType1       37
MasVnrArea          8
MasVnrType          8
Electrical          1
Id                  0
Functional          0
Fireplaces          0
KitchenQual         0
KitchenAbvGr        0
BedroomAbvGr        0
HalfBath            0
FullBath            0
BsmtHalfBath        0
TotRmsAbvGrd        0
GarageCars          0
GrLivArea           0
GarageArea          0
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
BsmtFullBath        0
HeatingQC 

In [66]:
#欠損自体に意味を持っているobjectデータは”None”で補完
for col in ("PoolQC","MiscFeature","Alley","Fence","FireplaceQu",'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',"MasVnrType"):
    df_house[col] = df_house[col].fillna('None')

In [67]:
#欠損自体に意味を持っている数値（int, float型）データは0で補完する
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',"MasVnrArea","GarageYrBlt",'GarageArea', 'GarageCars'):
   df_house[col] = df_house[col].fillna(0)

In [68]:
 #欠損自体に意味を持たないobjectデータは最頻値で補完
df_house['MSZoning'] = df_house['MSZoning'].fillna(df_house['MSZoning'].mode()[0])
df_house["Functional"] = df_house["Functional"].fillna("Typ")
df_house['Electrical'] = df_house['Electrical'].fillna(df_house['Electrical'].mode()[0])
df_house['KitchenQual'] = df_house['KitchenQual'].fillna(df_house['KitchenQual'].mode()[0])
df_house['Exterior1st'] = df_house['Exterior1st'].fillna(df_house['Exterior1st'].mode()[0])
df_house['Exterior2nd'] = df_house['Exterior2nd'].fillna(df_house['Exterior2nd'].mode()[0])
df_house['SaleType'] = df_house['SaleType'].fillna(df_house['SaleType'].mode()[0])

In [71]:
#データとして意味をなさないものは削除する
df_house['Utilities'].value_counts()
df_house = df_house.drop(['Utilities'], axis=1)

In [72]:
#欠損値の有無を確認
df_house.isnull().sum().sort_values(ascending=False)

LotFrontage      259
Id                 0
KitchenAbvGr       0
GarageYrBlt        0
GarageType         0
FireplaceQu        0
Fireplaces         0
Functional         0
TotRmsAbvGrd       0
KitchenQual        0
BedroomAbvGr       0
GarageCars         0
HalfBath           0
FullBath           0
BsmtHalfBath       0
BsmtFullBath       0
GrLivArea          0
LowQualFinSF       0
2ndFlrSF           0
GarageFinish       0
GarageArea         0
Electrical         0
PoolQC             0
SaleCondition      0
SaleType           0
YrSold             0
MoSold             0
MiscVal            0
MiscFeature        0
Fence              0
PoolArea           0
GarageQual         0
ScreenPorch        0
3SsnPorch          0
EnclosedPorch      0
OpenPorchSF        0
WoodDeckSF         0
PavedDrive         0
GarageCond         0
1stFlrSF           0
CentralAir         0
MSSubClass         0
Neighborhood       0
YearBuilt          0
OverallCond        0
OverallQual        0
HouseStyle         0
BldgType     

In [73]:
#テキストデータの処理
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_house[c].values)) 
    df_house[c] = lbl.transform(list(df_house[c].values))

    

In [74]:
#label encoding処理
df_house[['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold']].head(10)

Unnamed: 0,FireplaceQu,BsmtQual,BsmtCond,GarageQual,GarageCond,ExterQual,ExterCond,HeatingQC,PoolQC,KitchenQual,BsmtFinType1,BsmtFinType2,Functional,Fence,BsmtExposure,GarageFinish,LandSlope,LotShape,PavedDrive,Street,Alley,CentralAir,MSSubClass,OverallCond,YrSold,MoSold
0,3,2,4,5,5,2,4,0,3,2,2,6,6,4,3,2,0,3,2,1,1,1,5,4,2,1
1,5,2,4,5,5,3,4,0,3,3,0,6,6,4,1,2,0,3,2,1,1,1,0,7,1,4
2,5,2,4,5,5,2,4,0,3,2,2,6,6,4,2,2,0,0,2,1,1,1,5,4,2,8
3,2,4,1,5,5,3,4,2,3,2,0,6,6,4,3,3,0,0,2,1,1,1,6,4,0,1
4,5,2,4,5,5,2,4,0,3,2,2,6,6,4,0,2,0,0,2,1,1,1,5,4,2,11
5,3,2,4,5,5,3,4,0,3,3,2,6,6,2,3,3,0,0,2,1,1,1,4,4,3,9
6,2,0,4,5,5,2,4,0,3,2,2,6,6,4,0,2,0,3,2,1,1,1,0,4,1,7
7,5,2,4,5,5,3,4,0,3,3,0,1,6,4,2,2,0,0,2,1,1,1,5,5,3,10
8,5,4,4,1,5,3,4,2,3,3,6,6,2,4,3,3,0,3,2,1,1,1,4,4,2,3
9,5,4,4,2,5,3,4,0,3,3,2,6,6,4,3,2,0,3,2,1,1,1,14,5,2,0


In [76]:
#one-hot encoding処理
df_house = pd.get_dummies(df_house, drop_first=True)

In [9]:
# クラス読み込み、インスタンス作成
from sklearn.linear_model import LinearRegression
clf = LinearRegression(normalize=True)
X = df_house['ScreenPorch'].values.reshape(-1,1)


In [80]:
#説明変数　SalesPrice以外
X = df_house.drop("SalePrice", axis=1).values.reshape(-1,1)
#目的変数　SalePrice
Y = df_house["SalePrice"].values

In [82]:
from sklearn.model_selection import train_test_split
# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
clf.fit(X_train,y_train) # モデルを訓練データに適合
y_predicted=clf.predict(X_test) # テストデータで予測


ValueError: Found input variables with inconsistent numbers of samples: [293460, 1460]

In [83]:
#予測制度の評価
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_predicted) # 予測精度（平均二乗誤差）の評価

7128369650.936838