In [26]:
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msn
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#pd.pandas.set_option("display.max_columns", None)
#pd.pandas.set_option("display.max_rows", None)

In [3]:
# Location of the competition CSV files, relative to the notebook's working
# directory. The original cell pointed at .../src/notebook/data via an absolute,
# user-specific path; this assumes the notebook is run from src/notebook.
# Adjust DATA_DIR if the data lives elsewhere.
DATA_DIR = Path("data")

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [5]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (train, test):
    print(frame.shape)


(1460, 81)
(1459, 80)


In [6]:
# Display the raw training frame; pandas renders a truncated preview
# (first/last rows and columns, with "..." for the elided middle).
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
# Restrict the training frame to the columns used for the baseline model.
# Column order is kept exactly as listed here.
df = train[
    [
        # numeric predictors
        "OverallQual",
        "GrLivArea",
        "TotalBsmtSF",
        "GarageCars",
        "GarageArea",
        "YearBuilt",
        "YearRemodAdd",
        # target
        "SalePrice",
        # predictors handled as categories further down
        "Neighborhood",
        "MSZoning",
        "BldgType",
        "HouseStyle",
        "OverallCond",
        "ExterQual",
        "KitchenQual",
        "HeatingQC",
        "CentralAir",
        "SaleCondition",
    ]
]

In [8]:
# Peek at the first rows of the selected feature/target subset.
df.head()

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,GarageCars,GarageArea,YearBuilt,YearRemodAdd,SalePrice,Neighborhood,MSZoning,BldgType,HouseStyle,OverallCond,ExterQual,KitchenQual,HeatingQC,CentralAir,SaleCondition
0,7,1710,856,2,548,2003,2003,208500,CollgCr,RL,1Fam,2Story,5,Gd,Gd,Ex,Y,Normal
1,6,1262,1262,2,460,1976,1976,181500,Veenker,RL,1Fam,1Story,8,TA,TA,Ex,Y,Normal
2,7,1786,920,2,608,2001,2002,223500,CollgCr,RL,1Fam,2Story,5,Gd,Gd,Ex,Y,Normal
3,7,1717,756,3,642,1915,1970,140000,Crawfor,RL,1Fam,2Story,5,TA,Gd,Gd,Y,Abnorml
4,8,2198,1145,3,836,2000,2000,250000,NoRidge,RL,1Fam,2Story,5,Gd,Gd,Ex,Y,Normal


In [13]:
# Count missing values per selected column.
df.isna().sum()

OverallQual      0
GrLivArea        0
TotalBsmtSF      0
GarageCars       0
GarageArea       0
YearBuilt        0
YearRemodAdd     0
SalePrice        0
Neighborhood     0
MSZoning         0
BldgType         0
HouseStyle       0
OverallCond      0
ExterQual        0
KitchenQual      0
HeatingQC        0
CentralAir       0
SaleCondition    0
dtype: int64

PIPELINE

In [27]:
# Baseline model: preprocess the selected columns and fit a linear regression
# on an 80/20 train/test split, then report RMSE and R² on the held-out part.

# Select the columns used by the model (predictors + target).
df = train[["OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars", "GarageArea", "YearBuilt", "YearRemodAdd", "SalePrice",
            "Neighborhood", "MSZoning", "BldgType", "HouseStyle", "OverallCond", "ExterQual", "KitchenQual", "HeatingQC",
            "CentralAir", "SaleCondition"]]

# Numerical features: median-imputed, then standardised.
num_feature = [
    "OverallQual", "GrLivArea", "TotalBsmtSF",
    "GarageCars", "GarageArea", "YearBuilt", "YearRemodAdd"
]

# Categorical features: imputed with the most frequent value, then one-hot encoded.
# NOTE(review): OverallCond holds integer ratings but is one-hot encoded here
# like a nominal category — confirm this is intended.
cat_feature = [
    "Neighborhood", "MSZoning", "BldgType", "HouseStyle",
    "OverallCond", "ExterQual", "KitchenQual",
    "HeatingQC", "CentralAir", "SaleCondition"
]

X = df[num_feature + cat_feature]
y = df["SalePrice"]

# Per-type preprocessing steps.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories not seen during fit are encoded as all-zero rows instead of raising.
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Stand-alone preprocessor, kept available for inspection in other cells.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_feature),
    ("cat", cat_pipeline, cat_feature)
])

# The model gets its OWN copy of the preprocessor. Pipeline fits its steps in
# place, so sharing the `preprocessor` object would let any later
# `preprocessor.fit(...)` / `fit_transform(...)` call silently re-fit the
# transformer inside the already-trained model (new scaling statistics and
# possibly a different one-hot column layout than the regressor was fit on).
model = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("regressor", LinearRegression())
])

# Fixed random_state keeps the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing statistics are learned from the training split only.
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Hold-out metrics.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R²:", r2)

RMSE: 32925.044003651936
R²: 0.8586685242113107


In [16]:
# Sanity check: (rows, columns) of the selected subset.
df.shape

(1460, 18)

In [17]:
# Show the first two rows of the selected subset.
df.head(2)

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,GarageCars,GarageArea,YearBuilt,YearRemodAdd,SalePrice,Neighborhood,MSZoning,BldgType,HouseStyle,OverallCond,ExterQual,KitchenQual,HeatingQC,CentralAir,SaleCondition
0,7,1710,856,2,548,2003,2003,208500,CollgCr,RL,1Fam,2Story,5,Gd,Gd,Ex,Y,Normal
1,6,1262,1262,2,460,1976,1976,181500,Veenker,RL,1Fam,1Story,8,TA,TA,Ex,Y,Normal


In [18]:
# Fit the preprocessor on every row of df (not just the training split) and
# transform it. Intended for inspecting the encoded feature matrix; statistics
# learned here include the rows that were held out for evaluation.
X_preprocessed = preprocessor.fit_transform(df)

In [19]:
# Inspect the transformed matrix (shown via its repr: type, dtype and shape).
X_preprocessed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24820 stored elements and shape (1460, 80)>

In [21]:
# Output column names of the fitted preprocessor; each name is prefixed with
# the transformer it came from ("num__" / "cat__").
feature_names = preprocessor.get_feature_names_out()

In [25]:
# Display the generated feature names.
feature_names

array(['num__OverallQual', 'num__GrLivArea', 'num__TotalBsmtSF',
       'num__GarageCars', 'num__GarageArea', 'num__YearBuilt',
       'num__YearRemodAdd', 'cat__Neighborhood_Blmngtn',
       'cat__Neighborhood_Blueste', 'cat__Neighborhood_BrDale',
       'cat__Neighborhood_BrkSide', 'cat__Neighborhood_ClearCr',
       'cat__Neighborhood_CollgCr', 'cat__Neighborhood_Crawfor',
       'cat__Neighborhood_Edwards', 'cat__Neighborhood_Gilbert',
       'cat__Neighborhood_IDOTRR', 'cat__Neighborhood_MeadowV',
       'cat__Neighborhood_Mitchel', 'cat__Neighborhood_NAmes',
       'cat__Neighborhood_NPkVill', 'cat__Neighborhood_NWAmes',
       'cat__Neighborhood_NoRidge', 'cat__Neighborhood_NridgHt',
       'cat__Neighborhood_OldTown', 'cat__Neighborhood_SWISU',
       'cat__Neighborhood_Sawyer', 'cat__Neighborhood_SawyerW',
       'cat__Neighborhood_Somerst', 'cat__Neighborhood_StoneBr',
       'cat__Neighborhood_Timber', 'cat__Neighborhood_Veenker',
       'cat__MSZoning_C (all)', 'cat__MSZoni