In [39]:
import pandas as pd
import numpy as np
import sklearn, sklearn.model_selection
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [40]:
data = pd.read_csv("housing_prices.csv")
print(data.shape)

(1460, 81)


In [41]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [44]:
drop_cols = ["Id","PoolQC", "MiscFeature", "Alley", "Fence"]
data = data.drop(columns=drop_cols)

In [45]:
na_per_col = data.isna().sum()
na_per_col = na_per_col[na_per_col > 0]
print(na_per_col)

LotFrontage     259
MasVnrType      872
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
FireplaceQu     690
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64


In [46]:
# Categorical fills
fill_none = [
    "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond"
]
data[fill_none] = data[fill_none].fillna("None")

# Numeric: median or 0
data["LotFrontage"]  = data["LotFrontage"].fillna(data["LotFrontage"].median())
data["MasVnrArea"]   = data["MasVnrArea"].fillna(0)
data["GarageYrBlt"]  = data["GarageYrBlt"].fillna(0)

# Electrical: fill with most common value
data["Electrical"] = data["Electrical"].fillna(data["Electrical"].mode()[0])


In [47]:
cat_cols = fill_none + ["Electrical", "YrSold"]
data[cat_cols] = data[cat_cols].astype("category")

In [49]:
X = data.drop(columns="SalePrice")
y = data["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [50]:
num_sel = selector(dtype_include="number")
cat_sel = selector(dtype_exclude="number")

preprocess = ColumnTransformer([
    # Numeric: median-impute → scale
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ]), num_sel),

    # Categorical: mode-impute → one-hot
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe",     OneHotEncoder(handle_unknown="ignore"))
    ]), cat_sel)
])

In [51]:
linreg_pipe = Pipeline([
    ("prep", preprocess),
    ("linreg", LinearRegression())
])

linreg_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('linreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [52]:
y_pred = linreg_pipe.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:,.0f}")
print(f"Test R²  : {r2:.3f}")

Test RMSE: 29,689
Test R²  : 0.885


In [54]:
print(
    "Test R² : 0.885 means your model explains 88.5% of the variance in SalePrice on the test data\n"
    "On average the model’s prediction is about $29 700 away from the true sale price."
)

Test R² : 0.885 means your model explains 88.5% of the variance in SalePrice on the test data
On average the model’s prediction is about $29 700 away from the true sale price.
