In [5]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
import joblib

ROOT = Path("..").resolve()
DATA_TRAIN = ROOT / "data" / "train" / "housing_train.csv"
MODEL_OUT = ROOT / "models" / "linear_regression_model.pkl"

# Load
housing = pd.read_csv(DATA_TRAIN)

# Separate features/target
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]

# One-hot encode categorical
X = pd.get_dummies(X, columns=["ocean_proximity"])

# Impute any missing values in numeric columns
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# 2. Model Fitting
lin_reg = LinearRegression().fit(X, y)

# 3. Cross-Validation
scores = cross_val_score(lin_reg, X, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = (-scores) ** 0.5
print("CV RMSE mean:", rmse_scores.mean(), " std:", rmse_scores.std())

# 5. Save model
joblib.dump(lin_reg, MODEL_OUT)
print("Saved:", MODEL_OUT)


CV RMSE mean: 69204.32275494756  std: 2372.0707910559104
Saved: C:\Users\MaxGillum\Desktop\492 ML PJ\cmse492_aml\ca_housing_project\models\linear_regression_model.pkl
