In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score

In [2]:
path = '../data/raw/'
df = pd.read_csv(os.path.join(path, 'housing.csv'))
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
df['total_bedrooms'].mean()

np.float64(537.8705525375618)

## Splitting Data

In [5]:
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Handling Missing Values

In [6]:
X.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY


In [7]:
X.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [8]:
imputer = SimpleImputer(strategy="median")

X_train["total_bedrooms"] = imputer.fit_transform(
    X_train[["total_bedrooms"]]
)

X_test["total_bedrooms"] = imputer.transform(
    X_test[["total_bedrooms"]]
)

In [9]:
X_train['total_bedrooms'].isnull().sum(), X_test['total_bedrooms'].isnull().sum(),

(np.int64(0), np.int64(0))

---

## Doing Standardization

In [10]:
numeric_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income'
]

In [11]:
scalar = StandardScaler()

X_train[numeric_features] = scalar.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scalar.transform(X_test[numeric_features])

In [12]:
X_train.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,1.272587,-1.372811,0.34849,0.222569,0.211228,0.768276,0.322906,-0.326196,NEAR OCEAN
8267,0.709162,-0.876696,1.618118,0.340293,0.593094,-0.098901,0.672027,-0.035843,NEAR OCEAN
17445,-0.447603,-0.460146,-1.95271,-0.342597,-0.495226,-0.449818,-0.430461,0.144701,NEAR OCEAN


---

## Encoding for Categorical Data

In [13]:
print("Original ocean_proximity unique values:")
print(X_train["ocean_proximity"].value_counts())

Original ocean_proximity unique values:
ocean_proximity
<1H OCEAN     7341
INLAND        5227
NEAR OCEAN    2086
NEAR BAY      1854
ISLAND           4
Name: count, dtype: int64


In [14]:
# Create encoder
encoder = OneHotEncoder(
    sparse_output=False,      # Get dense array (not sparse matrix)
    handle_unknown='ignore'   # Ignore new categories in test
)

# Fit on train, transform train
encoded_train = encoder.fit_transform(X_train[["ocean_proximity"]])

# Transform test (uses train categories only)
encoded_test = encoder.transform(X_test[["ocean_proximity"]])

# Get proper column names
encoded_cols = encoder.get_feature_names_out()

print("Encoded column names:", encoded_cols)
print("Train shape:", encoded_train.shape)  # (16512, 5)
print("Test shape:", encoded_test.shape)    # (4128, 5)


Encoded column names: ['ocean_proximity_<1H OCEAN' 'ocean_proximity_INLAND'
 'ocean_proximity_ISLAND' 'ocean_proximity_NEAR BAY'
 'ocean_proximity_NEAR OCEAN']
Train shape: (16512, 5)
Test shape: (4128, 5)


In [15]:
# Convert to DataFrames with correct indices
encoded_train_df = pd.DataFrame(
    encoded_train, columns=encoded_cols, index=X_train.index
)
encoded_test_df = pd.DataFrame(
    encoded_test, columns=encoded_cols, index=X_test.index
)

print("\nFirst few rows:")
encoded_train_df.head()


First few rows:


Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14196,0.0,0.0,0.0,0.0,1.0
8267,0.0,0.0,0.0,0.0,1.0
17445,0.0,0.0,0.0,0.0,1.0
14265,0.0,0.0,0.0,0.0,1.0
2271,0.0,1.0,0.0,0.0,0.0


In [16]:
# Drop original ocean_proximity and add encoded columns
X_train = X_train.drop("ocean_proximity", axis=1).join(encoded_train_df)
X_test = X_test.drop("ocean_proximity", axis=1).join(encoded_test_df)

print("Final shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

print(encoded_train)

Final shapes:
X_train: (16512, 13)
X_test: (4128, 13)
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [17]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14196,1.272587,-1.372811,0.34849,0.222569,0.211228,0.768276,0.322906,-0.326196,0.0,0.0,0.0,0.0,1.0
8267,0.709162,-0.876696,1.618118,0.340293,0.593094,-0.098901,0.672027,-0.035843,0.0,0.0,0.0,0.0,1.0
17445,-0.447603,-0.460146,-1.95271,-0.342597,-0.495226,-0.449818,-0.430461,0.144701,0.0,0.0,0.0,0.0,1.0
14265,1.232698,-1.382172,0.586545,-0.56149,-0.409306,-0.007434,-0.380587,-1.017864,0.0,0.0,0.0,0.0,1.0
2271,-0.108551,0.532084,1.142008,-0.119565,-0.256559,-0.485877,-0.314962,-0.171488,0.0,1.0,0.0,0.0,0.0


---

## Model Training and Evaluation

In [18]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [19]:
from sklearn.metrics import root_mean_squared_error

y_train_pred = lin_reg.predict(X_train)
y_test_pred  = lin_reg.predict(X_test)

train_rmse = root_mean_squared_error(y_train, y_train_pred)
test_rmse  = root_mean_squared_error(y_test,  y_test_pred)

print("Train RMSE:", train_rmse)
print("Test RMSE :", test_rmse)

Train RMSE: 68433.93736666226
Test RMSE : 70059.19333925014


### Calculate target variance explained by your model

In [20]:
train_r2 = lin_reg.score(X_train, y_train)
test_r2 = lin_reg.score(X_test, y_test)

print("Train R²:", train_r2)
print("Test R²:", test_r2)

Train R²: 0.6496648627123223
Test R²: 0.6254382675296266


---

## Save Model and Metrics

In [21]:
import joblib

# Save model
joblib.dump(lin_reg, "../artifacts/linear/linear_model.joblib")

['../artifacts/linear/linear_model.joblib']

In [22]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/linear/metrics.json")

metrics = {}

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Ensure base structure
metrics.setdefault("model_family", "linear_models")
metrics.setdefault("models", {})

# Add Linear Regression metrics
metrics["models"]["linear_regression"] = {
    "train": {
        "rmse": train_rmse,
        "r2": train_r2
    },
    "test": {
        "rmse": test_rmse,
        "r2": test_r2
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)

Loaded existing metrics from ../artifacts/linear/metrics.json


---

## Trying Regularization

In [23]:
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)

0,1,2
,"alpha  alpha: {float, ndarray of shape (n_targets,)}, default=1.0 Constant that multiplies the L2 term, controlling regularization strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. When `alpha = 0`, the objective is equivalent to ordinary least squares, solved by the :class:`LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Ridge` object is not advised. Instead, you should use the :class:`LinearRegression` object. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number.",1.0
,"fit_intercept  fit_intercept: bool, default=True Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations (i.e. ``X`` and ``y`` are expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"max_iter  max_iter: int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. For 'lbfgs' solver, the default value is 15000.",
,"tol  tol: float, default=1e-4 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for each solver: - 'svd': `tol` has no impact. - 'cholesky': `tol` has no impact. - 'sparse_cg': norm of residuals smaller than `tol`. - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,  which control the norm of the residual vector in terms of the norms of  matrix and coefficients. - 'sag' and 'saga': relative change of coef smaller than `tol`. - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|  smaller than `tol`. .. versionchanged:: 1.2  Default value changed from 1e-3 to 1e-4 for consistency with other linear  models.",0.0001
,"solver  solver: {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge  coefficients. It is the most stable solver, in particular more stable  for singular matrices than 'cholesky' at the cost of being slower. - 'cholesky' uses the standard :func:`scipy.linalg.solve` function to  obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in  :func:`scipy.sparse.linalg.cg`. As an iterative algorithm, this solver is  more appropriate than 'cholesky' for large-scale data  (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine  :func:`scipy.sparse.linalg.lsqr`. It is the fastest and uses an iterative  procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses  its improved, unbiased version named SAGA. Both methods also use an  iterative procedure, and are often faster than other solvers when  both n_samples and n_features are large. Note that 'sag' and  'saga' fast convergence is only guaranteed on features with  approximately the same scale. You can preprocess the data with a  scaler from :mod:`sklearn.preprocessing`. - 'lbfgs' uses L-BFGS-B algorithm implemented in  :func:`scipy.optimize.minimize`. It can be used only when `positive`  is True. All solvers except 'svd' support both dense and sparse data. However, only 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept` is True. .. versionadded:: 0.17  Stochastic Average Gradient descent solver. .. versionadded:: 0.19  SAGA solver.",'auto'
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. Only 'lbfgs' solver is supported in this case.",False
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. See :term:`Glossary ` for details. .. versionadded:: 0.17  `random_state` to support Stochastic Average Gradient.",


In [24]:
y_train_pred_ridge = ridge_reg.predict(X_train)
y_test_pred_ridge  = ridge_reg.predict(X_test)

train_rmse_ridge = root_mean_squared_error(y_train, y_train_pred_ridge)
test_rmse_ridge  = root_mean_squared_error(y_test, y_test_pred_ridge)

print("Ridge Train RMSE:", train_rmse_ridge)
print("Ridge Test RMSE :", test_rmse_ridge)

Ridge Train RMSE: 68434.99589612993
Ridge Test RMSE : 70066.02112093243


In [25]:
train_r2_ridge = ridge_reg.score(X_train, y_train)
test_r2_ridge = ridge_reg.score(X_test, y_test)

print("Ridge Train R²:", train_r2_ridge)
print("Ridge Test R²:", test_r2_ridge)

Ridge Train R²: 0.6496540247290437
Ridge Test R²: 0.625365256401925


In [26]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/linear/metrics.json")

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Ensure ridge container exists
metrics["models"].setdefault("ridge_regression", {})

# Add Ridge metrics for alpha = 1.0
metrics["models"]["ridge_regression"]["alpha_1.0"] = {
    "train": {
        "rmse": train_rmse_ridge,
        "r2": train_r2_ridge
    },
    "test": {
        "rmse": test_rmse_ridge,
        "r2": test_r2_ridge
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)

Loaded existing metrics from ../artifacts/linear/metrics.json


---