In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import joblib

ROOT = Path("..").resolve()
DATA_PROC = ROOT / "data" / "train" / "housing_train_processed.csv"  # use if fully numeric
DATA_RAW  = ROOT / "data" / "train" / "housing_train.csv"            # fallback
MODEL_OUT = ROOT / "models" / "svr_model.pkl"


In [2]:
# Prefer processed if it's numeric; else fallback to raw
try:
    housing = pd.read_csv(DATA_PROC)
    if len(housing.select_dtypes(include="object").columns) > 0:
        raise ValueError("Processed file still has object dtypes.")
except Exception:
    print("[INFO] Using RAW; will one-hot encode inside the pipeline.")
    housing = pd.read_csv(DATA_RAW)

housing.head()


[INFO] Using RAW; will one-hot encode inside the pipeline.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [3]:
y = housing["median_house_value"].copy()
X = housing.drop(columns=["median_house_value"])

# If 'ocean_proximity' still exists as strings, we'll handle it with get_dummies() here
# (Alternatively, you can one-hot inside the pipeline with ColumnTransformer; this is simpler.)
if "ocean_proximity" in X.columns and X["ocean_proximity"].dtype == "object":
    X = pd.get_dummies(X, columns=["ocean_proximity"])

X.shape, y.shape


((16512, 13), (16512,))

In [4]:
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("svr", SVR())  # default: rbf kernel, C=1.0, gamma='scale'
])

scores = cross_val_score(pipe, X, y,
                         scoring="neg_mean_squared_error",
                         cv=5, n_jobs=-1)
rmse = np.sqrt(-scores)
print("Baseline SVR 5-fold CV RMSE  mean:", rmse.mean(), " std:", rmse.std())


Baseline SVR 5-fold CV RMSE  mean: 118574.95234732796  std: 856.8782557106289


In [5]:
from scipy.stats import loguniform

rand = RandomizedSearchCV(
    estimator=pipe,
    param_distributions={
        "svr__kernel": ["linear", "rbf"],
        "svr__C": loguniform(1e-2, 1e2),     # sample C ~ log-uniform(0.01, 100)
        # gamma is used for 'rbf'; harmless for 'linear'
        "svr__gamma": ["scale", "auto"]
    },
    n_iter=30,               # try 20–50 depending on speed
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
rand.fit(X, y)

best_rmse = np.sqrt(-rand.best_score_)
print("Best params:", rand.best_params_)
print("Best CV RMSE:", best_rmse)

best_svr = rand.best_estimator_


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best params: {'svr__C': np.float64(8.471801418819979), 'svr__gamma': 'scale', 'svr__kernel': 'linear'}
Best CV RMSE: 83445.14147258109
