In [16]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy.stats import randint

In [3]:
housing = pd.read_csv("housing.csv")

# Preparing Data

### Creating Transformers

In [6]:
# Handling Missing Values
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Handling Categorical Data
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Handling Heavy-Tailed features
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, base=np.e):
        self.base = base
        

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X):
        return np.emath.logn(self.base, X)
    
    def get_feature_names_out(self, input_features=None):
        if input_features == None:
            input_features = self.feature_names_in_
        return [f"log_{name}" for name in input_features]

    def inverse_transform(self, X):
        return self.base ** X
    
log_transformer = LogTransformer()

# Handling Geographic features


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        
    
    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters,random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
    
cluster_simil_transformer = ClusterSimilarity(random_state=42)

# Transforming housing_median_age
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)

# Creating Additional Features (bedrooms (bedrooms per room), rooms_per_house, people_per_house)
class RatioFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self
    
    def transform(self, X):
        if X.shape[1] < 2:             
            raise ValueError("Ratio transformation requires at least two columns.")
        transformed_X = X[:, 0] / X[:, 1]
        return transformed_X.reshape(-1,1)

    def get_feature_names_out(self, input_features=None):
        if input_features == None:
            input_features = self.feature_names_in_
        else:
            return [f"additional_{name}_ratio" for name in input_features]


ratio_features_transformer = RatioFeaturesTransformer()

# Scaling
standard_scaler = StandardScaler()

### Creating Pipeline

In [7]:
deafult_number_pipeline = make_pipeline(num_imputer, standard_scaler)

# Heavy-Tailed Features
log_pipeline = make_pipeline(num_imputer, log_transformer, standard_scaler)

# Categorical Features
cat_pipeline = make_pipeline(cat_imputer, cat_encoder)

# Geographic Features
geo_pipeline = Pipeline([("cluster_simil_transformer", cluster_simil_transformer)])

# median_house_values
quantile_pipeline = make_pipeline(num_imputer, quantile_transformer, standard_scaler)

# Additional Features
ratio_features_pipeline = make_pipeline(num_imputer, ratio_features_transformer,standard_scaler) 

# Piping!
preprocessing = ColumnTransformer([
    ("bedrooms",            ratio_features_pipeline,    ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house",     ratio_features_pipeline,    ["total_rooms", "households"]),
    ("people_per_house" ,   ratio_features_pipeline,    ["population", "households"]),
    ("log",                 log_pipeline,               ["total_rooms", "total_bedrooms", "population", "households", "median_income"]),
    ("geo",                 geo_pipeline,               ["latitude", "longitude"]),
    ("cat",                 cat_pipeline,               make_column_selector(dtype_include=object)),
    ("age",                 quantile_pipeline,          ["housing_median_age"])
    ], remainder=deafult_number_pipeline)

### Splitting Data

In [17]:
features = housing.drop(["median_house_value"], axis=1)
labels = housing["median_house_value"]


# Creating a custom stratification based on the median house value
labels_binned = pd.cut(labels, bins=20, labels=[x for x in range(1,21)])

# Splitting the data into training and testing sets using stratified sampling
X, X_test, y, y_test = train_test_split(features, labels, test_size=0.2, stratify=labels_binned, random_state=42)


# Training Models

In [130]:
class Model():
    def __init__(self, preprocessor, predictor):
        self.predictor = make_pipeline(preprocessor, predictor)
        self.predictor_name = predictor.__class__.__name__

    def fit_predict(self, dataset, labels):
        self.predictor.fit(dataset, labels)
        self.predictions = self.predictor.predict(dataset)
        self.cv_rmses = -cross_val_score(self.predictor, dataset, labels, scoring="neg_root_mean_squared_error", cv=5)
        self.cv_rmse = self.cv_rmses.mean()
        self.rmse = root_mean_squared_error(labels, self.predictions)
        return f"{self.predictor_name}\tRMSE: {self.rmse:,.0f}\tCV_RMSE: {self.cv_rmse:,.0f}"



lin_reg = Model(preprocessing, LinearRegression())
tree_reg = Model(preprocessing, DecisionTreeRegressor(random_state=42))
forest_reg = Model(preprocessing, RandomForestRegressor(random_state=42))
sv_reg = Model(preprocessing, SVR(C=1.0, epsilon=0.1, kernel="rbf", degree=3, gamma="scale"))

results = []
results.append(lin_reg.fit_predict(dataset=X, labels=y))
results.append(tree_reg.fit_predict(dataset=X, labels=y))
results.append(forest_reg.fit_predict(dataset=X, labels=y))
results.append(sv_reg.fit_predict(dataset=X, labels=y))

for result in results:
    print(result)

LinearRegression	RMSE: 69,017	CV_RMSE: 75,540
DecisionTreeRegressor	RMSE: 0	CV_RMSE: 88,269
RandomForestRegressor	RMSE: 17,373	CV_RMSE: 62,389
SVR	RMSE: 117,976	CV_RMSE: 119,095


# Fine-Tuning Models

## Randomized Search

### Decision Tree Regressor Randomized Search

In [18]:
# Define the pipeline for Decision Tree Regressor
tree_pipeline = Pipeline([
    ("preprocessing", preprocessing),  # Step for preprocessing
    ("decision_tree", DecisionTreeRegressor(random_state=42))  # Decision Tree Regressor with fixed random state for reproducibility
])

# Define the parameter distributions for randomized search
tree_param_distribs = {
    "decision_tree__max_depth": randint(low=3, high=15),       # Range of possible tree depths
    "decision_tree__min_samples_split": randint(low=2, high=20), # Minimum number of samples to split an internal node
    "decision_tree__min_samples_leaf": randint(low=3, high=15),  # Minimum number of samples allowed in a leaf node
    "decision_tree__max_features": randint(low=5, high=20),      # Number of features to consider for best split
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=3, high=50)  # Number of clusters in the geo feature preprocessing step
}

# Set up RandomizedSearchCV with 5-fold cross-validation
tree_rnd_search = RandomizedSearchCV(
    tree_pipeline, param_distributions=tree_param_distribs, n_iter=10,
    cv=5, scoring='neg_root_mean_squared_error', random_state=42  # Using negative RMSE as scoring metric
)

# Fit the model to the data
tree_rnd_search.fit(X, y)

#### Utility function

In [9]:
def show_result(search):
    for param, value in search.best_params_.items():
        print(f"best {param.split('__')[-1]} found: {value}")
    print(f"best RMSE: {-search.best_score_:,.0f}")

In [19]:
show_result(tree_rnd_search)

best max_depth found: 14
best max_features found: 18
best min_samples_leaf found: 8
best min_samples_split found: 3
best n_clusters found: 23
best RMSE: 55,332


### Random Forest Randomized Search

In [21]:
# Define the pipeline for Random Forest Regressor
forest_pipeline = Pipeline([
    ("preprocessing", preprocessing),  # Step for preprocessing
    ("random_forest", RandomForestRegressor(random_state=42))  # Random Forest Regressor with fixed random state for reproducibility
])

# Define the parameter distributions for randomized search
forest_param_distribs = {
    "random_forest__max_depth": randint(low=2, high=15),         # Range of possible tree depths in the forest
    "random_forest__max_features": randint(low=1, high=10),      # Number of features to consider at each split
    "random_forest__min_samples_split": randint(low=2, high=6),  # Minimum number of samples to split an internal node
    "random_forest__min_samples_leaf": randint(low=1, high=10),  # Minimum number of samples allowed in a leaf node
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15)  # Number of clusters in geo feature preprocessing
}

# Set up RandomizedSearchCV with 5-fold cross-validation
forest_rnd_search = RandomizedSearchCV(
    forest_pipeline, param_distributions=forest_param_distribs,
    n_iter=10, cv=5, scoring="neg_root_mean_squared_error", random_state=42  # Using negative RMSE as scoring metric
)

# Fit the model to the data
forest_rnd_search.fit(X, y)

show_result(forest_rnd_search)

best n_clusters found: 10
best max_depth found: 11
best max_features found: 5
best min_samples_leaf found: 2
best min_samples_split found: 5
best RMSE: 47,584


In [22]:
# Define the pipeline for Random Forest Regressor
forest_pipeline = Pipeline([
    ("preprocessing", preprocessing),  # Step for preprocessing
    ("random_forest", RandomForestRegressor(random_state=42))  # Random Forest Regressor with fixed random state for reproducibility
])

# Define the parameter distributions for randomized search
forest_param_distribs = {
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15)  # Number of clusters in geo feature preprocessing
}

# Set up RandomizedSearchCV with 5-fold cross-validation
forest_rnd_search = RandomizedSearchCV(
    forest_pipeline, param_distributions=forest_param_distribs,
    n_iter=10, cv=5, scoring="neg_root_mean_squared_error", random_state=42  # Using negative RMSE as scoring metric
)

# Fit the model to the data
forest_rnd_search.fit(X, y)
show_result(forest_rnd_search)

### Support Vector Regressor Randomized Search

In [10]:
# Define the pipeline for Support Vector Regressor (SVR)
svr_pipeline = Pipeline([
    ("preprocessing", preprocessing),  # Step for preprocessing
    ("svr", SVR())  # SVR model without fixed random state (no random_state parameter in SVR)
])

# Define the parameter distributions for randomized search
svr_param_distribs = {
    "svr__kernel": ["linear", "poly", "rbf", "sigmoid"],         # Various kernel types to explore
    "svr__C": np.logspace(-3, 3, 7),                             # Regularization parameter, using logspace for wide range of values
    "svr__epsilon": np.logspace(-4, 0, 5),                       # Epsilon-tube parameter for the loss function
    "svr__degree": randint(low=2, high=6),                       # Degree of the polynomial kernel function (if kernel='poly')
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15)  # Number of clusters in geo feature preprocessing
}

# Set up RandomizedSearchCV with 5-fold cross-validation
svr_rnd_search = RandomizedSearchCV(
    svr_pipeline, param_distributions=svr_param_distribs,
    n_iter=10, cv=5, scoring="neg_root_mean_squared_error", random_state=42  # Using negative RMSE as scoring metric
)

# Fit the model to the data
svr_rnd_search.fit(X, y)

show_result(svr_rnd_search)

best n_clusters found: 7
best C found: 10.0
best degree found: 5
best epsilon found: 0.0001
best kernel found: linear
best RMSE: 89,334


### Linear Regressor Randomized Search

#### For Linear Regression, there are generally fewer hyperparameters to tune compared to models like Decision Trees, Random Forests, or SVR. Linear regression doesn't have parameters like max_depth or kernel, but we can still incorporate hyperparameter tuning by focusing on regularization techniques, like Ridge and Lasso regression, which are extensions of linear regression with L2 and L1 regularization, respectively.

##### SImple linear regression Randomized Search

In [271]:
# Define the pipeline for Linear Regression
linear_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("linear_regression", LinearRegression())
])

# Define the parameter distribution (Linear Regression doesn't have many hyperparameters)
linear_param_distribs = {
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15),
}

# Set up RandomizedSearchCV
linear_rnd_search = RandomizedSearchCV(
    linear_pipeline,
    param_distributions=linear_param_distribs,
    n_iter=10, cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42
)

# Fit the randomized search to the data
linear_rnd_search.fit(X, y)
show_result(linear_rnd_search)

best n_clusters found: 14
best RMSE: 75,254


##### Ridge Regression Randomized Search

In [14]:
ridge_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("ridge", Ridge())
])

ridge_param_distribs = {
    "ridge__alpha": np.logspace(-4, 4, 20),  # Regularization strength (alpha)
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15),
}

# Set up RandomizedSearchCV
ridge_rnd_search = RandomizedSearchCV(
    ridge_pipeline,
    param_distributions=ridge_param_distribs,
    n_iter=10, cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42
)

# Fit the randomized search to the data
ridge_rnd_search.fit(X, y)
show_result(ridge_rnd_search)

best n_clusters found: 14
best alpha found: 78.47599703514607
best RMSE: 74,923


##### Lasso Regression Randomized Search

In [269]:
# Define the pipeline for Lasso Regression
lasso_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("scaler", StandardScaler()),
    ("lasso", LassoCV(cv=5, random_state=42, max_iter=10000, tol=1e-3))
])

# Define the parameter distribution for randomized search
lasso_param_distribs = {
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15),
}

# Set up RandomizedSearchCV
lasso_rnd_search = RandomizedSearchCV(
    lasso_pipeline,
    param_distributions=lasso_param_distribs,
    n_iter=10, cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42
)

# Fit the randomized search to the data
lasso_rnd_search.fit(X, y)
show_result(lasso_rnd_search)

best n_clusters found: 11
best RMSE: 75,112


## Grid Search

### Decision Tree Regressor Grid Search

In [257]:
# These parameters are fine-tuned based on results from the previous random search
tree_param_grid = [
    {
        "decision_tree__max_depth": [9, 11], # Max depth of the tree: limits how deep the tree can grow
        "decision_tree__max_features": [13, 14], # Number of features to consider at each split
        "decision_tree__min_samples_leaf": [5, 6],  # Controls overfitting by ensuring minimum samples at leaf nodes
        "decision_tree__min_samples_split": [11, 12], # Ensures a node can only split if it has enough samples
        "preprocessing__geo__cluster_simil_transformer__n_clusters": [12, 13],
    }
]

# This is a follow-up to the random search, aiming to find the optimal parameters in a more focused manner
tree_grid_search = GridSearchCV(
    tree_pipeline,  # Pipeline that includes preprocessing and model
    tree_param_grid,  # Parameter grid to search through
    refit=True,  # Refits the best model found on the entire dataset after grid search
    scoring="neg_root_mean_squared_error",  # Scoring metric to evaluate performance
    cv=5,  # Number of cross-validation folds
    error_score="raise"  # Raise an error if fitting fails for a given parameter combination
)

tree_grid_search.fit(X, y)
show_result(tree_grid_search)

best max_depth found: 9
best max_features found: 13
best min_samples_leaf found: 6
best min_samples_split found: 11
best n_clusters found: 12
best RMSE: 73,016


In [258]:
(tree_md, tree_mf, tree_msl, tree_mss, tree_nc) = (0, 0, 0, 0, 0)  # Initial parameter values

tree_best_params = tree_grid_search.best_params_  # Retrieve best parameters from the initial grid search

# Initialize flag to control the loop (set to False initially to ensure loop runs)
flag = False  

while not flag:
    # Update parameters with the best ones found so far using max() to ensure valid values
    tree_md = max(tree_best_params['decision_tree__max_depth'], 2)  # Ensure max_depth is at least 1 (we use [tree_med-1,...] in the param grid)
    tree_mss = max(tree_best_params['decision_tree__min_samples_split'], 3)  # Ensure min_samples_split is at least 2
    tree_msl = max(tree_best_params['decision_tree__min_samples_leaf'], 2)  # Ensure min_samples_leaf is at least 1
    tree_mf = max(tree_best_params['decision_tree__max_features'], 2)  # Ensure max_features is at least 1
    tree_nc = max(tree_best_params['preprocessing__geo__cluster_simil_transformer__n_clusters'], 3)  # Ensure n_clusters is at least 2
    
    # Define the new, smaller grid around the current best parameters
    tree_param_grid = [
        {
            "decision_tree__max_depth": [tree_md-1, tree_md, tree_md+1],     
            "decision_tree__min_samples_split": [tree_mss-1, tree_mss, tree_mss+1],     
            "decision_tree__min_samples_leaf": [tree_msl-1, tree_msl, tree_msl+1],     
            "decision_tree__max_features": [tree_mf-1, tree_mf, tree_mf+1],
            "preprocessing__geo__cluster_simil_transformer__n_clusters": [tree_nc-1, tree_nc, tree_nc+1],
        }
    ]
    
    # Perform a new grid search with the refined parameter grid
    tree_grid_search = GridSearchCV(
        tree_pipeline, tree_param_grid, refit=True, scoring="neg_root_mean_squared_error", cv=5, error_score="raise"
    )
    
    tree_grid_search.fit(X, y)
    
    # Get the new best parameters after this iteration
    new_best_params = tree_grid_search.best_params_
    
    # Check if parameters have stopped changing (i.e., they are equal to the previous best)
    flag = tree_best_params == new_best_params
    
    # Update the best parameters for the next iteration if flag is still False
    tree_best_params = new_best_params

# After the loop exits, we will have the most fine-tuned parameters in tree_best_params
show_result(tree_grid_search)

Best parameters after refinement: {'decision_tree__max_depth': 10, 'decision_tree__max_features': 14, 'decision_tree__min_samples_leaf': 5, 'decision_tree__min_samples_split': 12, 'preprocessing__geo__cluster_simil_transformer__n_clusters': 12}


### Random Forest Grid Search

In [273]:
# These parameters are fine-tuned based on results from the previous Random Search for Random Forest
forest_param_grid = [
    {
        "random_forest__max_depth": [9, 11],
        "random_forest__max_features": [4, 5],
        "random_forest__min_samples_leaf": [2, 3], 
        "random_forest__min_samples_split": [4, 5],
        "preprocessing__geo__cluster_simil_transformer__n_clusters": [10, 8],
    }
]

# Initialize GridSearchCV to perform grid search over the specified parameters
forest_grid_search = GridSearchCV(
    forest_pipeline,
    forest_param_grid,
    refit=True,
    scoring="neg_root_mean_squared_error",  # cross-validation is being applied to evaluate the model's performance
                                            #using the negative root mean squared error (RMSE) as the scoring metric
    cv=5,  # Number of cross-validation folds
    error_score="raise"  # Raise an error if fitting fails for a given parameter combination
)

forest_grid_search.fit(X, y)
show_result(forest_grid_search)
#13m 33.4s

best n_clusters found: 8
best max_depth found: 11
best max_features found: 5
best min_samples_leaf found: 2
best min_samples_split found: 4
best RMSE: 65,947


#### The following code would take too much time to run, so I only wrote it to provide another step in the fune-tuning process

In [274]:
(forest_md, forest_mf, forest_msl, forest_mss, forest_nc) = (0, 0, 0, 0, 0) # Initial parameter values

# Retrieve the best parameters from the previous grid search
forest_best_params = forest_grid_search.best_params_

# Initialize flag to control the loop (set to False initially to ensure loop runs)
flag = False 

# Loop until no better parameters are found
while not flag:
    # Update parameter values based on the best parameters found
    forest_md = max(forest_best_params['random_forest__max_depth'], 2)
    forest_mss = max(forest_best_params['random_forest__min_samples_split'], 3)
    forest_msl = max(forest_best_params['random_forest__min_samples_leaf'], 2)
    forest_mf = max(forest_best_params['random_forest__max_features'], 2)
    forest_nc = max(forest_best_params['preprocessing__geo__cluster_simil_transformer__n_clusters'], 3)
    
    # Define the parameter grid for further optimization
    forest_param_grid = [
        {
            # Explore values for max_depth around the best found value
            "random_forest__max_depth": [forest_md - 1, forest_md, forest_md + 1],     
            # Explore values for min_samples_split around the best found value
            "random_forest__min_samples_split": [forest_mss - 1, forest_mss, forest_mss + 1],     
            # Explore values for min_samples_leaf around the best found value
            "random_forest__min_samples_leaf": [forest_msl - 1, forest_msl, forest_msl + 1],     
            # Explore values for max_features around the best found value
            "random_forest__max_features": [forest_mf - 1, forest_mf, forest_mf + 1],
            # Explore values for n_clusters for the geo clustering transformer
            "preprocessing__geo__cluster_simil_transformer__n_clusters": [forest_nc - 1, forest_nc, forest_nc + 1],
        }
    ]

    # Initialize GridSearchCV to perform a grid search over the updated parameters
    forest_grid_search = GridSearchCV(
        forest_pipeline,  # The pipeline that includes preprocessing and the model
        forest_param_grid,  # The parameter grid to search through
        refit=True,  # Refits the best model found on the entire dataset after grid search
        scoring="neg_root_mean_squared_error",  # Scoring metric for evaluating performance
        cv=5,  # Number of cross-validation folds
        error_score="raise"  # Raise an error if fitting fails for a given parameter combination
    )
    
    # Fit the grid search on the provided dataset (X, y)
    forest_grid_search.fit(X, y)

   # Get the new best parameters after this iteration
    new_best_params = forest_grid_search.best_params_
    
    # Check if parameters have stopped changing (i.e., they are equal to the previous best)
    flag = forest_best_params == new_best_params
    
    # Update the best parameters for the next iteration if flag is still False
    forest_best_params = new_best_params
    show_result(forest_grid_search)

best n_clusters found: 9
best max_depth found: 12
best max_features found: 6
best min_samples_leaf found: 1
best min_samples_split found: 3
best RMSE: 64,096
best n_clusters found: 9
best max_depth found: 12
best max_features found: 7
best min_samples_leaf found: 2
best min_samples_split found: 2
best RMSE: 63,000
best n_clusters found: 9
best max_depth found: 12
best max_features found: 8
best min_samples_leaf found: 1
best min_samples_split found: 3
best RMSE: 62,763


#### Although the algorithm could not complete its execution, the first three iterations of the loop were successful.

### Support Vector Regressor Grid Search

In [12]:
# Use the best parameters from RandomizedSearchCV
best_n_clusters = 7
best_C = 10.0
best_degree = 5
best_epsilon = 0.0001
best_kernel = 'linear'

# Define the parameter grid for GridSearchCV
svr_param_grid = {
    "svr__kernel": [best_kernel],  # Fixed to the best kernel ('linear' in this case)
    "svr__C": [best_C / 2, best_C, best_C * 2],  # Refine around the best C found
    "svr__epsilon": [best_epsilon / 10, best_epsilon, best_epsilon * 10],  # Fine-tune epsilon
    "svr__degree": [best_degree - 1, best_degree, best_degree + 1],  # Only matters for 'poly', kept for generality
    "preprocessing__geo__cluster_simil_transformer__n_clusters": [best_n_clusters - 1, best_n_clusters, best_n_clusters + 1]  # Narrow search for clusters
}

# Set up GridSearchCV with 5-fold cross-validation
svr_grid_search = GridSearchCV(
    svr_pipeline, param_grid=svr_param_grid,
    scoring="neg_root_mean_squared_error", cv=5, error_score="raise"
)

# Fit the model to the data
svr_grid_search.fit(X, y)

# Display the results
show_result(svr_grid_search)

best n_clusters found: 6
best C found: 20.0
best degree found: 4
best epsilon found: 0.001
best kernel found: linear
best RMSE: 84,432


### Linear Regression (Ridge) Grid Search 

In [15]:
# Use initial values for parameters to ensure the while loop runs
ridge_alpha, ridge_nc = (0, 0)  # Initial values for alpha and n_clusters

# Retrieve best parameters from RandomizedSearchCV
ridge_best_params = ridge_rnd_search.best_params_

# Define a flag to check if parameters have changed
flag = (ridge_alpha == ridge_best_params['ridge__alpha'] and 
        ridge_nc == ridge_best_params['preprocessing__geo__cluster_simil_transformer__n_clusters'])

while not flag:
    # Update the parameters
    ridge_alpha = ridge_best_params['ridge__alpha']
    ridge_nc = ridge_best_params['preprocessing__geo__cluster_simil_transformer__n_clusters']

    # Define the parameter grid for GridSearchCV
    ridge_param_grid = {
        "ridge__alpha": [ridge_alpha / 2, ridge_alpha, ridge_alpha * 2],  # Refine around the best alpha found
        "preprocessing__geo__cluster_simil_transformer__n_clusters": [ridge_nc - 1, ridge_nc, ridge_nc + 1]  # Narrow search for clusters
    }

    # Set up GridSearchCV with 5-fold cross-validation
    ridge_grid_search = GridSearchCV(
        ridge_pipeline, param_grid=ridge_param_grid,
        scoring="neg_root_mean_squared_error", cv=5, error_score="raise"
    )

    # Fit the model to the data
    ridge_grid_search.fit(X, y)

    # Update best parameters
    ridge_best_params = ridge_grid_search.best_params_

    # Check if the best parameters have changed
    flag = (ridge_alpha == ridge_best_params['ridge__alpha'] and 
            ridge_nc == ridge_best_params['preprocessing__geo__cluster_simil_transformer__n_clusters'])

show_result(ridge_grid_search)

best n_clusters found: 14
best alpha found: 78.47599703514607
best RMSE: 74,923
