In [21]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy.stats import randint

In [22]:
housing = pd.read_csv("housing.csv")

# Preparing Data

### Creating Transformers

In [23]:
# Handling Missing Values
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Handling Categorical Data
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Handling Heavy-Tailed features
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, base=np.e):
        self.base = base
        

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X):
        return np.emath.logn(self.base, X)
    
    def get_feature_names_out(self, input_features=None):
        if input_features == None:
            input_features = self.feature_names_in_
        return [f"log_{name}" for name in input_features]

    def inverse_transform(self, X):
        return self.base ** X
    
log_transformer = LogTransformer()

# Handling Geographic features


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        
    
    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters,random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
    
cluster_simil_transformer = ClusterSimilarity(random_state=42)

# Transforming housing_median_age
quantile_transformer = QuantileTransformer(output_distribution="normal", random_state=42)

# Creating Additional Features (bedrooms (bedrooms per room), rooms_per_house, people_per_house)
class RatioFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self
    
    def transform(self, X):
        if X.shape[1] < 2:             
            raise ValueError("Ratio transformation requires at least two columns.")
        transformed_X = X[:, 0] / X[:, 1]
        return transformed_X.reshape(-1,1)

    def get_feature_names_out(self, input_features=None):
        if input_features == None:
            input_features = self.feature_names_in_
        else:
            return [f"additional_{name}_ratio" for name in input_features]


ratio_features_transformer = RatioFeaturesTransformer()

# Scaling
standard_scaler = StandardScaler()

### Creating Pipeline

In [24]:
deafult_number_pipeline = make_pipeline(num_imputer, standard_scaler)

# Heavy-Tailed Features
log_pipeline = make_pipeline(num_imputer, log_transformer, standard_scaler)

# Categorical Features
cat_pipeline = make_pipeline(cat_imputer, cat_encoder)

# Geographic Features
geo_pipeline = Pipeline([("cluster_simil_transformer", cluster_simil_transformer)])

# median_house_values
quantile_pipeline = make_pipeline(num_imputer, quantile_transformer, standard_scaler)

# Additional Features
ratio_features_pipeline = make_pipeline(num_imputer, ratio_features_transformer,standard_scaler) 

# Piping!
preprocessing = ColumnTransformer([
    ("bedrooms",            ratio_features_pipeline,    ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house",     ratio_features_pipeline,    ["total_rooms", "households"]),
    ("people_per_house" ,   ratio_features_pipeline,    ["population", "households"]),
    ("log",                 log_pipeline,               ["total_rooms", "total_bedrooms", "population", "households", "median_income"]),
    ("geo",                 geo_pipeline,               ["latitude", "longitude"]),
    ("cat",                 cat_pipeline,               make_column_selector(dtype_include=object)),
    ("age",                 quantile_pipeline,          ["housing_median_age"])
    ], remainder=deafult_number_pipeline)

### Splitting Data

In [25]:
X = housing.drop(["median_house_value"], axis=1)
y = housing["median_house_value"]

# Training Models

In [130]:
class Model():
    def __init__(self, preprocessor, predictor):
        self.predictor = make_pipeline(preprocessor, predictor)
        self.predictor_name = predictor.__class__.__name__

    def fit_predict(self, dataset, labels):
        self.predictor.fit(dataset, labels)
        self.predictions = self.predictor.predict(dataset)
        self.cv_rmses = -cross_val_score(self.predictor, dataset, labels, scoring="neg_root_mean_squared_error", cv=5)
        self.cv_rmse = self.cv_rmses.mean()
        self.rmse = root_mean_squared_error(labels, self.predictions)
        return f"{self.predictor_name}\tRMSE: {self.rmse:,.0f}\tCV_RMSE: {self.cv_rmse:,.0f}"



lin_reg = Model(preprocessing, LinearRegression())
tree_reg = Model(preprocessing, DecisionTreeRegressor(random_state=42))
forest_reg = Model(preprocessing, RandomForestRegressor(random_state=42))
sv_reg = Model(preprocessing, SVR(C=1.0, epsilon=0.1, kernel="rbf", degree=3, gamma="scale"))

results = []
results.append(lin_reg.fit_predict(dataset=X, labels=y))
results.append(tree_reg.fit_predict(dataset=X, labels=y))
results.append(forest_reg.fit_predict(dataset=X, labels=y))
results.append(sv_reg.fit_predict(dataset=X, labels=y))

for result in results:
    print(result)

LinearRegression	RMSE: 69,017	CV_RMSE: 75,540
DecisionTreeRegressor	RMSE: 0	CV_RMSE: 88,269
RandomForestRegressor	RMSE: 17,373	CV_RMSE: 62,389
SVR	RMSE: 117,976	CV_RMSE: 119,095


# Fine-Tuning Models

## Randomized Search

### Decision Tree Regressor Randomized Search

In [14]:
tree_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("decision_tree", DecisionTreeRegressor(random_state=42))
])


tree_param_distribs = {
     "decision_tree__max_depth": randint(low=3, high=15),     
     "decision_tree__min_samples_split": randint(low=2, high=20),     
     "decision_tree__min_samples_leaf": randint(low=3, high=15),     
     "decision_tree__max_features": randint(low=5, high=20),
     "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=3, high=50),
}


tree_rnd_search = RandomizedSearchCV(
    tree_pipeline,
    param_distributions=tree_param_distribs, n_iter=10,
    cv=5, scoring='neg_root_mean_squared_error',
    random_state=42
)


tree_rnd_search.fit(X, y)

In [15]:
def show_result_tree(search):
    for param, value in search.best_params_.items():
        print(f"best {param.split('__')[-1]} found: {value}")
    print(f"best RMSE: {-search.best_score_:,.0f}")
    
    
show_result_tree(tree_rnd_search)

best max_depth found: 9
best max_features found: 14
best min_samples_leaf found: 5
best min_samples_split found: 12
best n_clusters found: 13
best RMSE: 74,903


### Random Forest Randomized Search

In [26]:
forest_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forset", RandomForestRegressor()),
])

forest_param_distribs = {
    "random_forest__max_depth": randint(low=2, high=15),
    "random_forest__max_features": randint(low=1, high=10),
    "random_forest__min_sample_split": randint(low=2, high=6),
    "random_forest__min_saple_leaf": randint(low=1, high=10),
    "preprocessing__geo__cluster_simil_transformer__n_clusters": randint(low=2, high=15),
}

forest_rnd_search = RandomizedSearchCV(
    forest_pipeline,
    param_distributions=forest_param_distribs,
    n_iter=10, cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42
    )

forest_rnd_search.fit(X, y)

ValueError: Invalid parameter 'random_forest' for estimator Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ratiofeaturestransformer',
                                                                   RatioFeaturesTransformer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())...
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7d526db4bee0>),
                                                 ('age',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('quantiletransformer',
                                                                   QuantileTransformer(output_distribution='normal',
                                                                                       random_state=42)),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['housing_median_age'])])),
                ('random_forset', RandomForestRegressor())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [216]:
y

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

## Grid Search

### Decision Tree Regressor Grid Search

In [16]:
param_grid = [
    {
        "decision_tree__max_depth": [9, 11],
        "decision_tree__max_features": [13, 14],
        "decision_tree__min_samples_leaf": [5, 6], 
        "decision_tree__min_samples_split": [11, 12],     
        "preprocessing__geo__cluster_simil_transformer__n_clusters": [12, 13],
    }
]

tree_grid_search = GridSearchCV(tree_pipeline , param_grid, refit=True, scoring="neg_root_mean_squared_error", cv=5, error_score="raise")
tree_grid_search.fit(X, y)

In [17]:
show_result_tree(tree_grid_search)

best max_depth found: 9
best max_features found: 13
best min_samples_leaf found: 6
best min_samples_split found: 11
best n_clusters found: 12
best RMSE: 73,016


In [18]:
(md, mf, msl, mss, nc) = (0,0,0,0,0) #Initial values for the params to make sure that the while loop will run

bp = tree_grid_search.best_params_ #bp (best parameters)

flag = (md == bp['decision_tree__max_depth'] and mss == bp['decision_tree__min_samples_split'] and msl ==bp['decision_tree__min_samples_leaf'] and mf == bp['decision_tree__max_features'] and nc == bp['preprocessing__geo__cluster_simil_transformer__n_clusters'])
while(not flag):
    md = bp['decision_tree__max_depth']
    mss = bp['decision_tree__min_samples_split']
    msl = bp['decision_tree__min_samples_leaf']
    mf = bp['decision_tree__max_features']
    nc = bp['preprocessing__geo__cluster_simil_transformer__n_clusters']
    param_grid = [
        {
            "decision_tree__max_depth": [md-1, md, md+1],     
            "decision_tree__min_samples_split": [mss-1, mss, mss+1],     
            "decision_tree__min_samples_leaf": [msl-1, msl, msl+1],     
            "decision_tree__max_features": [mf-1, mf,  mf+1],
            "preprocessing__geo__cluster_simil_transformer__n_clusters": [nc-1, nc, nc+1],
        }
    ]

    tree_grid_search = GridSearchCV(tree_pipeline , param_grid, refit=True, scoring="neg_root_mean_squared_error", cv=5, error_score="raise")
    tree_grid_search.fit(X, y)
    bp = tree_grid_search.best_params_
    flag = (md == bp['decision_tree__max_depth'] and mss == bp['decision_tree__min_samples_split'] and msl ==bp['decision_tree__min_samples_leaf'] and mf == bp['decision_tree__max_features'] and nc == bp['preprocessing__geo__cluster_simil_transformer__n_clusters'])

In [20]:
show_result_tree(tree_grid_search)

best max_depth found: 10
best max_features found: 14
best min_samples_leaf found: 5
best min_samples_split found: 12
best n_clusters found: 12
best RMSE: 71,657
