In [105]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


In [88]:
# Load the raw housing data; the path is relative to the notebook's working directory.
housing = pd.read_csv("housing.csv")

# Preparing Data

### Creating Transformers

In [89]:
# Handling Missing Values
# Numeric columns are filled with the column median, categorical columns with
# the most frequent value.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Handling Categorical Data
# One-hot encode categories into a dense array (sparse_output=False);
# handle_unknown="ignore" keeps transform from raising on categories that
# were not present during fit.
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Handling Heavy-Tailed features
class LogTransformer(BaseEstimator, TransformerMixin):
    """Element-wise logarithm transformer for heavy-tailed features.

    Parameters
    ----------
    base : float, default=np.e
        Base of the logarithm applied in ``transform`` and undone in
        ``inverse_transform``.

    Attributes
    ----------
    feature_names_in_ : list of str
        Column names seen during ``fit``. When ``X`` is not a DataFrame,
        placeholder names ``feature_0``, ``feature_1``, ... are generated.
    """

    def __init__(self, base=np.e):
        self.base = base

    def fit(self, X, y=None):
        """Record the input feature names; nothing is learned from the values.

        Returns
        -------
        self
        """
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X):
        """Return the base-``self.base`` logarithm of every element of ``X``."""
        # NOTE(review): np.emath.logn switches to complex output for negative
        # inputs and zeros map to -inf; callers are presumably expected to
        # pass strictly positive data - confirm.
        return np.emath.logn(self.base, X)

    def get_feature_names_out(self, input_features=None):
        """Return output names: each input name prefixed with ``log_``.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Names to use; falls back to the names recorded during ``fit``.
        """
        # Identity check, not ``== None``: scikit-learn hands an ndarray of
        # names to this method, and ``ndarray == None`` is evaluated
        # element-wise, making the ``if`` raise "truth value of an array is
        # ambiguous".
        if input_features is None:
            input_features = self.feature_names_in_
        return [f"log_{name}" for name in input_features]

    def inverse_transform(self, X):
        """Undo ``transform`` by raising ``self.base`` to the power ``X``."""
        return self.base ** X
    
# Natural-log instance (default base=np.e) used as a step in log_pipeline.
log_transformer = LogTransformer()

# Handling Geographic features


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Turn samples into RBF similarities to k-means cluster centres.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of k-means clusters, and therefore of output features.
    gamma : float, default=1.0
        ``gamma`` argument forwarded to ``rbf_kernel``.
    random_state : int or None, default=None
        Seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit a fresh KMeans model on ``X`` and keep it as ``kmeans_``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centres."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; the ``names`` argument is not used."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names
    
# Geographic similarity features with the class defaults (n_clusters=10,
# gamma=1.0) and a fixed seed for reproducible clustering.
cluster_simil_transformer = ClusterSimilarity(random_state=42)

# Transforming housing_median_age
# Maps the feature onto a normal output distribution; seeded for reproducibility.
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)

# Creating Additional Features (bedrooms (bedrooms per room), rooms_per_house, people_per_house)
class RatioFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Build one ratio feature: first input column divided by the second.

    Only the first two columns of the input are used; the output always has
    exactly one column.

    Attributes
    ----------
    feature_names_in_ : list of str
        Column names seen during ``fit``. When ``X`` is not a DataFrame,
        placeholder names ``feature_0``, ``feature_1``, ... are generated.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the input feature names; nothing is learned from the values.

        Returns
        -------
        self
        """
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_list()
        else:
            self.feature_names_in_ = [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X):
        """Return ``X[:, 0] / X[:, 1]`` as a column vector of shape (n, 1).

        Raises
        ------
        ValueError
            If ``X`` has fewer than two columns.
        """
        # Accept DataFrames as well as arrays: positional ``X[:, 0]`` indexing
        # is only valid on ndarrays.
        X = np.asarray(X)
        if X.shape[1] < 2:
            raise ValueError("Ratio transformation requires at least two columns.")
        ratio = X[:, 0] / X[:, 1]
        return ratio.reshape(-1, 1)

    def get_feature_names_out(self, input_features=None):
        """Return the single output feature name.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Names of the input columns (numerator first, denominator second);
            falls back to the names recorded during ``fit``.

        Returns
        -------
        list of str
            One name, matching the single column produced by ``transform``.

        Raises
        ------
        ValueError
            If fewer than two input names are available.
        """
        # Identity check: ``ndarray == None`` is element-wise and cannot be
        # used as an ``if`` condition.
        if input_features is None:
            input_features = self.feature_names_in_
        if len(input_features) < 2:
            raise ValueError("Ratio transformation requires at least two columns.")
        numerator, denominator = input_features[0], input_features[1]
        # Exactly one name, because transform() emits exactly one column;
        # returning one name per input column would not match the output width.
        return [f"additional_{numerator}_per_{denominator}_ratio"]


# Instance used as the ratio step of ratio_features_pipeline.
ratio_features_transformer = RatioFeaturesTransformer()

# Scaling
# Shared StandardScaler instance placed at the end of the numeric pipelines.
standard_scaler = StandardScaler()

### Creating Pipeline

In [90]:
# Fallback for numeric columns not claimed by any named transformer below:
# median imputation followed by standard scaling.
# NOTE: the name is misspelled ("deafult"); it is left as-is because other
# cells may reference it.
deafult_number_pipeline = make_pipeline(num_imputer, standard_scaler)

# Heavy-Tailed Features
log_pipeline = make_pipeline(num_imputer, log_transformer, standard_scaler)

# Categorical Features
cat_pipeline = make_pipeline(cat_imputer, cat_encoder)

# Geographic Features
geo_pipeline = make_pipeline(cluster_simil_transformer)

# housing_median_age (quantile-transformed, then scaled)
quantile_pipeline = make_pipeline(num_imputer, quantile_transformer, standard_scaler)

# Additional Features
ratio_features_pipeline = make_pipeline(num_imputer, ratio_features_transformer,standard_scaler) 

# Piping!
# Each entry is (name, transformer, columns). For the ratio entries the column
# order matters: the first column is the numerator, the second the denominator.
# NOTE(review): the same ratio_features_pipeline object is listed three times;
# this presumably relies on ColumnTransformer cloning each transformer when
# fitting - confirm.
preprocessing = ColumnTransformer([
    ("bedrooms",            ratio_features_pipeline,    ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house",     ratio_features_pipeline,    ["total_rooms", "households"]),
    ("people_per_house" ,   ratio_features_pipeline,    ["population", "households"]),
    ("log",                 log_pipeline,               ["total_rooms", "total_bedrooms", "population", "households", "median_income"]),
    ("geo",                 geo_pipeline,               ["latitude", "longitude"]),
    ("cat",                 cat_pipeline,               make_column_selector(dtype_include=object)),
    ("age",                 quantile_pipeline,          ["housing_median_age"])
    ], remainder=deafult_number_pipeline)

### Splitting Data

In [102]:
# Predictors: every column except the target.
X = housing.drop(columns=["median_house_value"])
# Target: the value the models learn to predict.
y = housing["median_house_value"]

# Training Models

In [103]:
class Model:
    """Couple a preprocessor with a regressor and report its RMSE scores.

    Parameters
    ----------
    preprocessor : transformer
        First step of the pipeline.
    predictor : estimator
        Final step of the pipeline; its class name labels the report.
    """

    def __init__(self, preprocessor, predictor):
        self.predictor = make_pipeline(preprocessor, predictor)
        self.predictor_name = type(predictor).__name__

    def fit_predict(self, dataset, labels):
        """Fit on ``dataset``/``labels`` and return a one-line score summary.

        Stores ``predictions``, ``cv_rmses``, ``cv_rmse`` and ``rmse`` on the
        instance. ``rmse`` is measured on the same data the pipeline was
        fitted on (a training score); ``cv_rmse`` is the mean over 5
        cross-validation folds.
        """
        pipeline = self.predictor
        pipeline.fit(dataset, labels)
        self.predictions = pipeline.predict(dataset)

        # The scorer returns negated RMSE values, so flip the sign back.
        negated_fold_scores = cross_val_score(
            pipeline,
            dataset,
            labels,
            scoring="neg_root_mean_squared_error",
            cv=5,
        )
        self.cv_rmses = -negated_fold_scores
        self.cv_rmse = self.cv_rmses.mean()

        self.rmse = root_mean_squared_error(labels, self.predictions)
        return f"{self.predictor_name}\tRMSE: {self.rmse:,.0f}\tCV_RMSE: {self.cv_rmse:,.0f}"


# One Model wrapper per candidate regressor; all of them share `preprocessing`.
lin_reg = Model(preprocessing, LinearRegression())
tree_reg = Model(preprocessing, DecisionTreeRegressor(random_state=42))
forest_reg = Model(preprocessing, RandomForestRegressor(random_state=42))
sv_reg = Model(preprocessing, SVR(C=1.0, epsilon=0.1, kernel="rbf", degree=3, gamma="scale"))

# Fit and score the candidates in a fixed order, collecting each summary line.
results = []
for candidate in (lin_reg, tree_reg, forest_reg, sv_reg):
    results.append(candidate.fit_predict(dataset=X, labels=y))

for result in results:
    print(result)

LinearRegression:	RMSE: 69,020	CV_RMSE: 75,552
DecisionTreeRegressor:	RMSE: 0	CV_RMSE: 88,310
RandomForestRegressor:	RMSE: 17,372	CV_RMSE: 62,387
SVR:	RMSE: 117,976	CV_RMSE: 119,095


# Fine-Tuning Models

### Grid Search

'SVR: \tRMSE: 117,976\tCV_RMSE: 119,095'