In [1]:
import os

import pandas as pd
import numpy as np
import os

df = pd.read_csv('housing.csv')



df["income_cat"] = pd.cut(df["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, stratify=df['income_cat'], random_state=42)
train.drop('income_cat', axis=1, inplace=True)
test.drop('income_cat',  axis=1, inplace=True)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class Preprocessor(BaseEstimator, TransformerMixin):
    # Train our custom preprocessors
    numerical_columns = [
        'longitude',
        'latitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
    ]
    categorical_columns = [
        'ocean_proximity'
    ]

    def fit(self, X, y=None):

        # Create and fit simple imputer
        self.imputer = SimpleImputer(strategy='median')
        self.imputer.fit(X[self.numerical_columns])

        # Create and fit Standard Scaler
        self.scaler = StandardScaler()
        self.scaler.fit(X[self.numerical_columns])

        # Create and fit one hot encoder
        self.onehot = OneHotEncoder(handle_unknown='ignore')
        self.onehot.fit(X[self.categorical_columns])

        return self

    def transform(self, X):

        # Apply simple imputer
        imputed_cols = self.imputer.transform(X[self.numerical_columns])
        onehot_cols = self.onehot.transform(X[self.categorical_columns])

        # Copy the df
        transformed_df = X.copy()

        # Apply transformed columns
        transformed_df[self.numerical_columns] = imputed_cols
        transformed_df[self.numerical_columns] = self.scaler.transform(transformed_df[self.numerical_columns])

        # Drop existing categorical columns and replace with one hot equivalent
        transformed_df = transformed_df.drop(self.categorical_columns, axis=1)
        transformed_df[self.onehot.get_feature_names_out()] = onehot_cols.toarray().astype(int)

        return transformed_df


from sklearn.pipeline import make_pipeline

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [5]:
RandomForestRegressor?

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from xgboost.sklearn import XGBRegressor

y_train = train['median_house_value']
X_train = train.drop('median_house_value', axis=1)

y_test = test['median_house_value']
X_test = test.drop('median_house_value', axis=1)

pipelines = {
    'ridge': make_pipeline(Preprocessor(), Ridge()),
    'rf': make_pipeline(Preprocessor(), RandomForestRegressor()),
    'gb': make_pipeline(Preprocessor(), GradientBoostingRegressor()),
    # 'xg': make_pipeline(Preprocessor(), XGBRegressor()),
}


grid = {
    'ridge':{'ridge__alpha':[0.05, 0.25, 0.5, 1.0]},
    'rf':{
        'randomforestregressor__n_estimators':[50,100,150],
        'randomforestregressor__max_depth':[5,6,7,None]
    },
    'gb':{
        'gradientboostingregressor__n_estimators':[50,100,150],
        'gradientboostingregressor__max_depth':[5,6,7, None]
    },
    # 'xg':{
    #    'xgbregressor__n_estimators':[50,100,150],
    #    'xgbregressor__max_depth':[5,6,7,None]
    # }
}

In [7]:
from sklearn.model_selection import GridSearchCV
import time
total_start = time.monotonic()
fit_models = {}
for algo, pipeline in pipelines.items():
    try:
        print(algo)
        start = time.monotonic()
        model = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10, scoring='r2')
        model.fit(X_train, y_train)
        fit_models[algo] = model
        end = time.monotonic()
        print(algo, end-start)
    except Exception as e:
        print(f'Model {algo} had an error {e}')

end = time.monotonic()
print(end-total_start)

ridge
ridge 4.012654983000175
rf
rf 560.2899177019999
gb
gb 856.9477929930001
1421.2571128270001


In [32]:
print(fit_models['ridge'].best_estimator_[1], fit_models['ridge'].best_score_)
print(fit_models['rf'].best_estimator_[1], fit_models['rf'].best_score_)
print(fit_models['gb'].best_estimator_[1], fit_models['gb'].best_score_)



Ridge(alpha=0.25) 0.6402134848239246
RandomForestRegressor(n_estimators=150) 0.8171769026724179
GradientBoostingRegressor(max_depth=7, n_estimators=150) 0.8294081050646742


In [25]:
fit_models['rf'].best_estimator_[1].feature_names_in_

array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'], dtype=object)

In [49]:
model.best_params_

{'gradientboostingregressor__max_depth': 7,
 'gradientboostingregressor__n_estimators': 150}

"GridSearchCV(cv=10,\n             estimator=Pipeline(steps=[('preprocessor', Preprocessor()),\n                                       ('gradientboostingregressor',\n                                        GradientBoostingRegressor())]),\n             n_jobs=-1,\n             param_grid={'gradientboostingregressor__max_depth': [5, 6, 7,\n                                                                  None],\n                         'gradientboostingregressor__n_estimators': [50, 100,\n                                                                     150]},\n             scoring='r2')"

In [27]:
classifier = fit_models['rf'].best_estimator_[1]
for score, name in sorted(zip(classifier.feature_importances_, classifier.feature_names_in_), reverse=True):
    print(round(score, 2), name)

0.48 median_income
0.15 ocean_proximity_INLAND
0.11 longitude
0.1 latitude
0.05 housing_median_age
0.03 population
0.03 total_rooms
0.02 total_bedrooms
0.02 households
0.01 ocean_proximity_NEAR OCEAN
0.0 ocean_proximity_<1H OCEAN
0.0 ocean_proximity_NEAR BAY
0.0 ocean_proximity_ISLAND


In [31]:
fit_models['rf'].best_score_

0.8171769026724179

In [54]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-2.12.1-py3-none-any.whl (20.2 MB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.1-py3-none-any.whl (233 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.0.0-py3-none-any.whl (147 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Using cached GitPython-3.1.43-py3-none-any.whl (207 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.3-py2.py3-none-any.whl (128 kB)
Collecting querystring-parser<2 (from mlflow)
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn<22 (from mlflow)
  Using cached gunicorn-21.2.0-py3-none-any.whl (80 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Using cached Mako-1.3.3-py3-none-any.whl (78 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython<4,>=3.1.9->mlflow)
  Using cached gitdb-4.0.11-py3-none-any.whl (62 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Using cached graphql_core-3.2.3-py3-none-any.w

In [57]:
import os

MLFLOW_TRACKING_URI="https://dagshub.com/mkzia/house_models.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME']='mkzia'
os.environ['MLFLOW_TRACKING_PASSWORD']='bbbc0c41d162cdc2a38ab1aacdc1a8ff6987d08f'


import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Create a new MLflow Experiment
mlflow.set_experiment("median_house_pricing")

# Start an MLflow run
for algo, model in fit_models.items():
  score = model.best_score_
  params = model.best_params_
  with mlflow.start_run():
      # Log the hyperparameters
      mlflow.log_params(params)

      # Log metrics
      mlflow.log_metric("r2", score)
      # Infer the model signature
      signature = infer_signature(X_train, model.best_estimator_.predict(X_train))

      # Log the model
      model_info = mlflow.sklearn.log_model(
          sk_model=model,
          artifact_path="housing_model",
          signature=signature,
          input_example=X_train,
          registered_model_name=algo,
      )


Registered model 'ridge' already exists. Creating a new version of this model...
2024/05/02 23:32:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge, version 2
Created version '2' of model 'ridge'.
Successfully registered model 'rf'.
2024/05/02 23:32:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf, version 1
Created version '1' of model 'rf'.
Successfully registered model 'gb'.
2024/05/02 23:33:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gb, version 1
Created version '1' of model 'gb'.
