In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
import mlflow
from mlflow.models import infer_signature
import dagshub
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd
import numpy as np

In [None]:
training_data = pd.read_csv("./regression/training_data.csv")
tresting_data = pd.read_csv("./regression/tresting_data.csv")

In [None]:
X_train = training_data.drop(labels=["resale_price"], axis=1)
y_train = training_data["resale_price"]
X_test = tresting_data.drop(labels=["resale_price"], axis=1)
y_test = tresting_data["resale_price"]

In [39]:
dagshub.init(repo_owner='naveenkrishnan840', repo_name='DS-Singapore-resale-flat-price', mlflow=True)

In [40]:
mlflow.set_experiment(experiment_id="0")

<Experiment: artifact_location='mlflow-artifacts:/74fe181eba6641f6b5b9dccf6ffb5402', creation_time=1727205859782, experiment_id='0', last_update_time=1727205859782, lifecycle_stage='active', name='Regression-Task', tags={}>

In [19]:
mlflow.get_tracking_uri()

'https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow'

In [20]:
HistGradientBoostingRegressorParams = {
    "loss": ["quantile"],
    "quantile": [0.1, 0.2, 0.3, 0.4, 0.5],
    "max_iter": [5000, 6000, 7000],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    "max_leaf_nodes": [25, 50, 75],
    "max_depth": list(range(28, 32)),
    "min_samples_leaf": [64, 128, 256, 512],
    "l2_regularization": [0.1, 0.2, 0.3, 0.4, 0.5],
    "max_bins": [100, 150, 255],
    "interaction_cst": ["pairwise", "no_interactions"],
    "random_state": list(range(10, 50))
}

In [21]:
with mlflow.start_run(experiment_id="0", run_name="HistGradientBoostingRegressor-2") as run_2:
    mlflow.sklearn.autolog()
    mlflow.doctor()
    histgradientboostingregressorcv = RandomizedSearchCV(estimator=HistGradientBoostingRegressor(), 
                                                            param_distributions=HistGradientBoostingRegressorParams, 
                                    cv=KFold(n_splits=20, shuffle=True, random_state=None), n_iter=30, 
                                    verbose=3, n_jobs=-1)
    histgradientboostingregressorcv.fit(X_train, y_train)
    y_pred = histgradientboostingregressorcv.predict(X_test)
    mlflow.log_metric("testing_mean_squared_error", mean_squared_error(y_true=y_test, y_pred=y_pred))
    mlflow.log_metric("testing_mean_absolute_error", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    mlflow.log_metric("testing_root_mean_squared_error", root_mean_squared_error(y_true=y_test, y_pred=y_pred))
    mlflow.log_metric("testing_r2_score", r2_score(y_true=y_test, y_pred=y_pred))

    signature = infer_signature(X_test, y_pred)

    # # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=histgradientboostingregressorcv,
        artifact_path="HistGradientBoostingRegressor-2",
        signature=signature,
        input_example=X_train,
        registered_model_name="Regression-Task",
    )



[34mSystem information[0m: Windows 10.0.22631
[34mPython version[0m: 3.12.5
[34mMLflow version[0m: 2.16.2
[34mMLflow module location[0m: C:\Users\NavaneethanJeyapraka\AppData\Roaming\Python\Python312\site-packages\mlflow\__init__.py
[34mTracking URI[0m: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
[34mRegistry URI[0m: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
[34mActive experiment ID[0m: 0
[34mActive run ID[0m: eba3768c6ccc4a4fbda8d75dc4f4cfe8
[34mActive run artifact URI[0m: mlflow-artifacts:/74fe181eba6641f6b5b9dccf6ffb5402/eba3768c6ccc4a4fbda8d75dc4f4cfe8/artifacts
[34mMLflow environment variables[0m: 
  MLFLOW_TRACKING_PASSWORD: 6602c43190bbbd93140e6846d47f88777f01baae
  MLFLOW_TRACKING_URI: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
  MLFLOW_TRACKING_USERNAME: 6602c43190bbbd93140e6846d47f88777f01baae
[34mMLflow dependencies[0m: 
  Flask: 3.0.3
  Jinja2: 3.1.4
  al

2024/09/27 00:45:27 INFO mlflow.sklearn.utils: Logging the 5 best runs, 25 runs will be omitted.
2024/09/27 00:45:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-stag-29 at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0/runs/d953da96050d4bfaabac989503cbe686.
2024/09/27 00:45:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0.
2024/09/27 00:45:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-gnu-22 at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0/runs/e066333cfa0448e19663a9d04233f254.
2024/09/27 00:45:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0.
2024/09/27 00:45:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run lang

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/09/27 00:51:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run HistGradientBoostingRegressor-2 at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0/runs/eba3768c6ccc4a4fbda8d75dc4f4cfe8.
2024/09/27 00:51:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0.


In [44]:
XGBRegressorParams = {
    # "n_estimators": [1000, 1500],
    "learning_rate": [0.1, 0.8, 0.9],
    "subsample": [0.2, 0.4, 0.6, 0.8, 0.9],
    "booster": ["gbtree", "gblinear", "dart"],
    "max_depth": [28, 32],
    "max_leaves": [28, 32],
    "random_state": [10, 20, 30, 40, 50],
    "num_parallel_tree": [5, 6, 7, 8, 9, 10],
    "grow_policy": ["depthwise", "lossguide"],
    "reg_alpha": [0.2, 0.5, 1.0, 1.5, 2.0, 2.5], # L1 regulaization
    "reg_lambda" : [1, 5, 10, 20, 50], # L2 regulaization
    "gamma": [0.1, 0.9],
    "importance_type": ["gain", "weight", "cover", "total_gain", "total_cover"],
    "gamma": [5, 10, 15, 20],
    "min_child_weight": [10, 20, 30, 40, 50],
    "multi_strategy": ["one_output_per_tree", "multi_output_tree"],

    # "max_bin": [100, 155, 255]

}

In [23]:
import xgboost as xgb

In [45]:
with mlflow.start_run(experiment_id="0", run_name="XGBRegressor-2") as run_2:
    mlflow.sklearn.autolog()
    mlflow.doctor()
    XGBRegressorModel = xgb.XGBRegressor(n_estimators=800, base_score=503417.1593060477, objective="reg:squarederror", verbosity=3, booster="dart", tree_method="hist")
    XGBRegressorModel_cv = RandomizedSearchCV(estimator=XGBRegressorModel, param_distributions=XGBRegressorParams, verbose=3, n_jobs=-1, 
                    cv=KFold(n_splits=25, shuffle=True, random_state=None), random_state=99, n_iter=5)
    XGBRegressorModel_cv.fit(X_train, y_train)
    y_pred = XGBRegressorModel_cv.predict(X_test)
    mlflow.log_metric("testing_mean_squared_error", mean_squared_error(y_true=y_test, y_pred=y_pred))
    mlflow.log_metric("testing_mean_absolute_error", mean_absolute_error(y_true=y_test, y_pred=y_pred))
    # mlflow.log_metric("testing_root_mean_squared_error", root_mean_squared_error(y_true=y_test, y_pred=y_pred))
    mlflow.log_metric("testing_r2_score", r2_score(y_true=y_test, y_pred=y_pred))

    signature = infer_signature(X_test, y_pred)

    # # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=XGBRegressorModel_cv,
        artifact_path="XGBRegressor-2",
        signature=signature,
        input_example=X_train,
        registered_model_name="Regression-Task",
    )



[34mSystem information[0m: Windows 10.0.22631
[34mPython version[0m: 3.12.5
[34mMLflow version[0m: 2.16.2
[34mMLflow module location[0m: C:\Users\NavaneethanJeyapraka\AppData\Roaming\Python\Python312\site-packages\mlflow\__init__.py
[34mTracking URI[0m: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
[34mRegistry URI[0m: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
[34mActive experiment ID[0m: 0
[34mActive run ID[0m: fd8d70a4b85e4c868b89c5c9173dfdee
[34mActive run artifact URI[0m: mlflow-artifacts:/74fe181eba6641f6b5b9dccf6ffb5402/fd8d70a4b85e4c868b89c5c9173dfdee/artifacts
[34mMLflow environment variables[0m: 
  MLFLOW_TRACKING_PASSWORD: 6602c43190bbbd93140e6846d47f88777f01baae
  MLFLOW_TRACKING_URI: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow
  MLFLOW_TRACKING_USERNAME: 6602c43190bbbd93140e6846d47f88777f01baae
[34mMLflow dependencies[0m: 
  Flask: 3.0.3
  Jinja2: 3.1.4
  al

25 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\NavaneethanJeyapraka\anaconda3\envs\copper-set\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\NavaneethanJeyapraka\anaconda3\envs\copper-set\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\NavaneethanJeyapraka\anaconda3\envs\copper-set\Lib\site-packages\xgboost\sklearn.py", line 1108, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\NavaneethanJeyapraka

[10:55:34] AllReduce: 0.005472s, 1 calls @ 5472us

[10:55:34] MakeCuts: 0.006544s, 1 calls @ 6544us

[10:55:34] DEBUG: C:\b\abs_90_bwj_86a\croot\xgboost-split_1724073762025\work\src\gbm\gbtree.cc:130: Using tree method: 3
[10:58:59] Configure: 0.003428s, 1 calls @ 3428us

[10:58:59] EvalOneIter: 0.010719s, 800 calls @ 10719us

[10:58:59] GetGradient: 2.30643s, 800 calls @ 2306433us

[10:58:59] PredictRaw: 44.534s, 800 calls @ 44533959us

[10:58:59] UpdateOneIter: 204.868s, 800 calls @ 204868255us

[10:58:59] BoostNewTrees: 158.011s, 800 calls @ 158010664us

[10:58:59] CommitModel: 0.006743s, 800 calls @ 6743us

[10:58:59] BuildHistogram: 33.5163s, 192000 calls @ 33516326us

[10:58:59] EvaluateSplits: 11.9645s, 198400 calls @ 11964536us

[10:58:59] InitData: 10.9592s, 6400 calls @ 10959217us

[10:58:59] InitRoot: 18.2297s, 6400 calls @ 18229653us

[10:58:59] LeafPartition: 0.00153s, 6400 calls @ 1530us

[10:58:59] UpdatePosition: 28.8938s, 198400 calls @ 28893778us

[10:58:59] UpdateTre

2024/09/28 11:01:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
Registered model 'Regression-Task' already exists. Creating a new version of this model...
2024/09/28 11:06:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Regression-Task, version 4
Created version '4' of model 'Regression-Task'.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[11:07:35] DEBUG: C:\b\abs_90_bwj_86a\croot\xgboost-split_1724073762025\work\src\gbm\gbtree.cc:130: Using tree method: 3


2024/09/28 11:08:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBRegressor-2 at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0/runs/fd8d70a4b85e4c868b89c5c9173dfdee.
2024/09/28 11:08:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/naveenkrishnan840/DS-Singapore-resale-flat-price.mlflow/#/experiments/0.
