In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import seaborn as sns
sns.set()
# ML Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
# tune
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow.xgboost
import mlflow.sklearn
import mlflow
from mlflow.models.signature import infer_signature
from urllib.parse import urlparse

import logging
import sys
import warnings
# logging.basicConfig()
# logger = logging.getLogger(__name__)


* 'schema_extra' has been renamed to 'json_schema_extra'


## Loading data

In [2]:
# Read the CSV (first column becomes the index) and keep only the rows whose
# evaporator temperature Te[K] lies strictly inside the open interval
# (300, 355) -- both bounds are excluded.
df = (
    pd.read_csv('data/php_data_all.csv', index_col=0)
    .loc[lambda frame: frame['Te[K]'].gt(300) & frame['Te[K]'].lt(355)]
)
display(df)

Unnamed: 0,Te[K],Tc[K],dT[K],P[bar],Q[W],Fluid,FR,TR[K/W]
3,300.150000,296.40,3.750000,0.413299,80,DI_Water,60,0.046875
4,302.150000,296.40,5.750000,0.413299,80,DI_Water,60,0.071875
5,302.816667,296.65,6.166667,0.413299,80,DI_Water,60,0.077083
6,305.150000,296.90,8.250000,0.413299,80,DI_Water,60,0.103125
7,306.816667,297.40,9.416667,0.413299,80,DI_Water,60,0.117708
...,...,...,...,...,...,...,...,...
7574,353.350000,338.90,14.450000,0.879927,80,DI_Water,60,0.180625
7575,353.550000,338.15,15.400000,0.879927,80,DI_Water,60,0.192500
7576,354.150000,338.15,16.000000,0.879927,80,DI_Water,60,0.200000
7577,354.350000,338.15,16.200000,0.946588,80,DI_Water,60,0.202500


## 1. Predicting TR

In [3]:
# Predictors (five numeric columns plus the categorical 'Fluid') and target.
x = df.loc[:, ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'Fluid', 'FR']]
y = df.loc[:, 'TR[K/W]']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Column groups handled by the preprocessing step.
numeric_features = ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']
categorical_features = ['Fluid']

# Numeric columns are standardised; the categorical column is one-hot encoded
# into a dense array.
numeric_transformer = make_pipeline(StandardScaler())
categorical_tranformer = make_pipeline(OneHotEncoder(sparse_output=False))

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_tranformer, categorical_features),
    ]
)

In [5]:
def evaluate(y_test, y_pred, k=6):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, one per entry of ``y_test``.
    k : int, default 6
        Number of predictors used in the adjusted R^2 correction.

    Returns
    -------
    tuple
        ``(rmse, mae, r2, r2_adj)``. ``r2_adj`` is NaN when
        ``n - k - 1 <= 0`` (too few samples for the correction to be
        defined) instead of raising ``ZeroDivisionError`` or returning a
        meaningless value.
    """
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # len() works for lists as well as arrays/Series, unlike .shape[0].
    n = len(y_test)
    dof = n - k - 1
    if dof <= 0:
        # Adjusted R^2 is undefined without positive residual degrees of freedom.
        r2_adj = float('nan')
    else:
        r2_adj = 1 - ((1 - r2) * (n - 1)) / dof
    return rmse, mae, r2, r2_adj

In [6]:
# Turn on MLflow's scikit-learn autologging before any estimator is fitted.
mlflow.sklearn.autolog()

def objective(params):
    """Hyperopt objective: fit one XGBoost pipeline and return its test RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgb``, logs the
    sampled hyperparameters, fits ``preprocessor`` + ``XGBRegressor`` on the
    training split and scores it on the test split. This variant logs no
    metrics explicitly; only the model artifact is logged by hand.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` as expected by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgb')
        mlflow.log_params(params=params)

        model_xgb = xgb.XGBRegressor(**params)
        pipeline = Pipeline(steps=[('Preprocessing', preprocessor),
                                   ('RFR_model', model_xgb)])

        pipeline.fit(x_train, y_train)
        pred = pipeline.predict(x_test)

        # Only RMSE is needed here; it is the loss minimised by hyperopt.
        # TODO (from original): check evaluate() with one target variable.
        rmse, _, _, _ = evaluate(y_test=y_test, y_pred=pred)

        # The signature describes the raw (un-preprocessed) input columns.
        signature = infer_signature(x_train, pred)

        # Log the whole fitted pipeline, not the bare XGBRegressor: the
        # signature above is inferred from raw features, so the artifact must
        # contain the preprocessing step to be usable with that schema.
        # NOTE(review): registered_model_name creates a new registry version
        # on every trial -- confirm that this is intended.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name="xgb-regressor",
        )
    return {'loss': rmse, 'status': STATUS_OK}



In [6]:
# mlflow manual logging of metrics and model

def objective(params):
    """Hyperopt objective: fit one XGBoost pipeline, log it, return test RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgb``, logs the
    sampled hyperparameters, fits ``preprocessor`` + ``XGBRegressor`` on the
    training split, scores it on the test split and logs the four metrics
    returned by ``evaluate`` together with the fitted model.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` as expected by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgb')
        mlflow.log_params(params=params)

        model_xgb = xgb.XGBRegressor(**params)
        pipeline = Pipeline(steps=[('Preprocessing', preprocessor),
                                   ('RFR_model', model_xgb)])

        pipeline.fit(x_train, y_train)
        pred = pipeline.predict(x_test)

        # TODO (from original): check evaluate() with one target variable.
        rmse, mae, r2, r2_adj = evaluate(y_test=y_test, y_pred=pred)

        # The signature describes the raw (un-preprocessed) input columns.
        signature = infer_signature(x_train, pred)

        mlflow.log_metric('rmse', rmse)
        # NOTE(review): the key 'ame' looks like a typo for 'mae'; it is kept
        # unchanged so existing runs remain comparable.
        mlflow.log_metric('ame', mae)
        mlflow.log_metric('r2', r2)
        mlflow.log_metric('r2_adj', r2_adj)

        # Log the whole fitted pipeline, not the bare XGBRegressor: the
        # signature above is inferred from raw features, so the artifact must
        # contain the preprocessing step to be usable with that schema.
        # NOTE(review): registered_model_name creates a new registry version
        # on every trial -- confirm that this is intended.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name="xgb-regressor",
        )
    return {'loss': rmse, 'status': STATUS_OK}

In [7]:
# Hyperparameter search space handed to hyperopt:
#   eta       -- sampled uniformly between 0.1 and 1
#   max_depth -- random integer drawn with hp.randint(low=2, high=8)
search_space = dict(
    eta=hp.uniform('eta', 0.1, 1),
    max_depth=hp.randint('max_depth', 2, 8),
)

In [8]:
# Keep a handle on the Trials object so the per-trial history (losses,
# sampled parameters) can still be inspected once the search has finished.
trials = Trials()

# Run 10 TPE-guided evaluations of `objective` over `search_space`.
# A seeded generator makes the sampled hyperparameters reproducible.
# NOTE(review): a numpy Generator for `rstate` assumes hyperopt >= 0.2.7 --
# confirm the installed version.
best_results = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    rstate=np.random.default_rng(42),
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  inputs = _infer_schema(model_input) if model_input is not None else None

Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:51 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 11



 10%|█         | 1/10 [00:02<00:19,  2.19s/trial, best loss: 0.004072389551928847]

Created version '11' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 12



 20%|██        | 2/10 [00:03<00:12,  1.59s/trial, best loss: 0.004072389551928847]

Created version '12' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 13



 30%|███       | 3/10 [00:04<00:10,  1.45s/trial, best loss: 0.004072389551928847]

Created version '13' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:55 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 14



 40%|████      | 4/10 [00:05<00:08,  1.34s/trial, best loss: 0.0037327195604600445]

Created version '14' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 15



 50%|█████     | 5/10 [00:07<00:06,  1.29s/trial, best loss: 0.002590209558383032] 

Created version '15' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:57 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 16



 60%|██████    | 6/10 [00:08<00:05,  1.25s/trial, best loss: 0.002590209558383032]

Created version '16' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:10:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 17



 70%|███████   | 7/10 [00:09<00:03,  1.23s/trial, best loss: 0.002590209558383032]

Created version '17' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:11:00 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 18



 80%|████████  | 8/10 [00:10<00:02,  1.22s/trial, best loss: 0.002590209558383032]

Created version '18' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:11:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 19



 90%|█████████ | 9/10 [00:11<00:01,  1.20s/trial, best loss: 0.002590209558383032]

Created version '19' of model 'xgb-regressor'.
  inputs = _infer_schema(model_input) if model_input is not None else None



Registered model 'xgb-regressor' already exists. Creating a new version of this model...
2023/11/04 18:11:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb-regressor, version 20



100%|██████████| 10/10 [00:12<00:00,  1.30s/trial, best loss: 0.0017898752574609157]


Created version '20' of model 'xgb-regressor'.
