In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import seaborn as sns
sns.set()
# ML Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow.xgboost

import logging
import sys
import warnings
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [21]:
# loading data
df = pd.read_csv('data/php_data_all.csv', index_col=0)
# selecting data according to temperature range
df = df[(df['Te[K]'] > 300) & (df['Te[K]'] < 355)]
display(df)

Unnamed: 0,Te[K],Tc[K],dT[K],P[bar],Q[W],Fluid,FR,TR[K/W]
3,300.150000,296.40,3.750000,0.413299,80,DI_Water,60,0.046875
4,302.150000,296.40,5.750000,0.413299,80,DI_Water,60,0.071875
5,302.816667,296.65,6.166667,0.413299,80,DI_Water,60,0.077083
6,305.150000,296.90,8.250000,0.413299,80,DI_Water,60,0.103125
7,306.816667,297.40,9.416667,0.413299,80,DI_Water,60,0.117708
...,...,...,...,...,...,...,...,...
7574,353.350000,338.90,14.450000,0.879927,80,DI_Water,60,0.180625
7575,353.550000,338.15,15.400000,0.879927,80,DI_Water,60,0.192500
7576,354.150000,338.15,16.000000,0.879927,80,DI_Water,60,0.200000
7577,354.350000,338.15,16.200000,0.946588,80,DI_Water,60,0.202500


In [22]:
# data split
x = df[['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'Fluid', 'FR']]
y = df[['Tc[K]', 'TR[K/W]']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [23]:
numeric_features = ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]','FR']
categorical_features = ['Fluid']

numeric_transformer = make_pipeline(StandardScaler())
categorical_tranformer = make_pipeline(OneHotEncoder(sparse_output=False))

preprocessor = ColumnTransformer(
    transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_tranformer, categorical_features)
    ])

In [24]:
def evaluate(y_test, y_pred, k=6):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    n = y_test.shape[0]
    k = k
    r2_adj = 1 - (((1-r2)*(n-1)) / (n-k-1))
    return rmse, mae, r2, r2_adj

In [25]:
mlflow.sklearn.autolog()

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgb')
        mlflow.log_params(params=params)

        model_xgb = xgb.XGBRegressor(**params)
        data_pipeline_rfr = Pipeline(steps=[('Preprocessing', preprocessor),
                                ('RFR_model', model_xgb)])
        
        data_pipeline_rfr.fit(x_train, y_train)

        pred = data_pipeline_rfr.predict(x_test)
        
        rmse, ame, r2, r2_adj = evaluate(y_test=y_test, y_pred=pred) # NEED TO CHECK WITH ONE TARGET VARIABLE
        # mlflow.log_metric('rmse', rmse)
        # mlflow.log_metric('ame', ame)
        # mlflow.log_metric('r2', r2)
        # mlflow.log_metric('r2_adj', r2_adj)

    return {'loss': rmse, 'status': STATUS_OK}



In [26]:
search_space = {'eta': hp.uniform('eta', 0.1,1), 
                'max_depth': hp.randint('max_depth', 2,8)}

In [27]:
best_results = fmin(fn=objective,
                    space=search_space,
                    algo=tpe.suggest,
                    max_evals=20,
                    trials=Trials())

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.6174193447721021, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








  5%|▌         | 1/20 [00:02<00:42,  2.22s/trial, best loss: 0.35201406101299043]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.5301394416129365, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 10%|█         | 2/20 [00:03<00:29,  1.66s/trial, best loss: 0.3156840642218367] 

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.4296707778907304, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 15%|█▌        | 3/20 [00:04<00:25,  1.50s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.8712006636900629, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 20%|██        | 4/20 [00:06<00:22,  1.41s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.5334410034753561, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 25%|██▌       | 5/20 [00:07<00:20,  1.40s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.2877194881524264, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 30%|███       | 6/20 [00:08<00:18,  1.35s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.6063538389151425, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 35%|███▌      | 7/20 [00:09<00:17,  1.33s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.6334719247252952, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 40%|████      | 8/20 [00:11<00:15,  1.28s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.14910324571166778,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_ca...`








 45%|████▌     | 9/20 [00:12<00:14,  1.31s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.32174464457556806,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_ca...`








 50%|█████     | 10/20 [00:13<00:12,  1.28s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.2197269745670981, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 55%|█████▌    | 11/20 [00:14<00:11,  1.25s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.877336097741809, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_...`








 60%|██████    | 12/20 [00:16<00:10,  1.26s/trial, best loss: 0.3156840642218367]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.3530414055007469, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 65%|██████▌   | 13/20 [00:17<00:09,  1.30s/trial, best loss: 0.29789915299936354]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.5118043625122918, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 70%|███████   | 14/20 [00:18<00:07,  1.29s/trial, best loss: 0.29789915299936354]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.4208376780294063, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 75%|███████▌  | 15/20 [00:20<00:06,  1.29s/trial, best loss: 0.29789915299936354]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.156550159108913, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_...`








 80%|████████  | 16/20 [00:21<00:05,  1.26s/trial, best loss: 0.29789915299936354]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.46796766947338375,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_ca...`








 85%|████████▌ | 17/20 [00:22<00:03,  1.27s/trial, best loss: 0.29007215476852555]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.8316704623967727, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 90%|█████████ | 18/20 [00:24<00:02,  1.30s/trial, best loss: 0.29007215476852555]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.7440943339784996, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








 95%|█████████▌| 19/20 [00:25<00:01,  1.31s/trial, best loss: 0.29007215476852555]

                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 ['Te[K]', 'dT[K]', 'P[bar]', 'Q[W]', 'FR']),
                                ('cat',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False))]),
                         ...`

             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.3826133468217825, eval_metric=None,
             feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat...`








100%|██████████| 20/20 [00:26<00:00,  1.33s/trial, best loss: 0.29007215476852555]
