In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('Data_CHF_Zhao_2020_ATE.csv')

In [9]:
# Print the schema summary: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1865 entries, 0 to 1864
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1865 non-null   int64  
 1   author               1865 non-null   object 
 2   geometry             1865 non-null   object 
 3   pressure [MPa]       1865 non-null   float64
 4   mass_flux [kg/m2-s]  1865 non-null   int64  
 5   x_e_out [-]          1865 non-null   float64
 6   D_e [mm]             1865 non-null   float64
 7   D_h [mm]             1865 non-null   float64
 8   length [mm]          1865 non-null   int64  
 9   chf_exp [MW/m2]      1865 non-null   float64
dtypes: float64(5), int64(3), object(2)
memory usage: 145.8+ KB


## Preprocessing

In [16]:
# Report the cardinality of every column, one "<name>: <count>" line each.
for col in df.columns:
    n_unique = df[col].nunique()
    print(f"{col}: {n_unique}")

id: 1865
author: 10
geometry: 3
pressure [MPa]: 114
mass_flux [kg/m2-s]: 578
x_e_out [-]: 1360
D_e [mm]: 36
D_h [mm]: 41
length [mm]: 54
chf_exp [MW/m2]: 109


In [17]:
def preprocess_inputs(df):
    """Split the raw frame into shuffled features and target.

    The 'id' and 'author' columns are dropped, the rows are shuffled with a
    fixed seed (random_state=1), and the 'chf_exp [MW/m2]' column is separated
    out as the target. The caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least 'id', 'author' and 'chf_exp [MW/m2]'.

    Returns
    -------
    tuple
        ``(X, y)``: X is the shuffled frame without the target column, y is
        the target Series. Both keep the original row labels, in the same
        shuffled order.
    """
    target_column = 'chf_exp [MW/m2]'

    # drop() and sample() each return a new object, so the input stays intact.
    shuffled = (
        df
        .drop(columns=['id', 'author'])
        .sample(frac=1.0, random_state=1)
    )

    features = shuffled.drop(columns=target_column)
    target = shuffled[target_column]

    return features, target

In [19]:
# Build the shuffled feature frame X and target Series y from the raw data.
X, y = preprocess_inputs(df)

In [20]:
# Inspect the feature frame returned by preprocess_inputs (rows are shuffled).
X

Unnamed: 0,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm]
400,tube,6.89,1967,0.1203,12.8,12.8,1930
571,tube,11.03,2034,0.0616,9.3,9.3,762
351,tube,7.00,4069,0.0636,10.8,10.8,864
1479,annulus,13.79,2729,0.0537,5.6,15.2,2134
1371,tube,13.79,690,-0.4854,11.1,11.1,457
...,...,...,...,...,...,...,...
905,tube,13.79,4910,0.1131,4.7,4.7,318
1791,annulus,6.85,2292,0.0330,8.5,24.6,1778
1096,tube,13.79,2116,0.0926,7.8,7.8,591
235,tube,6.89,4042,-0.0008,10.3,10.3,762


In [25]:
# Inspect the target Series; its index follows the same shuffled order as X.
y

400     2.8
571     3.2
351     3.2
1479    1.7
1371    3.6
       ... 
905     3.0
1791    4.0
1096    2.1
235     4.0
1061    2.0
Name: chf_exp [MW/m2], Length: 1865, dtype: float64

## Building Pipeline

In [22]:
def build_model():
    """Create a fresh, unfitted preprocessing + random-forest pipeline.

    The 'geometry' column is one-hot encoded (dense output; categories not
    seen during fit are ignored rather than raising), every other column is
    passed through unchanged, and the result feeds a
    RandomForestRegressor seeded with random_state=1.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps 'preprocessor' and 'regressor'.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
    # `sparse_output`, and 1.4 removed the old name. Try the current spelling
    # first and fall back so older installations keep working.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

    nominal_transformer = Pipeline(steps=[
        ('onehot', onehot)
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['geometry'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=1))
    ])

    return model

## Training

In [23]:
# 5-fold cross-validation. KFold is used without shuffling; the rows of X / y
# were already shuffled in preprocess_inputs.
kf = KFold(n_splits=5)
rmses = []

for train_idx, test_idx in kf.split(X):
    # Positional slicing: kf.split yields integer positions, not index labels.
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]

    # A new pipeline per fold, so nothing fitted carries over between folds.
    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Root-mean-squared error on the held-out fold.
    residuals = y_test - y_pred
    rmse = np.sqrt(np.mean(residuals**2))
    rmses.append(rmse)

# Headline metric: the mean of the per-fold RMSEs.
final_rmse = np.mean(rmses)

In [24]:
# Report the mean cross-validated RMSE, rounded to two decimals.
print("RMSE: {:.2f}".format(final_rmse))

RMSE: 0.63
