In [155]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer, enable_hist_gradient_boosting
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_validate
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from scipy.stats import *
from datetime import datetime as dt

# Dataset

In [156]:
df = pd.read_csv('data.csv')
df.set_index('id', inplace = True)
df.head(3)

Unnamed: 0_level_0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5


In [157]:
df_test = df[df['x_e_out [-]'].isna()]
df_train = df[df['x_e_out [-]'].notna()]

y_train = df_train['x_e_out [-]']
X_train = df_train.drop('x_e_out [-]', axis = 1)
X_test = df_test.drop('x_e_out [-]', axis = 1)

del df_test, df_train, df

## Author

In [158]:
X_train['author'].fillna('Other', inplace = True)
X_test['author'].fillna('Other', inplace = True)

## Geometry

In [159]:
X_train['geometry'].fillna('Other', inplace = True)
X_test['geometry'].fillna('Other', inplace = True)

## One Hot Encode

In [160]:

col_to_ohe = ['geometry', 'author']
ohe = OneHotEncoder(sparse_output=False)
ohe.set_output(transform='pandas')

ohe_train = ohe.fit_transform(X_train[col_to_ohe])
ohe_test = ohe.transform(X_test[col_to_ohe])

X_train.drop(col_to_ohe, axis = 1, inplace = True)
X_test.drop(col_to_ohe, axis = 1, inplace = True)

X_train = pd.concat([X_train, ohe_train], axis = 1)
X_test = pd.concat([X_test, ohe_test], axis = 1)

In [161]:
X_train.columns

Index(['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]',
       'length [mm]', 'chf_exp [MW/m2]', 'geometry_Other', 'geometry_annulus',
       'geometry_plate', 'geometry_tube'],
      dtype='object')

## Missing Imputer

In [162]:
impute = IterativeImputer(initial_strategy='mean', max_iter=50, tol = 1e-6)
impute.set_output(transform = 'pandas')

X_train = impute.fit_transform(X_train)
X_test = impute.transform(X_test)

## New Feature

In [164]:
X_train['D_area'] = X_train['D_e [mm]'] * X_train['D_h [mm]']
X_test['D_area'] = X_test['D_e [mm]'] * X_test['D_h [mm]']

X_train['Volume'] = X_train['D_area'] * X_train['length [mm]']
X_test['Volume'] = X_test['D_area'] * X_test['length [mm]']

# Added after 1st trial
# X_train['De/Pr'] = X_train['D_e [mm]'] / X_train['pressure [MPa]']
# X_test['De/Pr'] = X_test['D_e [mm]'] / X_test['pressure [MPa]']

# X_train['Dh/Pr'] = X_train['D_h [mm]'] / X_train['pressure [MPa]']
# X_test['Dh/Pr'] = X_test['D_h [mm]'] / X_test['pressure [MPa]']

# X_train['Dh/De'] = X_train['D_h [mm]'] / X_train['D_e [mm]']
# X_test['Dh/De'] = X_test['D_h [mm]'] / X_test['D_e [mm]']

# X_train['Len/Dh'] = X_train['length [mm]'] / X_train['D_h [mm]']
# X_test['Len/Dh'] = X_test['length [mm]'] / X_test['D_h [mm]']

## Standard Scaler

In [165]:
col_to_scale = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]',	'chf_exp [MW/m2]', 'D_area', 'Volume']

scale = StandardScaler()
scale.set_output(transform='pandas')

X_train[col_to_scale] = scale.fit_transform(X_train[col_to_scale])
X_test[col_to_scale] = scale.transform(X_test[col_to_scale])

In [166]:
X_train.shape

(21229, 16)

## Rename Col (for compatibility with LightGBM)

In [167]:
X_train.columns = range(16)
X_test.columns = range(16)
X_train.columns

RangeIndex(start=0, stop=16, step=1)

# Base

In [13]:
def fit_model(x_train, y_train, estimator, cv):
  avg_fit_time = []
  sd_fit_time = []
  max_fit_time = []

  avg_train_score = []
  sd_train_score = []
  max_train_score = []
  min_train_score = []

  avg_test_score = []
  sd_test_score = []
  max_test_score = []
  min_test_score = []
  
  estimator_cv = []

  for est in estimator:
    print('\nCurrent:', est)
    score = cross_validate(est, x_train, y_train, scoring = 'neg_root_mean_squared_error', cv = cv, return_train_score=True, return_estimator = True,
                           n_jobs = -1, verbose = 1)
    
    avg_fit_time.append(np.mean(score['fit_time']))
    sd_fit_time.append(np.std(score['fit_time']))
    max_fit_time.append(np.max(score['fit_time']))

    avg_train_score.append(np.mean(score['train_score']))
    sd_train_score.append(np.std(score['train_score']))
    max_train_score.append(np.max(score['train_score']))
    min_train_score.append(np.min(score['train_score']))

    avg_test_score.append(np.mean(score['test_score']))
    sd_test_score.append(np.std(score['test_score']))
    max_test_score.append(np.max(score['test_score']))
    min_test_score.append(np.min(score['test_score']))

    param = []
    for i in score['estimator']:
      param.append(i.get_params())
    
    estimator_cv.append(param)

  result = pd.DataFrame({
    'model' : estimator,
    'avg_fit_time' : avg_fit_time,
    'sd_fit_time' : sd_fit_time,
    'max_fit_time' : max_fit_time,

    'avg_train_score' : avg_train_score,
    'sd_train_score' : sd_train_score,
    'max_train_score' : max_train_score,
    'min_train_score' : min_train_score,

    'avg_test_score' : avg_test_score,
    'sd_test_score' : sd_test_score,
    'max_test_score' : max_test_score,
    'min_test_score' : min_test_score,

    'estimators' : estimator_cv,
  })

  return(result)

In [14]:
result_base  = fit_model(X_train, y_train, [RandomForestRegressor(),
                                         LinearSVR(), LGBMRegressor(),
                                          HistGradientBoostingRegressor(),
                                         SGDRegressor(), KNeighborsRegressor()],
                                         KFold(shuffle=True))
result_base


Current: RandomForestRegressor()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.



Current: LinearSVR()


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.



Current: LGBMRegressor()


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.



Current: HistGradientBoostingRegressor()


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished



Current: SGDRegressor()

Current: KNeighborsRegressor()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


Unnamed: 0,model,avg_fit_time,sd_fit_time,max_fit_time,avg_train_score,sd_train_score,max_train_score,min_train_score,avg_test_score,sd_test_score,max_test_score,min_test_score,estimators
0,RandomForestRegressor(),10.154811,0.078223,10.286122,-0.031307,0.0003,-0.030746,-0.031585,-0.079353,0.002053,-0.077461,-0.083096,"[{'bootstrap': True, 'ccp_alpha': 0.0, 'criter..."
1,LinearSVR(),2.125348,0.088648,2.211208,-0.086583,0.000473,-0.085947,-0.087046,-0.086628,0.002104,-0.082979,-0.089086,"[{'C': 1.0, 'dual': True, 'epsilon': 0.0, 'fit..."
2,LGBMRegressor(),0.189003,0.005869,0.197574,-0.068098,0.000272,-0.067707,-0.068542,-0.074605,0.001129,-0.072929,-0.076448,"[{'boosting_type': 'gbdt', 'class_weight': Non..."
3,HistGradientBoostingRegressor(),0.510068,0.09706,0.662651,-0.07008,0.001033,-0.068127,-0.071153,-0.074828,0.001731,-0.071513,-0.076224,"[{'categorical_features': None, 'early_stoppin..."
4,SGDRegressor(),0.040277,0.003053,0.043819,-0.085838,0.000709,-0.085211,-0.086993,-0.085883,0.002626,-0.081617,-0.088436,"[{'alpha': 0.0001, 'average': False, 'early_st..."
5,KNeighborsRegressor(),0.011162,0.000849,0.012348,-0.06797,0.000289,-0.06752,-0.068289,-0.083645,0.001233,-0.082095,-0.0853,"[{'algorithm': 'auto', 'leaf_size': 30, 'metri..."


# Tune

In [168]:
def tune(estimator, param_distribution, X, y, X_test, modelname = 'Model', cv = KFold(5, shuffle = True), n_iter = 50):
    cdt = dt.today().strftime('%Y-%m-%d')
    rs = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distribution,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=1,
        random_state=12,
        n_jobs=16,
        pre_dispatch=16,
    )

    #CV results
    rs.fit(X, y)
    filename = '_'+modelname+'-'+str(cdt)+'-result.csv'
    pd.DataFrame(rs.cv_results_).to_csv(filename, index = False)
    print(rs.best_estimator_)
    
    #Uncalibrated
    ypred = rs.predict(X_test)
    submission = X_test.copy(deep = True)
    submission['x_e_out [-]'] = ypred
    submission = submission[['x_e_out [-]']]

    filename = '_'+modelname+'-'+str(cdt)+'-submission.csv'
    submission.to_csv(filename, index = True)

    return(pd.DataFrame(rs.cv_results_))

In [169]:
# define the parameter distributions to search over
param_distribution1_lgbm = {
    'lgbmclassifier__max_depth': [-1, 1, 3, 5, 10, 15, 20, 25],
    'lgbmclassifier__learning_rate': uniform(0.01, 0.3), 
    'lgbmclassifier__n_estimators': randint(25, 1000), 
}

# create an instance of the LGBMClassifier estimator
estimator1 = LGBMRegressor()

res_lgbm1_ada = tune(estimator1, param_distribution1_lgbm, X_train, y_train, X_test, modelname = 'lgbm_4', n_iter = 1500)

Fitting 5 folds for each of 1500 candidates, totalling 7500 fits
LGBMRegressor(lgbmclassifier__learning_rate=0.05624885271390171,
              lgbmclassifier__max_depth=20, lgbmclassifier__n_estimators=278)


In [170]:
res_lgbm1_ada[res_lgbm1_ada['rank_test_score'] < 10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbmclassifier__learning_rate,param_lgbmclassifier__max_depth,param_lgbmclassifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.345569,0.041041,0.084767,0.002080,0.056249,20,278,{'lgbmclassifier__learning_rate': 0.0562488527...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
1,1.458030,0.068077,0.086436,0.005327,0.088995,5,988,{'lgbmclassifier__learning_rate': 0.0889945045...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
2,1.533669,0.048763,0.093803,0.008645,0.014372,20,74,{'lgbmclassifier__learning_rate': 0.0143724887...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
3,1.475849,0.055492,0.092063,0.010864,0.280214,15,498,{'lgbmclassifier__learning_rate': 0.2802144562...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
4,1.486859,0.045312,0.095701,0.011451,0.297085,20,99,{'lgbmclassifier__learning_rate': 0.2970848008...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1.477342,0.021846,0.090448,0.001245,0.187003,15,182,{'lgbmclassifier__learning_rate': 0.1870029155...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
1496,1.482952,0.049909,0.095452,0.006835,0.046965,10,213,{'lgbmclassifier__learning_rate': 0.0469653552...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
1497,1.464303,0.027399,0.077660,0.013875,0.186578,5,185,{'lgbmclassifier__learning_rate': 0.1865776546...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
1498,1.350736,0.104423,0.028337,0.006309,0.02648,20,156,{'lgbmclassifier__learning_rate': 0.0264800357...,-0.074999,-0.07588,-0.074652,-0.07482,-0.07299,-0.074668,0.00094,1
