In [None]:
!pip install numpy==1.23.5

In [None]:
!pip install lightgbm

In [None]:
import numpy as np

from sklearn import datasets

# Diabetes

In [None]:
# Load the diabetes dataset bundled with scikit-learn and expose its feature matrix and
# target vector under the generic names used by the following cell.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

In [None]:
# `all_data` holds one entry per dataset, in loading order. Each entry maps a variant key
# (only key 0 is used in this notebook) to an `[X, y]` pair.
all_data = []
all_data.append({0: [X, y]})


# California housing

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data as pandas objects, then store plain NumPy arrays so the
# entry has the same `{0: [X, y]}` layout as the other datasets in `all_data`.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
all_data.append(
    {
        0: [
            X.to_numpy(),
            y.to_numpy(),
        ]
    }
)

# Liver disorders

In [None]:
from sklearn.datasets import fetch_openml
# Download the "liver-disorders" dataset from OpenML.
# `as_frame=True` is requested explicitly because the next cell calls `.to_numpy()` on both
# `data` and `target`, which only works when pandas objects are returned.
# NOTE(review): no `version` is given, so the dataset version is whatever OpenML resolves by
# default — consider pinning it for reproducibility.
ld = fetch_openml(name='liver-disorders', as_frame=True)

In [None]:
# Convert the fetched features and target to NumPy arrays and register them as the third
# dataset, using the same `{0: [X, y]}` layout as the previous entries.
X = ld['data'].to_numpy()
y = ld['target'].to_numpy()
all_data.append({0: [X, y]})

In [None]:

from sklearn.model_selection import train_test_split
# Display names of the datasets, in the order in which they were appended to `all_data`.
dataset_names = ["Diabetes", "California housing", "Liver disorders"]

# Target layout: dict_data[name][variant] = {"train": {"X", "y"}, "test": {"X", "y"}}.
dict_data = {name: {} for name in dataset_names}

for position, name in enumerate(dataset_names):
    # Every list entry maps a variant key to an `[X, y]` pair.
    for k, (features, targets) in all_data[position].items():
        # 70/30 split with a fixed seed so every model sees the same partition.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=0.3, random_state=42
        )
        dict_data[name][k] = {
            "train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test},
        }

# From here on `all_data` is the nested dict built above, no longer the list of raw datasets.
# NOTE: because of this rebinding the cell cannot be re-run on its own; re-run the loading
# cells first so that `all_data` is a list again.
all_data = dict_data

# Boosting

In [None]:
import xgboost as xgb
from boosted_forest import CascadeBoostingRegressor
from deepforest import CascadeForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import optuna

from sklearn.model_selection import KFold
#xgb.set_config(verbosity=2)

def make_modelXGB(max_depth, layers, C):
    """Create an ``xgb.XGBRegressor`` with ``layers`` estimators of depth ``max_depth``.

    ``C`` is ignored; it is accepted so that all model factories share one call signature.
    """
    xgb_params = {"max_depth": max_depth, "n_estimators": layers}
    return xgb.XGBRegressor(**xgb_params)

def make_modelCascade(max_depth, layers, C):
    """Create a ``CascadeForestRegressor`` limited to ``layers`` cascade layers.

    Each forest uses 4 estimators with trees of depth ``max_depth``. ``C`` is ignored; it is
    accepted so that all model factories share one call signature.
    """
    forest_params = dict(
        max_depth=max_depth,
        max_layers=layers,
        n_estimators=4,
    )
    return CascadeForestRegressor(**forest_params)

def make_modelBoosted(max_depth, layers, C):
    """Create a ``CascadeBoostingRegressor`` with ``layers`` layers and parameter ``C``.

    All remaining settings (one estimator, early stopping after one round without
    improvement on a 10% validation fraction, learning rate 0.9) are fixed here.
    """
    boosting_params = dict(
        C=C,
        n_layers=layers,
        n_estimators=1,
        max_depth=max_depth,
        n_iter_no_change=1,
        validation_fraction=0.1,
        learning_rate=0.9,
    )
    return CascadeBoostingRegressor(**boosting_params)


# Model factories under comparison; every factory is called as factory(max_depth, layers, C).
models = {"XGB":make_modelXGB,"Cascade Forest":make_modelCascade, "Boosted Forest": make_modelBoosted}

# Seed for the Optuna sampler so the hyper-parameter search proposes the same trials on every
# run. The estimators themselves are not seeded here.
OPTUNA_SEED = 42

# One [model_name, ds_name, depth, mse, mae] row per evaluated (model, dataset variant) pair.
bo_data = []

for model_name in models:
    make_model = models[model_name]
    for ds_name in all_data:
        # `depth` is the variant key under which the split of this dataset is stored.
        for depth in all_data[ds_name]:
            dat = all_data[ds_name][depth]
            x_train = dat["train"]["X"]
            x_test = dat["test"]["X"]
            Y_train = dat["train"]["y"].flatten()
            Y_test = dat["test"]["y"].flatten()

            def objective(trial):
                """Return the mean 3-fold cross-validated MSE on the training split."""
                layers = trial.suggest_int('layers', 5, 15)
                max_depth = trial.suggest_int('max_depth', 1, 2)

                # Only the boosted forest has a tunable C; the other factories ignore it.
                if model_name == "Boosted Forest":
                    C = trial.suggest_int('C', 1, 2000)
                else:
                    C = 0

                kf = KFold(n_splits=3)
                scores = []
                for train_index, test_index in kf.split(x_train):
                    model = make_model(max_depth, layers, C)
                    model.fit(x_train[train_index], Y_train[train_index])
                    y_pred = model.predict(x_train[test_index])
                    # Predictions are flattened so both arguments are 1-D.
                    scores.append(mean_squared_error(Y_train[test_index], y_pred.flatten()))
                return np.asarray(scores).mean()

            study = optuna.create_study(
                direction='minimize',
                sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
            )
            study.optimize(objective, n_trials=20)

            # Refit on the full training split with the best configuration found.
            best_params = study.best_trial.params
            layers = best_params["layers"]
            max_depth = best_params["max_depth"]
            # "C" is only present in the trial parameters for the boosted forest.
            C = best_params.get("C", 0)

            model = make_model(max_depth, layers, C)
            model.fit(x_train, Y_train)

            # Score the refitted model on the held-out test split.
            y_pred = model.predict(x_test).flatten()
            mse_score = mean_squared_error(Y_test, y_pred)
            mae_score = mean_absolute_error(Y_test, y_pred)
            print(model_name,ds_name,depth,mse_score, mae_score, Y_test.min(),Y_test.max())
            bo_data.append([model_name,ds_name,depth,mse_score, mae_score])
    

# Alternative weighting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from ecdfr.gcForest import gcForest
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from boosted_forest import CascadeBoostingRegressor
from deepforest import CascadeForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import optuna

from sklearn.model_selection import KFold
#xgb.set_config(verbosity=2)

def make_modelECDFR(max_depth, layers, resampling_rate, et):
    """Create a ``gcForest`` model from a configuration built out of the given settings.

    ``max_depth`` goes into the single estimator configuration, ``layers`` becomes
    ``max_layers``, ``resampling_rate`` is passed through unchanged and ``et`` becomes
    ``error_threshold``. Training evaluation uses ``mean_squared_error`` and early stopping
    is set to one round.
    """
    estimator_cfg = {"n_fold": 5, "type": None, "max_depth": max_depth}
    config = {
        "estimator_configs": [estimator_cfg],
        "error_threshold": et,
        "resampling_rate": resampling_rate,
        "random_state": None,
        "max_layers": layers,
        "early_stop_rounds": 1,
        "train_evaluation": mean_squared_error,
    }
    return gcForest(config, 1)

# Single model factory evaluated in this cell; called as
# factory(max_depth, layers, resampling_rate, et).
models = {"ecdfr":make_modelECDFR}

# Seed for the Optuna sampler so the hyper-parameter search proposes the same trials on every
# run. The estimator itself is not seeded here.
OPTUNA_SEED = 42

# Objective value reported for a configuration whose cross-validation raised an error, so
# that the study keeps running and steers away from it.
FAILED_TRIAL_PENALTY = 1000000000.

# One [model_name, ds_name, depth, mse, mae] row per evaluated (model, dataset variant) pair.
# NOTE(review): this rebinding discards any rows collected in `bo_data` by an earlier cell —
# confirm that is intended.
bo_data = []

for model_name in models:
    make_model = models[model_name]
    for ds_name in all_data:
        # `depth` is the variant key under which the split of this dataset is stored.
        for depth in all_data[ds_name]:
            dat = all_data[ds_name][depth]
            x_train = dat["train"]["X"]
            x_test = dat["test"]["X"]
            Y_train = dat["train"]["y"].flatten()
            Y_test = dat["test"]["y"].flatten()

            def objective(trial):
                """Return the mean 3-fold cross-validated MSE, or a penalty if fitting fails."""
                layers = trial.suggest_int('layers', 5, 15)
                max_depth = trial.suggest_int('max_depth', 1, 2)
                resampling_rate = trial.suggest_float('resampling_rate', 1.1, 1.8)
                et = trial.suggest_float('et', 0.01, 0.9)

                kf = KFold(n_splits=3)
                scores = []
                try:
                    for train_index, test_index in kf.split(x_train):
                        model = make_model(max_depth, layers, resampling_rate, et)
                        model.fit(x_train[train_index], Y_train[train_index])
                        y_pred = model.predict(x_train[test_index])
                        # Predictions are flattened so both arguments are 1-D.
                        scores.append(mean_squared_error(Y_train[test_index], y_pred.flatten()))
                except Exception as exc:
                    # Catch only ordinary errors (not KeyboardInterrupt/SystemExit) and keep
                    # the reason on the trial instead of discarding it silently.
                    trial.set_user_attr("error", repr(exc))
                    return FAILED_TRIAL_PENALTY
                return np.asarray(scores).mean()

            study = optuna.create_study(
                direction='minimize',
                sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
            )
            study.optimize(objective, n_trials=1000)

            # Refit on the full training split with the best configuration found.
            best_params = study.best_trial.params
            layers = best_params["layers"]
            max_depth = best_params["max_depth"]
            resampling_rate = best_params["resampling_rate"]
            et = best_params["et"]

            model = make_model(max_depth, layers, resampling_rate, et)
            model.fit(x_train, Y_train)

            # Score the refitted model on the held-out test split.
            y_pred = model.predict(x_test).flatten()
            mse_score = mean_squared_error(Y_test, y_pred)
            mae_score = mean_absolute_error(Y_test, y_pred)
            print(model_name,ds_name,depth,mse_score, mae_score, Y_test.min(),Y_test.max())
            bo_data.append([model_name,ds_name,depth,mse_score, mae_score])

In [None]:
import xgboost as xgb
from boosted_forest import CascadeBoostingRegressor
from deepforest import CascadeForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import optuna

from sklearn.model_selection import KFold
#xgb.set_config(verbosity=2)

def make_modelCascade(max_depth, layers, C, wt):
    """Create an adaptive ``CascadeForestRegressor`` with a selectable weighting function.

    ``wt`` (0, 1 or 2) picks the name passed as ``weighting_function``; any other value
    raises ``KeyError``. ``C`` is ignored.

    NOTE(review): this redefines the three-argument ``make_modelCascade`` from the
    "Boosting" cell, so that cell's factory is shadowed once this cell has run.
    """
    weighting_names = {0:"linear", 1:"1-w^1/2", 2:"1-w2"}
    chosen_weighting = weighting_names[wt]
    return CascadeForestRegressor(
        max_depth=max_depth,
        max_layers=layers,
        n_estimators=4,
        adaptive=True,
        weighting_function=chosen_weighting,
    )


# Single model factory evaluated in this cell; called as factory(max_depth, layers, C, wt).
models = {"AWDF":make_modelCascade}

# Seed for the Optuna sampler so the hyper-parameter search proposes the same trials on every
# run. The estimator itself is not seeded here.
OPTUNA_SEED = 42

# The AWDF factory ignores C; a constant is passed only to satisfy its signature.
C = 0

# One [model_name, ds_name, depth, mse, mae] row per evaluated (model, dataset variant) pair.
# NOTE(review): this rebinding discards any rows collected in `bo_data` by an earlier cell —
# confirm that is intended.
bo_data = []

for model_name in models:
    make_model = models[model_name]
    for ds_name in all_data:
        # `depth` is the variant key under which the split of this dataset is stored.
        for depth in all_data[ds_name]:
            dat = all_data[ds_name][depth]
            x_train = dat["train"]["X"]
            x_test = dat["test"]["X"]
            Y_train = dat["train"]["y"].flatten()
            Y_test = dat["test"]["y"].flatten()

            def objective(trial):
                """Return the mean 3-fold cross-validated MSE on the training split."""
                layers = trial.suggest_int('layers', 5, 15)
                max_depth = trial.suggest_int('max_depth', 1, 2)
                wt = trial.suggest_int('weight_function', 0, 2)

                kf = KFold(n_splits=3)
                scores = []
                for train_index, test_index in kf.split(x_train):
                    model = make_model(max_depth, layers, C, wt)
                    model.fit(x_train[train_index], Y_train[train_index])
                    y_pred = model.predict(x_train[test_index])
                    # Predictions are flattened so both arguments are 1-D.
                    scores.append(mean_squared_error(Y_train[test_index], y_pred.flatten()))
                return np.asarray(scores).mean()

            study = optuna.create_study(
                direction='minimize',
                sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
            )
            study.optimize(objective, n_trials=50)

            # Refit on the full training split with the best configuration found.
            best_params = study.best_trial.params
            layers = best_params["layers"]
            max_depth = best_params["max_depth"]
            wt = best_params['weight_function']

            model = make_model(max_depth, layers, C, wt)
            model.fit(x_train, Y_train)

            # Score the refitted model on the held-out test split.
            y_pred = model.predict(x_test).flatten()
            mse_score = mean_squared_error(Y_test, y_pred)
            mae_score = mean_absolute_error(Y_test, y_pred)
            print(model_name,ds_name,depth,mse_score, mae_score, Y_test.min(),Y_test.max())
            bo_data.append([model_name,ds_name,depth,mse_score, mae_score])
    