In [1]:
# https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
threshold = 1
random_state = 42

In [4]:
# Import data and preprocess
df = pd.read_csv("../test_data/new leaky wave/S11_V1.csv")
df = df.drop(df[df['dB(S(1,1)) []'] > 0].index) # Remove all rows with positive s11

# Split into x and y
input_x = df.drop(columns=['dB(S(1,1)) []'], axis=1)
input_y = df[['dB(S(1,1)) []']]

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, random_state=random_state)

In [5]:
def generate_pipeline(scaler, model):
    return (model.__class__.__name__, Pipeline(steps=[('normalize', scaler), ('model', model)]))

In [6]:
# Check if predicted value is threshold amount above or below actual value
def is_in_threshold(actual, pred):
    return pred <= actual + threshold and pred >= actual - threshold

def create_tf_column(results):
    return results.apply(lambda x: is_in_threshold(x['y_test'], x['predictions']), axis=1)

def get_score(y_test, y_pred, **kwargs):
    dataframe = pd.DataFrame(y_test, columns=['y_test'])
    dataframe['predictions'] = y_pred
    return create_tf_column(dataframe).value_counts().get(True) / dataframe.shape[0]

def get_rsme(y_test, y_pred):
    return mean_squared_error(y_test, y_pred, squared=False)

def get_r_squared(y_test, y_pred):
    return r2_score(y_test, y_pred)

In [7]:
custom_scorer = make_scorer(get_score, greater_is_better=True)
scaler = StandardScaler() # Initialize scaler

In [8]:
dtr_model = DecisionTreeRegressor(random_state=random_state)
dtr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', dtr_model)])

random_grid = {
    'model__splitter': ('best', 'random'),
    'model__max_depth': list(range(1, 20)),
    'model__min_samples_split': [2, 3, 4],
    'model__min_samples_leaf': list(range(1, 20)),
}

decision_tree_search = RandomizedSearchCV(dtr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
decision_tree_search.fit(input_x, input_y.values.ravel())
print(decision_tree_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random
[CV 4/10; 1/20] START model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random
[CV 2/10; 1/20] START model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random
[CV 1/10; 1/20] END model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random;, score=0.460 total time=   0.0s
[CV 3/10; 1/20] START model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random
[CV 5/10; 1/20] START model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random
[CV 4/10; 1/20] END model__max_depth=19, model__min_samples_leaf=10, model__min_samples_split=2, model__splitter=random;, score=0.391 total time=   0.0s
[CV 2/10; 1/20] END m

In [9]:
rfr_model = RandomForestRegressor(random_state=random_state)
rfr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', rfr_model)])

random_grid = {'model__n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=5)],
               'model__max_depth': [int(x) for x in np.linspace(10, 110, num=5)],
               'model__min_samples_split': [2, 5, 10],
               'model__min_samples_leaf': [1, 2, 4]}

random_forest_search = RandomizedSearchCV(rfr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
random_forest_search.fit(input_x, input_y.values.ravel())
print(random_forest_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 2/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 3/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 7/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 4/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 5/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 8/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_samples_split=2, model__n_estimators=550
[CV 6/10; 1/20] START model__max_depth=60, model__min_samples_leaf=4, model__min_sa

In [10]:
gbr_model = GradientBoostingRegressor(random_state=random_state)
gbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', gbr_model)])

random_grid = {'model__learning_rate': [0.001],
               'model__n_estimators': [1000, 3000, 5000, 7000, 10000],
               'model__max_depth': [1, 2, 3, 5, 7, 10]}

gradient_boosting_search = RandomizedSearchCV(gbr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
gradient_boosting_search.fit(input_x, input_y.values.ravel())
print(gradient_boosting_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 2/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 3/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 4/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 5/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 6/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 7/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 8/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 9/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, model__n_estimators=10000
[CV 10/10; 1/20] START model__learning_rate=0.001, model__max_depth=5, mo



[CV 3/10; 9/20] END model__learning_rate=0.001, model__max_depth=10, model__n_estimators=7000;, score=0.626 total time= 1.2min
[CV 6/10; 9/20] END model__learning_rate=0.001, model__max_depth=10, model__n_estimators=7000;, score=0.668 total time= 1.2min
[CV 5/10; 9/20] END model__learning_rate=0.001, model__max_depth=10, model__n_estimators=7000;, score=0.602 total time= 1.2min
[CV 10/10; 9/20] END model__learning_rate=0.001, model__max_depth=10, model__n_estimators=7000;, score=0.520 total time= 1.2min
[CV 8/10; 9/20] END model__learning_rate=0.001, model__max_depth=10, model__n_estimators=7000;, score=0.503 total time= 1.2min
[CV 3/10; 14/20] START model__learning_rate=0.001, model__max_depth=3, model__n_estimators=7000
[CV 6/10; 14/20] START model__learning_rate=0.001, model__max_depth=3, model__n_estimators=7000
[CV 4/10; 14/20] START model__learning_rate=0.001, model__max_depth=3, model__n_estimators=7000
[CV 5/10; 14/20] START model__learning_rate=0.001, model__max_depth=3, model

In [11]:
svr_model = SVR()
svr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', svr_model)])

random_grid = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__C': [1,5,10],
    'model__degree': [3,8],
    'model__coef0': [0.01,10,0.5],
    'model__gamma': ['auto','scale']
}
svr_search = RandomizedSearchCV(svr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
svr_search.fit(input_x, input_y.values.ravel())
print(svr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 2/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 3/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 4/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 5/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 6/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 7/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 8/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamma=auto, model__kernel=rbf
[CV 9/10; 1/20] START model__C=10, model__coef0=0.5, model__degree=8, model__gamm

KeyboardInterrupt: 

In [12]:
xgbr_model = XGBRegressor(random_state=random_state)
xgbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', xgbr_model)])

random_grid = {
    'model__n_estimators': [100, 500, 900, 1100, 1500],
    'model__max_depth': [2, 3, 5, 10, 15],
    'model__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'model__min_child_weight': [1, 2, 3, 4]
    }
xgbr_search = RandomizedSearchCV(xgbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
xgbr_search.fit(input_x, input_y.values.ravel())
print(xgbr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 2/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 8/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 10/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 4/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 3/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 7/10; 1/20] START model__learning_rate=0.2, model__max_depth=3, model__min_child_weight=4, model__n_estimators=1500
[CV 4/10; 2/20] START model__learning_rate=0.1, model__max_depth=5, model__min_child_weight=1, m

In [13]:
cbr_model = CatBoostRegressor(random_state=random_state)
cbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', cbr_model)])

random_grid = {
    'model__depth' : [6,8,10],
    'model__learning_rate' : [0.01, 0.05, 0.1],
    'model__iterations'    : [30, 50, 100]
    }
cbr_search = RandomizedSearchCV(cbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
cbr_search.fit(input_x, input_y.values.ravel())
print(cbr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__depth=6, model__iterations=30, model__learning_rate=0.1
0:	learn: 4.7652202	total: 51.4ms	remaining: 1.49s
1:	learn: 4.4875020	total: 52.9ms	remaining: 741ms
2:	learn: 4.2484611	total: 54.4ms	remaining: 490ms
3:	learn: 4.0468582	total: 57.1ms	remaining: 371ms
4:	learn: 3.8808050	total: 58.4ms	remaining: 292ms
5:	learn: 3.7230482	total: 59.4ms	remaining: 237ms
6:	learn: 3.5780928	total: 60.4ms	remaining: 199ms
7:	learn: 3.4553295	total: 61.3ms	remaining: 169ms
8:	learn: 3.3505358	total: 62.4ms	remaining: 146ms
9:	learn: 3.2644528	total: 63.3ms	remaining: 127ms
10:	learn: 3.1920935	total: 64.1ms	remaining: 111ms
11:	learn: 3.1066576	total: 65ms	remaining: 97.6ms
12:	learn: 3.0421763	total: 66ms	remaining: 86.3ms
13:	learn: 2.9994391	total: 67ms	remaining: 76.6ms
14:	learn: 2.9566912	total: 68ms	remaining: 68ms
15:	learn: 2.8978289	total: 69ms	remaining: 60.4ms
16:	learn: 2.8691523	total: 69.9ms	re



92:	learn: 2.3860873	total: 2.17s	remaining: 163ms
27:	learn: 3.0558762	total: 838ms	remaining: 2.15s
44:	learn: 2.4347752	total: 845ms	remaining: 93.9ms
28:	learn: 3.0274642	total: 840ms	remaining: 2.06s
45:	learn: 2.4266027	total: 846ms	remaining: 73.6ms
29:	learn: 3.0067406	total: 841ms	remaining: 1.96s
46:	learn: 2.4193024	total: 848ms	remaining: 54.1ms
30:	learn: 2.9887426	total: 843ms	remaining: 1.88s
47:	learn: 2.4068421	total: 849ms	remaining: 35.4ms
[CV 6/10; 6/20] START model__depth=8, model__iterations=100, model__learning_rate=0.1
31:	learn: 2.9720761	total: 844ms	remaining: 1.79s
48:	learn: 2.3980264	total: 850ms	remaining: 17.4ms
11:	learn: 3.0939629	total: 303ms	remaining: 1.05s
49:	learn: 2.3919116	total: 852ms	remaining: 0us
12:	learn: 3.0298410	total: 305ms	remaining: 940ms
13:	learn: 2.9868771	total: 306ms	remaining: 849ms
38:	learn: 2.5311729	total: 978ms	remaining: 276ms
39:	learn: 2.5207500	total: 980ms	remaining: 245ms
40:	learn: 2.5094778	total: 981ms	remaining:

In [14]:
abr_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=random_state)
abr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', abr_model)])

random_grid = {'model__estimator__max_depth':[i for i in range(2,11,2)],
              'model__estimator__min_samples_leaf':[5,10],
              'model__n_estimators':[10,50,250,1000],
              'model__learning_rate':[0.01,0.1]}

abr_search = RandomizedSearchCV(abr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
abr_search.fit(input_x, input_y.values.ravel())
print(abr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 3/10; 1/20] START model__estimator__max_depth=6, model__estimator__min_samples_leaf=10, model__learning_rate=0.1, model__n_estimators=250
[CV 6/10; 1/20] START model__estimator__max_depth=6, model__estimator__min_samples_leaf=10, model__learning_rate=0.1, model__n_estimators=250
[CV 2/10; 2/20] START model__estimator__max_depth=4, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=1000
[CV 3/10; 2/20] START model__estimator__max_depth=4, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=1000
[CV 6/10; 2/20] START model__estimator__max_depth=4, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=1000
[CV 5/10; 2/20] START model__estimator__max_depth=4, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=1000[CV 4/10; 2/20] START model__estimator__max_depth=4, model__estimator__min_sam



[CV 9/10; 6/20] END model__estimator__max_depth=6, model__estimator__min_samples_leaf=10, model__learning_rate=0.1, model__n_estimators=1000;, score=0.221 total time=   7.3s
[CV 10/10; 10/20] START model__estimator__max_depth=8, model__estimator__min_samples_leaf=5, model__learning_rate=0.01, model__n_estimators=250
[CV 9/10; 10/20] START model__estimator__max_depth=8, model__estimator__min_samples_leaf=5, model__learning_rate=0.01, model__n_estimators=250
[CV 1/10; 10/20] END model__estimator__max_depth=8, model__estimator__min_samples_leaf=5, model__learning_rate=0.01, model__n_estimators=250;, score=0.499 total time=   2.8s
[CV 2/10; 10/20] END model__estimator__max_depth=8, model__estimator__min_samples_leaf=5, model__learning_rate=0.01, model__n_estimators=250;, score=0.498 total time=   2.8s
[CV 1/10; 11/20] START model__estimator__max_depth=8, model__estimator__min_samples_leaf=10, model__learning_rate=0.1, model__n_estimators=250
[CV 3/10; 10/20] END model__estimator__max_depth

In [None]:
scaler = StandardScaler()
models = [generate_pipeline(scaler, model) for model in [
    CatBoostRegressor(learning_rate=0.1, iterations=100, depth=10),
    XGBRegressor(n_estimators=100, min_child_weight=2, max_depth=10, learning_rate=0.15),
    # SVR(kernel='rbf', gamma='scale', degree=3, coef0=0.01, C=5),
    GradientBoostingRegressor(n_estimators=10000, max_depth=7, learning_rate=0.001, random_state=random_state),
    RandomForestRegressor(max_depth=85, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, random_state=random_state),
    DecisionTreeRegressor(splitter='random', min_samples_split=2, min_samples_leaf=3, max_depth=18, random_state=random_state),
    AdaBoostRegressor(estimator=DecisionTreeRegressor(min_samples_leaf=5, max_depth=10), random_state=random_state, n_estimators=50, learning_rate=0.1)
]]

In [None]:
# Test the random forest model 
model = models[-2]
clf = model[1].fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
get_score(y_test.values.ravel(), y_pred)

In [None]:
pd.DataFrame({'y_actual': y_test.values.reshape(-1), 'y_pred': y_pred})

In [None]:
results = {}

for name, model in tqdm(models):
  clf = model.fit(X_train, y_train.values.ravel())
  y_pred = clf.predict(X_test)

  results[name] = {
    'score': get_score(y_test.values.ravel(), y_pred),
    'rsme': get_rsme(y_test.values.ravel(), y_pred),
    'r_squared': get_r_squared(y_test.values.ravel(), y_pred),
    'y_pred': y_pred
  }

In [None]:
df = pd.DataFrame(results).T

In [None]:
px.bar(df, x=df.index, y='score')

In [None]:
px.bar(df, x=df.index, y='rsme')

In [None]:
px.bar(df, x=df.index, y='r_squared')

In [None]:
from matplotlib import rc
# Make the font the same as latex 
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)

In [None]:
# s11_sources = []
# for model_name in ["CatBoost", "XGB", "GradientBoosting", "RandomForest", "DecisionTree", "AdaBoost"]:
#     s11_sources += [model_name] * len(y_test.values.tolist())

# s11_colors = []
# for color in ["red", "blue", "green", "yellow", "orange"]:
#     s11_colors += [color] * len(y_test.values.tolist())

In [None]:
# scatter_plot_df = pd.DataFrame({
#     's11_true': y_test.values.tolist() * 6,
#     's11_prediction': np.concatenate((sklearn_predictions,test_predictions.reshape(-1))),
#     's11_source': s11_sources,
#     's11_color': s11_colors
# })

In [None]:
# import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(10,7))
# ax1 = fig.add_subplot(111)
# for model_name in ["XGBRegressor", "RandomForestRegressor"]:
#     model_results = results[model_name]
#     ax1.scatter(y_test, model_results['y_pred'], label=model_name, marker='.')
# plt.xlabel('True $S_{11}$')
# plt.ylabel('Predicted $S_{11}$')
# plt.axline((-30,-30), (10,10))
# plt.legend(loc='upper left')
# plt.show()
# # plt.savefig("")