In [None]:
# https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [47]:
threshold = 1
random_state = 42

In [None]:
# Import data and preprocess
df = pd.read_csv("../test_data/Grounded CPW Leaky Wave antenna/S11 Data.csv")
df = df.drop(df[df['dB(S(1,1)) []'] > 0].index) # Remove all rows with positive s11

# Split into x and y
input_x = df.drop(columns=['dB(S(1,1)) []'], axis=1)
input_y = df[['dB(S(1,1)) []']]

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, random_state=random_state)

In [None]:
def generate_pipeline(scaler, model):
    return (model.__class__.__name__, Pipeline(steps=[('normalize', scaler), ('model', model)]))

In [None]:
# Check if predicted value is threshold amount above or below actual value
def is_in_threshold(actual, pred):
    return pred <= actual + threshold and pred >= actual - threshold

def create_tf_column(results):
    return results.apply(lambda x: is_in_threshold(x['y_test'], x['predictions']), axis=1)

def get_score(y_test, y_pred, **kwargs):
    dataframe = pd.DataFrame(y_test, columns=['y_test'])
    dataframe['predictions'] = y_pred
    return create_tf_column(dataframe).value_counts().get(True) / dataframe.shape[0]

def get_rsme(y_test, y_pred):
    return mean_squared_error(y_test, y_pred, squared=False)

def get_r_squared(y_test, y_pred):
    return r2_score(y_test, y_pred)

In [None]:
custom_scorer = make_scorer(get_score, greater_is_better=True)
scaler = MinMaxScaler(feature_range=(0,1)) # Initialize scaler

In [None]:
rfr_model = RandomForestRegressor(random_state=random_state)
rfr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', rfr_model)])

random_grid = {'model__n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=5)],
               'model__max_depth': [int(x) for x in np.linspace(10, 110, num=5)],
               'model__min_samples_split': [2, 5, 10],
               'model__min_samples_leaf': [1, 2, 4]}

random_forest_search = RandomizedSearchCV(rfr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
random_forest_search.fit(input_x, input_y.values.ravel())
print(random_forest_search.best_params_)

In [None]:
gbr_model = GradientBoostingRegressor(random_state=random_state)
gbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', gbr_model)])

random_grid = {'model__learning_rate': [0.001],
               'model__n_estimators': [1000, 3000, 5000, 7000, 10000],
               'model__max_depth': [1, 2, 3, 5, 7, 10]}

gradient_boosting_search = RandomizedSearchCV(gbr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
gradient_boosting_search.fit(input_x, input_y.values.ravel())
print(gradient_boosting_search.best_params_)

In [None]:
svr_model = SVR()
svr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', svr_model)])

random_grid = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__C': [1,5,10],
    'model__degree': [3,8],
    'model__coef0': [0.01,10,0.5],
    'model__gamma': ['auto','scale']
}
svr_search = RandomizedSearchCV(svr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
svr_search.fit(input_x, input_y.values.ravel())
print(svr_search.best_params_)

In [None]:
xgbr_model = XGBRegressor(random_state=random_state)
xgbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', xgbr_model)])

random_grid = {
    'model__n_estimators': [100, 500, 900, 1100, 1500],
    'model__max_depth': [2, 3, 5, 10, 15],
    'model__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'model__min_child_weight': [1, 2, 3, 4]
    }
xgbr_search = RandomizedSearchCV(xgbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
xgbr_search.fit(input_x, input_y.values.ravel())
print(xgbr_search.best_params_)

In [None]:
cbr_model = CatBoostRegressor(random_state=random_state)
cbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', cbr_model)])

random_grid = {
    'model__depth' : [6,8,10],
    'model__learning_rate' : [0.01, 0.05, 0.1],
    'model__iterations'    : [30, 50, 100]
    }
cbr_search = RandomizedSearchCV(cbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
cbr_search.fit(input_x, input_y.values.ravel())
print(cbr_search.best_params_)

In [None]:
kr_model = KernelRidge()
kr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', kr_model)])

random_grid = {
    'model__alpha': [0.1, 0.5, 1.0, 2.0], 
    'model__kernel': ['linear', 'rbf', 'poly'], 
    'model__gamma': np.logspace(-2, 2, 5)
}

kr_search = RandomizedSearchCV(kr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
kr_search.fit(input_x, input_y.values.ravel())
print(kr_search.best_params_)

In [28]:
abr_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=random_state)
abr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', abr_model)])

random_grid = {'model__estimator__max_depth':[i for i in range(2,11,2)],
              'model__estimator__min_samples_leaf':[5,10],
              'model__n_estimators':[10,50,250,1000],
              'model__learning_rate':[0.01,0.1]}

abr_search = RandomizedSearchCV(abr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
abr_search.fit(input_x, input_y.values.ravel())
print(abr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 2/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 3/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 4/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 5/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 6/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_leaf=10, model__learning_rate=0.01, model__n_estimators=50
[CV 7/10; 1/20] START model__estimator__max_depth=2, model__estimator__min_samples_le

In [39]:
scaler = MinMaxScaler(feature_range=(0,1))
models = [generate_pipeline(scaler, model) for model in [
    CatBoostRegressor(learning_rate=0.1, iterations=100, depth=6),
    XGBRegressor(n_estimators=500, min_child_weight=3, max_depth=15, learning_rate=0.1),
    SVR(kernel='poly', gamma='scale', degree=3, coef0=0.5, C=1),
    GradientBoostingRegressor(n_estimators=10000, max_depth=10, learning_rate=0.001, random_state=random_state),
    RandomForestRegressor(max_depth=110, min_samples_leaf=1, min_samples_split=5, n_estimators=1000, random_state=random_state),
    KernelRidge(kernel='rbf', gamma=0.01, alpha=0.5),
    AdaBoostRegressor(estimator=DecisionTreeRegressor(min_samples_leaf=5, max_depth=10), random_state=random_state, n_estimators=10, learning_rate=0.1)
]]

In [40]:
# Test the random forest model 
model = models[0]
clf = model[1].fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
get_score(y_test.values.ravel(), y_pred)

0:	learn: 6.2543702	total: 2.25ms	remaining: 222ms
1:	learn: 5.9254520	total: 3.85ms	remaining: 189ms
2:	learn: 5.6489125	total: 5.23ms	remaining: 169ms
3:	learn: 5.4115789	total: 6.86ms	remaining: 165ms
4:	learn: 5.1938629	total: 8.15ms	remaining: 155ms
5:	learn: 4.9945163	total: 9.43ms	remaining: 148ms
6:	learn: 4.8585251	total: 10.5ms	remaining: 139ms
7:	learn: 4.7176156	total: 11.9ms	remaining: 137ms
8:	learn: 4.5836222	total: 13ms	remaining: 132ms
9:	learn: 4.4718635	total: 14.2ms	remaining: 127ms
10:	learn: 4.3837140	total: 15.3ms	remaining: 124ms
11:	learn: 4.3108142	total: 16.5ms	remaining: 121ms
12:	learn: 4.2416172	total: 17.9ms	remaining: 120ms
13:	learn: 4.1912116	total: 19ms	remaining: 117ms
14:	learn: 4.1395352	total: 20.2ms	remaining: 114ms
15:	learn: 4.0897465	total: 21.5ms	remaining: 113ms
16:	learn: 4.0470922	total: 23.3ms	remaining: 114ms
17:	learn: 4.0001778	total: 24.4ms	remaining: 111ms
18:	learn: 3.9686577	total: 25.4ms	remaining: 108ms
19:	learn: 3.9371426	total

0.5908658420551856

In [41]:
pd.DataFrame({'y_actual': y_test.values.reshape(-1), 'y_pred': y_pred})

Unnamed: 0,y_actual,y_pred
0,-10.378239,-9.860710
1,-6.207769,-5.628949
2,-12.617678,-15.264085
3,-0.352397,-1.092853
4,-5.473230,-9.212925
...,...,...
1046,-7.475370,-10.700367
1047,-5.701183,-10.218541
1048,-4.898163,-11.779052
1049,-11.964244,-10.201714


In [42]:
results = {}

for name, model in models:
  clf = model.fit(X_train, y_train.values.ravel())
  y_pred = clf.predict(X_test)

  results[name] = {
    'score': get_score(y_test.values.ravel(), y_pred),
    'rsme': get_rsme(y_test.values.ravel(), y_pred),
    'r_squared': get_r_squared(y_test.values.ravel(), y_pred),
  }

0:	learn: 6.2543702	total: 1.52ms	remaining: 151ms
1:	learn: 5.9254520	total: 4.03ms	remaining: 197ms
2:	learn: 5.6489125	total: 5.63ms	remaining: 182ms
3:	learn: 5.4115789	total: 6.82ms	remaining: 164ms
4:	learn: 5.1938629	total: 7.86ms	remaining: 149ms
5:	learn: 4.9945163	total: 8.96ms	remaining: 140ms
6:	learn: 4.8585251	total: 9.98ms	remaining: 133ms
7:	learn: 4.7176156	total: 11.2ms	remaining: 129ms
8:	learn: 4.5836222	total: 12.1ms	remaining: 123ms
9:	learn: 4.4718635	total: 13.5ms	remaining: 122ms
10:	learn: 4.3837140	total: 14.9ms	remaining: 121ms
11:	learn: 4.3108142	total: 16.2ms	remaining: 119ms
12:	learn: 4.2416172	total: 17.4ms	remaining: 116ms
13:	learn: 4.1912116	total: 23.5ms	remaining: 144ms
14:	learn: 4.1395352	total: 24.7ms	remaining: 140ms
15:	learn: 4.0897465	total: 25.7ms	remaining: 135ms
16:	learn: 4.0470922	total: 26.8ms	remaining: 131ms
17:	learn: 4.0001778	total: 27.9ms	remaining: 127ms
18:	learn: 3.9686577	total: 29.3ms	remaining: 125ms
19:	learn: 3.9371426	t

In [43]:
df = pd.DataFrame(results).T

In [44]:
px.bar(df, x=df.index, y='score')

In [45]:
px.bar(df, x=df.index, y='rsme')

In [46]:
px.bar(df, x=df.index, y='r_squared')