In [None]:
# https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
threshold = 1
random_state = 42

In [None]:
# Import data and preprocess
df = pd.read_csv("../test_data/Grounded CPW Leaky Wave antenna/S11 Data.csv")
df = df.drop(df[df['dB(S(1,1)) []'] > 0].index) # Remove all rows with positive s11

# Split into x and y
input_x = df.drop(columns=['dB(S(1,1)) []'], axis=1)
input_y = df[['dB(S(1,1)) []']]

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, random_state=random_state)

In [None]:
def generate_pipeline(scaler, model):
    return (model.__class__.__name__, Pipeline(steps=[('normalize', scaler), ('model', model)]))

In [None]:
# Check if predicted value is threshold amount above or below actual value
def is_in_threshold(actual, pred):
    return pred <= actual + threshold and pred >= actual - threshold

def create_tf_column(results):
    return results.apply(lambda x: is_in_threshold(x['y_test'], x['predictions']), axis=1)

def get_score(y_test, y_pred, **kwargs):
    dataframe = pd.DataFrame(y_test, columns=['y_test'])
    dataframe['predictions'] = y_pred
    return create_tf_column(dataframe).value_counts().get(True) / dataframe.shape[0]

def get_rsme(y_test, y_pred):
    return mean_squared_error(y_test, y_pred, squared=False)

def get_r_squared(y_test, y_pred):
    return r2_score(y_test, y_pred)

In [None]:
custom_scorer = make_scorer(get_score, greater_is_better=True)
scaler = MinMaxScaler(feature_range=(0,1)) # Initialize scaler

In [None]:
rfr_model = RandomForestRegressor(random_state=random_state)
rfr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', rfr_model)])

random_grid = {'model__n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=5)],
               'model__max_depth': [int(x) for x in np.linspace(10, 110, num=5)],
               'model__min_samples_split': [2, 5, 10],
               'model__min_samples_leaf': [1, 2, 4]}

random_forest_search = RandomizedSearchCV(rfr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
random_forest_search.fit(input_x, input_y.values.ravel())
print(random_forest_search.best_params_)

In [None]:
gbr_model = GradientBoostingRegressor(random_state=random_state)
gbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', gbr_model)])

random_grid = {'model__learning_rate': [0.001],
               'model__n_estimators': [1000, 3000, 5000, 7000, 10000],
               'model__max_depth': [1, 2, 3, 5, 7, 10]}

gradient_boosting_search = RandomizedSearchCV(gbr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
gradient_boosting_search.fit(input_x, input_y.values.ravel())
print(gradient_boosting_search.best_params_)

In [None]:
svr_model = SVR()
svr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', svr_model)])

random_grid = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__C': [1,5,10],
    'model__degree': [3,8],
    'model__coef0': [0.01,10,0.5],
    'model__gamma': ['auto','scale']
}
svr_search = RandomizedSearchCV(svr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
svr_search.fit(input_x, input_y.values.ravel())
print(svr_search.best_params_)

In [None]:
xgbr_model = XGBRegressor(random_state=random_state)
xgbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', xgbr_model)])

random_grid = {
    'model__n_estimators': [100, 500, 900, 1100, 1500],
    'model__max_depth': [2, 3, 5, 10, 15],
    'model__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'model__min_child_weight': [1, 2, 3, 4]
    }
xgbr_search = RandomizedSearchCV(xgbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
xgbr_search.fit(input_x, input_y.values.ravel())
print(xgbr_search.best_params_)

In [17]:
cbr_model = CatBoostRegressor(random_state=random_state)
cbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', cbr_model)])

random_grid = {
    'model__depth' : [6,8,10],
    'model__learning_rate' : [0.01, 0.05, 0.1],
    'model__iterations'    : [30, 50, 100]
    }
cbr_search = RandomizedSearchCV(cbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
cbr_search.fit(input_x, input_y.values.ravel())
print(cbr_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 2/10; 1/20] START model__depth=8, model__iterations=100, model__learning_rate=0.05
[CV 6/10; 1/20] START model__depth=8, model__iterations=100, model__learning_rate=0.05
0:	learn: 6.3024238	total: 87.5ms	remaining: 8.66s
1:	learn: 6.1255814	total: 155ms	remaining: 7.58s
0:	learn: 6.4700030	total: 63.5ms	remaining: 6.29s
2:	learn: 5.9620908	total: 218ms	remaining: 7.05s
1:	learn: 6.2744904	total: 106ms	remaining: 5.18s
[CV 7/10; 1/20] START model__depth=8, model__iterations=100, model__learning_rate=0.05
3:	learn: 5.8113976	total: 235ms	remaining: 5.65s
4:	learn: 5.6712065	total: 256ms	remaining: 4.87s
5:	learn: 5.5384174	total: 261ms	remaining: 4.09s
6:	learn: 5.4001464	total: 279ms	remaining: 3.71s
2:	learn: 6.0879528	total: 165ms	remaining: 5.34s
[CV 9/10; 1/20] START model__depth=8, model__iterations=100, model__learning_rate=0.05
7:	learn: 5.2740646	total: 299ms	remaining: 3.44s
3:	learn: 5.9387371	total: 206ms	rema

In [18]:
scaler = MinMaxScaler(feature_range=(0,1))
models = [generate_pipeline(scaler, model) for model in [
    CatBoostRegressor(learning_rate=0.1, iterations=100, depth=6),
    XGBRegressor(n_estimators=500, min_child_weight=3, max_depth=15, learning_rate=0.1),
    SVR(kernel='poly', gamma='scale', degree=3, coef0=0.5, C=1),
    GradientBoostingRegressor(n_estimators=10000, max_depth=10, learning_rate=0.001, random_state=random_state),
    RandomForestRegressor(max_depth=110, min_samples_leaf=1, min_samples_split=5, n_estimators=1000, random_state=random_state)
]]

In [20]:
# Test the random forest model 
model = models[0]
clf = model[1].fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
get_score(y_test.values.ravel(), y_pred)

0:	learn: 6.2543702	total: 1.27ms	remaining: 126ms
1:	learn: 5.9254520	total: 2.29ms	remaining: 112ms
2:	learn: 5.6489125	total: 3.09ms	remaining: 99.9ms
3:	learn: 5.4115789	total: 3.91ms	remaining: 93.8ms
4:	learn: 5.1938629	total: 4.75ms	remaining: 90.3ms
5:	learn: 4.9945163	total: 5.56ms	remaining: 87.1ms
6:	learn: 4.8585251	total: 22ms	remaining: 292ms
7:	learn: 4.7176156	total: 23.2ms	remaining: 267ms
8:	learn: 4.5836222	total: 33.3ms	remaining: 337ms
9:	learn: 4.4718635	total: 35.8ms	remaining: 322ms
10:	learn: 4.3837140	total: 37.2ms	remaining: 301ms
11:	learn: 4.3108142	total: 38.6ms	remaining: 283ms
12:	learn: 4.2416172	total: 39.6ms	remaining: 265ms
13:	learn: 4.1912116	total: 40.9ms	remaining: 251ms
14:	learn: 4.1395352	total: 42.2ms	remaining: 239ms
15:	learn: 4.0897465	total: 43.2ms	remaining: 227ms
16:	learn: 4.0470922	total: 44.2ms	remaining: 216ms
17:	learn: 4.0001778	total: 45.3ms	remaining: 206ms
18:	learn: 3.9686577	total: 46.4ms	remaining: 198ms
19:	learn: 3.9371426

0.3520456707897241

In [21]:
pd.DataFrame({'y_actual': y_test.values.reshape(-1), 'y_pred': y_pred})

Unnamed: 0,y_actual,y_pred
0,-10.378239,-9.860710
1,-6.207769,-5.628949
2,-12.617678,-15.264085
3,-0.352397,-1.092853
4,-5.473230,-9.212925
...,...,...
1046,-7.475370,-10.700367
1047,-5.701183,-10.218541
1048,-4.898163,-11.779052
1049,-11.964244,-10.201714


In [22]:
results = {}

for name, model in models:
  clf = model.fit(X_train, y_train.values.ravel())
  y_pred = clf.predict(X_test)

  results[name] = {
    'score': get_score(y_test.values.ravel(), y_pred),
    'rsme': get_rsme(y_test.values.ravel(), y_pred),
    'r_squared': get_r_squared(y_test.values.ravel(), y_pred),
  }

0:	learn: 6.2543702	total: 555us	remaining: 55ms
1:	learn: 5.9254520	total: 1.25ms	remaining: 61ms
2:	learn: 5.6489125	total: 1.67ms	remaining: 54.1ms
3:	learn: 5.4115789	total: 2.1ms	remaining: 50.3ms
4:	learn: 5.1938629	total: 2.54ms	remaining: 48.3ms
5:	learn: 4.9945163	total: 3.01ms	remaining: 47.1ms
6:	learn: 4.8585251	total: 3.42ms	remaining: 45.5ms
7:	learn: 4.7176156	total: 3.86ms	remaining: 44.4ms
8:	learn: 4.5836222	total: 4.29ms	remaining: 43.4ms
9:	learn: 4.4718635	total: 4.73ms	remaining: 42.6ms
10:	learn: 4.3837140	total: 18.8ms	remaining: 153ms
11:	learn: 4.3108142	total: 88.4ms	remaining: 648ms
12:	learn: 4.2416172	total: 90ms	remaining: 602ms
13:	learn: 4.1912116	total: 107ms	remaining: 660ms
14:	learn: 4.1395352	total: 118ms	remaining: 667ms
15:	learn: 4.0897465	total: 119ms	remaining: 625ms
16:	learn: 4.0470922	total: 121ms	remaining: 588ms
17:	learn: 4.0001778	total: 123ms	remaining: 560ms
18:	learn: 3.9686577	total: 124ms	remaining: 531ms
19:	learn: 3.9371426	total

In [None]:
df = pd.DataFrame(results).T

In [None]:
px.bar(df, x=df.index, y='score')

In [None]:
px.bar(df, x=df.index, y='rsme')

In [None]:
px.bar(df, x=df.index, y='r_squared')