In [None]:
# https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

In [None]:
threshold = 1
random_state = 42

In [None]:
# Import data and preprocess
df = pd.read_csv("../test_data/patch_antenna/Patch Antenna S11 Data.csv")
df = df.drop(df[df['dB(S(1,1)) []'] > 0].index) # Remove all rows with positive s11

# Split into x and y
input_x = df.drop(columns=['dB(S(1,1)) []'], axis=1)
input_y = df[['dB(S(1,1)) []']]

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, random_state=random_state)

In [None]:
def generate_pipeline(scaler, model):
    return (model.__class__.__name__, Pipeline(steps=[('normalize', scaler), ('model', model)]))

In [None]:
# Check if predicted value is threshold amount above or below actual value
def is_in_threshold(actual, pred):
    return pred <= actual + threshold and pred >= actual - threshold

def create_tf_column(results):
    return results.apply(lambda x: is_in_threshold(x['y_test'], x['predictions']), axis=1)

def get_score(y_test, y_pred, **kwargs):
    dataframe = pd.DataFrame(y_test, columns=['y_test'])
    dataframe['predictions'] = y_pred
    return create_tf_column(dataframe).value_counts().get(True) / dataframe.shape[0]

def get_rsme(y_test, y_pred):
    return mean_squared_error(y_test, y_pred, squared=False)

def get_r_squared(y_test, y_pred):
    return r2_score(y_test, y_pred)

In [None]:
custom_scorer = make_scorer(get_score, greater_is_better=True)
scaler = MinMaxScaler(feature_range=(0,1)) # Initialize scaler

In [63]:
decision_tree_search.best_params_

{'model__splitter': 'best',
 'model__min_samples_split': 2,
 'model__min_samples_leaf': 1,
 'model__max_depth': 17}

In [48]:
dtr_model = DecisionTreeRegressor(random_state=random_state)
dtr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', dtr_model)])

random_grid = {
    'model__splitter': ('best', 'random'),
    'model__max_depth': list(range(1, 20)),
    'model__min_samples_split': [2, 3, 4],
    'model__min_samples_leaf': list(range(1, 20)),
}

decision_tree_search = RandomizedSearchCV(dtr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
decision_tree_search.fit(input_x, input_y.values.ravel())
print(decision_tree_search.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10; 1/20] START model__max_depth=19, model__min_samples_leaf=2, model__min_samples_split=4, model__splitter=random
[CV 1/10; 1/20] END model__max_depth=19, model__min_samples_leaf=2, model__min_samples_split=4, model__splitter=random;, score=0.833 total time=   0.1s
[CV 9/10; 1/20] START model__max_depth=19, model__min_samples_leaf=2, model__min_samples_split=4, model__splitter=random
[CV 9/10; 1/20] END model__max_depth=19, model__min_samples_leaf=2, model__min_samples_split=4, model__splitter=random;, score=0.775 total time=   0.1s
[CV 10/10; 2/20] START model__max_depth=17, model__min_samples_leaf=10, model__min_samples_split=3, model__splitter=random
[CV 10/10; 2/20] END model__max_depth=17, model__min_samples_leaf=10, model__min_samples_split=3, model__splitter=random;, score=0.740 total time=   0.1s
[CV 2/10; 3/20] START model__max_depth=10, model__min_samples_leaf=5, model__min_samples_split=4, model__splitter=

In [None]:
rfr_model = RandomForestRegressor(random_state=random_state)
rfr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', rfr_model)])

random_grid = {'model__n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=5)],
               'model__max_depth': [int(x) for x in np.linspace(10, 110, num=5)],
               'model__min_samples_split': [2, 5, 10],
               'model__min_samples_leaf': [1, 2, 4]}

random_forest_search = RandomizedSearchCV(rfr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
random_forest_search.fit(input_x, input_y.values.ravel())
print(random_forest_search.best_params_)

In [None]:
gbr_model = GradientBoostingRegressor(random_state=random_state)
gbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', gbr_model)])

random_grid = {'model__learning_rate': [0.001],
               'model__n_estimators': [1000, 3000, 5000, 7000, 10000],
               'model__max_depth': [1, 2, 3, 5, 7, 10]}

gradient_boosting_search = RandomizedSearchCV(gbr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
gradient_boosting_search.fit(input_x, input_y.values.ravel())
print(gradient_boosting_search.best_params_)

In [None]:
svr_model = SVR()
svr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', svr_model)])

random_grid = {
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model__C': [1,5,10],
    'model__degree': [3,8],
    'model__coef0': [0.01,10,0.5],
    'model__gamma': ['auto','scale']
}
svr_search = RandomizedSearchCV(svr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
svr_search.fit(input_x, input_y.values.ravel())
print(svr_search.best_params_)

In [None]:
xgbr_model = XGBRegressor(random_state=random_state)
xgbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', xgbr_model)])

random_grid = {
    'model__n_estimators': [100, 500, 900, 1100, 1500],
    'model__max_depth': [2, 3, 5, 10, 15],
    'model__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'model__min_child_weight': [1, 2, 3, 4]
    }
xgbr_search = RandomizedSearchCV(xgbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
xgbr_search.fit(input_x, input_y.values.ravel())
print(xgbr_search.best_params_)

In [None]:
cbr_model = CatBoostRegressor(random_state=random_state)
cbr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', cbr_model)])

random_grid = {
    'model__depth' : [6,8,10],
    'model__learning_rate' : [0.01, 0.05, 0.1],
    'model__iterations'    : [30, 50, 100]
    }
cbr_search = RandomizedSearchCV(cbr_pipeline, random_grid, n_iter=20, cv=10, verbose=10, scoring=custom_scorer, n_jobs=-1)
cbr_search.fit(input_x, input_y.values.ravel())
print(cbr_search.best_params_)

In [None]:
abr_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=random_state)
abr_pipeline = Pipeline(steps=[('normalize', scaler), ('model', abr_model)])

random_grid = {'model__estimator__max_depth':[i for i in range(2,11,2)],
              'model__estimator__min_samples_leaf':[5,10],
              'model__n_estimators':[10,50,250,1000],
              'model__learning_rate':[0.01,0.1]}

abr_search = RandomizedSearchCV(abr_pipeline, random_grid, n_iter=20, scoring=custom_scorer, cv=10, verbose=10, n_jobs=-1)
abr_search.fit(input_x, input_y.values.ravel())
print(abr_search.best_params_)

In [68]:
scaler = MinMaxScaler(feature_range=(0,1))
models = [generate_pipeline(scaler, model) for model in [
    CatBoostRegressor(learning_rate=0.1, iterations=100, depth=10),
    XGBRegressor(n_estimators=500, min_child_weight=1, max_depth=5, learning_rate=0.1),
    SVR(kernel='rbf', gamma='scale', degree=3, coef0=0.01, C=5),
    GradientBoostingRegressor(n_estimators=10000, max_depth=7, learning_rate=0.001, random_state=random_state),
    RandomForestRegressor(max_depth=110, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, random_state=random_state),
    DecisionTreeRegressor(splitter='best', min_samples_split=2, min_samples_leaf=1, max_depth=17, random_state=random_state),
    AdaBoostRegressor(estimator=DecisionTreeRegressor(min_samples_leaf=10, max_depth=10), random_state=random_state, n_estimators=250, learning_rate=0.01)
]]

In [69]:
# Test the random forest model 
model = models[-2]
clf = model[1].fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
get_score(y_test.values.ravel(), y_pred)

0.9079886574753104

In [70]:
pd.DataFrame({'y_actual': y_test.values.reshape(-1), 'y_pred': y_pred})

Unnamed: 0,y_actual,y_pred
0,-1.384846,-1.259927
1,-0.649111,-0.617359
2,-0.508189,-0.504170
3,-0.250716,-0.246170
4,-3.410015,-3.160354
...,...,...
10222,-1.973598,-1.700793
10223,-0.449887,-0.432108
10224,-1.249970,-1.299500
10225,-1.688213,-2.032797


In [71]:
results = {}

for name, model in tqdm(models):
  clf = model.fit(X_train, y_train.values.ravel())
  y_pred = clf.predict(X_test)

  results[name] = {
    'score': get_score(y_test.values.ravel(), y_pred),
    'rsme': get_rsme(y_test.values.ravel(), y_pred),
    'r_squared': get_r_squared(y_test.values.ravel(), y_pred),
  }

  0%|          | 0/7 [00:00<?, ?it/s]

0:	learn: 3.8299390	total: 5.42ms	remaining: 537ms
1:	learn: 3.5885898	total: 10.1ms	remaining: 494ms
2:	learn: 3.3778001	total: 14.5ms	remaining: 468ms
3:	learn: 3.1852642	total: 18.8ms	remaining: 452ms
4:	learn: 2.9988730	total: 22.8ms	remaining: 433ms
5:	learn: 2.8424128	total: 27.1ms	remaining: 425ms
6:	learn: 2.6954057	total: 31.3ms	remaining: 415ms
7:	learn: 2.5701433	total: 35.3ms	remaining: 406ms
8:	learn: 2.4442044	total: 38.8ms	remaining: 393ms
9:	learn: 2.3407825	total: 42.7ms	remaining: 384ms
10:	learn: 2.2467555	total: 46.7ms	remaining: 378ms
11:	learn: 2.1665394	total: 50.3ms	remaining: 369ms
12:	learn: 2.0888742	total: 54.4ms	remaining: 364ms
13:	learn: 2.0266157	total: 57.9ms	remaining: 355ms
14:	learn: 1.9734950	total: 61.4ms	remaining: 348ms
15:	learn: 1.9280668	total: 65.2ms	remaining: 342ms
16:	learn: 1.8678243	total: 68.7ms	remaining: 335ms
17:	learn: 1.8289156	total: 72.2ms	remaining: 329ms
18:	learn: 1.7836635	total: 75.7ms	remaining: 323ms
19:	learn: 1.7524863	t

 14%|█▍        | 1/7 [00:00<00:02,  2.38it/s]

59:	learn: 1.1886346	total: 204ms	remaining: 136ms
60:	learn: 1.1825546	total: 208ms	remaining: 133ms
61:	learn: 1.1738869	total: 211ms	remaining: 129ms
62:	learn: 1.1649301	total: 214ms	remaining: 125ms
63:	learn: 1.1588161	total: 217ms	remaining: 122ms
64:	learn: 1.1550069	total: 220ms	remaining: 118ms
65:	learn: 1.1474447	total: 223ms	remaining: 115ms
66:	learn: 1.1409429	total: 226ms	remaining: 111ms
67:	learn: 1.1377757	total: 229ms	remaining: 108ms
68:	learn: 1.1348324	total: 232ms	remaining: 104ms
69:	learn: 1.1293322	total: 235ms	remaining: 101ms
70:	learn: 1.1222720	total: 238ms	remaining: 97.2ms
71:	learn: 1.1174277	total: 241ms	remaining: 93.8ms
72:	learn: 1.1143670	total: 245ms	remaining: 90.4ms
73:	learn: 1.1111303	total: 248ms	remaining: 87ms
74:	learn: 1.1079193	total: 251ms	remaining: 83.6ms
75:	learn: 1.1046187	total: 254ms	remaining: 80.1ms
76:	learn: 1.1016148	total: 257ms	remaining: 76.7ms
77:	learn: 1.0944583	total: 260ms	remaining: 73.3ms
78:	learn: 1.0884155	tota

100%|██████████| 7/7 [04:17<00:00, 36.76s/it]


In [72]:
df = pd.DataFrame(results).T

In [73]:
px.bar(df, x=df.index, y='score')

In [74]:
px.bar(df, x=df.index, y='rsme')

In [76]:
px.bar(df, x=df.index, y='r_squared')