In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import mean_squared_error, r2_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Seed passed to both train_test_split and RandomForestRegressor in the cells below
random_state = 20
threshold = 1 # Accuracy scoring threshold: a prediction counts as a hit when it lies within +/- this value of the true target

In [3]:
# Load the S11 sweep, keeping only the rows whose S11 value is not positive
df = (
    pd.read_csv("../test_data/new leaky wave/S11_V1.csv")
    .loc[lambda frame: ~(frame['dB(S(1,1)) []'] > 0)]
)

In [4]:
# Display the filtered frame (long frames are shown truncated to their first and last rows)
df

Unnamed: 0,cpw_in [mm],feed_l [mm],ground_w [mm],patch_ground_w [mm],patch_l [mm],Freq [GHz],"dB(S(1,1)) []"
0,1.5,3.0,0.75,0.8,3.0,11.00,-1.054485
1,1.5,3.0,0.75,0.8,3.0,11.09,-1.122093
2,1.5,3.0,0.75,0.8,3.0,11.18,-1.187705
3,1.5,3.0,0.75,0.8,3.0,11.27,-1.249970
4,1.5,3.0,0.75,0.8,3.0,11.36,-1.308502
...,...,...,...,...,...,...,...
9085,2.5,3.0,1.25,1.2,4.0,19.64,-1.282778
9086,2.5,3.0,1.25,1.2,4.0,19.73,-1.885453
9087,2.5,3.0,1.25,1.2,4.0,19.82,-3.995892
9088,2.5,3.0,1.25,1.2,4.0,19.91,-4.227120


In [5]:
# Split into features (x) and target (y)
input_x = df.drop(columns=['dB(S(1,1)) []'])  # `columns=` already selects the column axis
# Select the target with single brackets so it is a 1-D Series. A one-column
# DataFrame makes scikit-learn warn that a column-vector y was passed where a
# 1d array was expected when the forest is fitted.
input_y = df['dB(S(1,1)) []']

In [6]:
# Split data into training and testing.
# No test_size/train_size is given, so scikit-learn's default split proportion applies;
# random_state fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(input_x, input_y, random_state=random_state)

In [7]:
# Two-step pipeline: scale the features, then regress with a random forest
scaler = StandardScaler()
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=85,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=random_state,
)
pipeline = Pipeline(steps=[('normalize', scaler), ('model', model)])

In [8]:
%%time
# Train the pipeline on the training split (prediction happens in the next cell).
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `pipeline_fit`
# should be the same object as `pipeline` — confirm before relying on it.
pipeline_fit = pipeline.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


CPU times: user 6.57 s, sys: 161 ms, total: 6.73 s
Wall time: 6.73 s


In [9]:
%%time
# Predict the target for the held-out test features with the fitted pipeline
predictions = pipeline_fit.predict(X_test)

CPU times: user 433 ms, sys: 0 ns, total: 433 ms
Wall time: 431 ms


In [10]:
# Check if predicted value is threshold amount above or below actual value
def is_in_threshold(actual, pred, tolerance=None):
    """Return True when `pred` lies inside the inclusive band `actual` +/- tolerance.

    Parameters
    ----------
    actual : true value.
    pred : predicted value.
    tolerance : half-width of the accepted band. When omitted, the
        notebook-level `threshold` setting is used, so existing two-argument
        calls behave exactly as before.
    """
    if tolerance is None:
        tolerance = threshold  # fall back to the notebook-wide setting
    # Both bounds are inclusive
    return actual - tolerance <= pred <= actual + tolerance

In [11]:
# Independent copy of the test features.
# NOTE(review): no cell visible here reads this module-level `results`; the helper
# functions below operate on the frame passed to them as an argument — confirm before removing.
results = X_test.copy()
# create_tf_column: one boolean per row, True when the prediction is within `threshold` of the test value
def create_tf_column(results, tolerance=None):
    """Flag each row whose prediction is within the tolerance of its true value.

    Parameters
    ----------
    results : DataFrame with numeric 'y_test' and 'predictions' columns.
    tolerance : half-width of the accepted band; defaults to the
        notebook-level `threshold` when omitted.

    Returns
    -------
    Boolean Series aligned with `results.index`; True where
    y_test - tolerance <= predictions <= y_test + tolerance (both inclusive),
    the same band is_in_threshold checks for a single pair of values.
    """
    if tolerance is None:
        tolerance = threshold  # fall back to the notebook-wide setting
    lower = results['y_test'] - tolerance
    upper = results['y_test'] + tolerance
    # Vectorized comparison instead of a Python-level call per row; also
    # yields an empty boolean Series when `results` has no rows.
    return results['predictions'].between(lower, upper)

In [12]:
# Calculate accuracy of model by number of predictions that are within threshold value above or below the test value for each row
def get_score(X_test, y_test, clf_dt):
    """Return the fraction of rows whose prediction is within the threshold of the true value.

    Parameters
    ----------
    X_test : features to predict on; wrapped in a DataFrame using the
        notebook-level `input_x` column labels.
    y_test : true target values, one per row of X_test; must expose `.values`.
    clf_dt : fitted estimator exposing `predict`.

    Returns
    -------
    Hit rate between 0 and 1; 0 when no prediction falls inside the band.
    """
    predictions = clf_dt.predict(X_test)
    dataframe = pd.DataFrame(X_test.copy(), columns=input_x.columns)
    dataframe['y_test'] = y_test.values
    dataframe['predictions'] = predictions
    # value_counts() has no True entry when every prediction misses, so default
    # the count to 0 instead of dividing None by the row count.
    hits = create_tf_column(dataframe).value_counts().get(True, 0)
    return hits / dataframe.shape[0]

In [13]:
# Inspect the held-out feature rows produced by the split
X_test

Unnamed: 0,cpw_in [mm],feed_l [mm],ground_w [mm],patch_ground_w [mm],patch_l [mm],Freq [GHz]
2065,2.5,3.0,0.75,1.2,3.0,15.05
631,1.5,3.0,1.25,0.8,3.0,13.25
3709,2.0,3.0,0.75,0.8,3.5,17.57
5505,2.0,3.0,0.75,1.2,3.5,15.59
1005,1.5,3.0,0.75,1.0,3.0,19.64
...,...,...,...,...,...,...
1331,2.0,3.0,1.00,1.0,3.0,12.62
5415,1.5,3.0,0.75,1.2,3.5,16.58
6204,2.5,3.0,1.25,1.2,3.5,14.87
3856,1.5,3.0,1.00,0.8,3.5,12.62


In [14]:
print(f"Score within +-{threshold}: {get_score(X_test, y_test, pipeline)}")
# Take the square root of the MSE explicitly: the `squared=False` shortcut is
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on any version.
print("RMSE:", mean_squared_error(y_test, predictions) ** 0.5)
print("R^2:", r2_score(y_test, predictions))

Score within +-1: 0.7030356357237132
RMSE: 1.8303979298124915
R^2: 0.8741831590173958


In [15]:
# Persist the pipeline object (the one fit() was called on above) to lwam.pkl in the working directory.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
joblib.dump(pipeline, "lwam.pkl")

['lwam.pkl']