Methods showcased in dev_pressure_curves facilitate fine-tuned and automated data transfer to dev_machine_learning 

In [1]:
#execute related notebooks within the scope of the current one to use their variables without the need for individual imports or file creation
#%run dev_pressure_curves.ipynb dev_machine_learning.ipynb

In [2]:
#Here, we import files to run through a simple workflow. %store -[OPTION] var to -r : retrieve, -d : delete, -z : clear all  

#%store -r eng iso scl
%store -r files

In [3]:
from src.StreamPort.device.analyses import PressureCurvesAnalyses

ana = PressureCurvesAnalyses(files=files)
print("Number of analyses: ", len(ana.data))
print("Methods: ", ana.get_methods())

Number of analyses:  374
Methods:  ['SAA_411_Irino.M', 'SAA_411_Doc.M', 'SAA_411_5FU.M', 'SAA_411_Pac.M', 'SAA_411_Gem.M']


In [4]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

processor = PressureCurvesMethodExtractFeaturesNative()
ana = processor.run(ana)
#keep track of hyperparameters and compare the effectivity of ML with different combinations
parameters = processor.parameters
print("Feature extraction parameters: ", parameters)

Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.08499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/methods/ln. 161: method:  SAA_411_Gem.M , end time:  2.09499999999999
Diag device/meth

In [5]:
method = "SAA_411_Pac.M"
method_indices = ana.get_method_indices(method)
#%store ana

In [6]:
import datetime
date_threshold = "2021-08-18"
date_threshold = datetime.datetime.strptime(date_threshold, "%Y-%m-%d")

train_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if batch_position > 4 and start_time < date_threshold:
        train_indices.append(i)

train_data = ana.get_features(train_indices)
train_metadata = ana.get_metadata(train_indices)
print("Number of training curves: ", len(train_indices))

Number of training curves:  20


In [7]:
fig_train=ana.plot_pressure_curves(indices = train_indices)
fig_train.update_layout(showlegend=False)
fig_train.show()
#train_indices.extend([239, 245, 266, 117])# 117 is interesting

In [8]:
train_data.describe()

Unnamed: 0,area,pressure_max,pressure_min,pressure_mean,pressure_std,pressure_range,runtime,residual_noise,residual_std,roc_0.0_0.995,abs_deviation_0.0_0.995,roc_1.0_1.995,abs_deviation_1.0_1.995,roc_2.0_2.995,abs_deviation_2.0_2.995,roc_3.0_3.975,abs_deviation_3.0_3.975
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,215.491688,71.001,36.0085,54.214604,13.196363,34.9925,239.7,26.1319,0.200198,0.2145,0.714036,-0.119,0.696429,-0.3355,0.680667,0.0295,0.690631
std,0.64409,0.259349,0.107276,0.161953,0.054923,0.236128,0.0,1.22639,0.009362,0.029285,0.082857,0.061976,0.074012,0.027621,0.067338,0.185939,0.074876
min,214.57035,70.47,35.87,53.9825,13.089369,34.55,239.7,23.547442,0.175556,0.15,0.590476,-0.26,0.565238,-0.36,0.573333,-0.06,0.546667
25%,215.095069,70.84,35.965,54.115173,13.163849,34.845,239.7,25.386692,0.195515,0.2075,0.662143,-0.11,0.639881,-0.3525,0.637381,-0.06,0.652738
50%,215.3632,70.995,35.98,54.182456,13.204293,34.98,239.7,25.951841,0.201646,0.22,0.685952,-0.1,0.695238,-0.34,0.672143,-0.045,0.700952
75%,215.724181,71.15,36.05,54.27342,13.23046,35.1325,239.7,27.069435,0.206173,0.24,0.761726,-0.0875,0.726429,-0.33,0.71619,-0.03,0.719524
max,217.269175,71.51,36.34,54.661332,13.295331,35.41,239.7,27.838527,0.213708,0.25,0.863333,-0.07,0.834286,-0.27,0.820952,0.47,0.874286


In [9]:
rest_indices = list(set(method_indices) - set(train_indices))
print("Total curves: ", len(method_indices), "\t", method_indices, "\n")
print("Train curves: ",  len(train_indices), "\t", train_indices, "\n")
print("Remaining: ", len(rest_indices), "\t", rest_indices, "\n")

Total curves:  93 	 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279] 

Train curves:  20 	 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60] 

Remaining:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247] 



In [10]:
test_source_indices = [ ]
for i in rest_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    if  batch_position > 0:
        test_source_indices.append(i)
print("Selected test superset: ", len(test_source_indices), "\t", test_source_indices)

Selected test superset:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]


In [11]:
test_dates = []
import datetime
import random
for i in [1, 2, 3]:
    now = datetime.datetime.now().isoformat()
    now = now.replace(":", "-").replace(".", "-").replace("T", " ")
    test_dates.append(now)

    test_indices = random.sample(test_source_indices, random.randint(5, 8))
    print("Test set ", now ,  " indices: ", test_indices)

    test_data = ana.get_features(test_indices)
    test_metadata = ana.get_metadata(test_indices)
    test_data.to_csv(f"dev/test_{now}_features.csv", index=False)
    test_metadata.to_csv(f"dev/test_{now}_metadata.csv", index=False)

    fig_test_curves=ana.plot_pressure_curves(indices = test_indices)
    fig_test_curves.update_layout(showlegend=False)
    fig_test_curves.write_image(f"dev/figures/fig_test_{now}_curves.png", width=1000, height= 350, scale = 3)
    #fig_test_curves.update_layout(showlegend=True)
    #fig_test_curves.show()

    fig_test_features=ana.plot_features(indices = test_indices)
    fig_test_features.update_layout(showlegend=False)
    fig_test_features.write_image(f"dev/figures/fig_test_{now}_features.png", width=1100, height= 350, scale = 3)
    fig_test_features.update_layout(title=f"Test set {now}", showlegend=True)
    fig_test_features.show()
    
    fig_test_features_raw = ana.plot_features_raw(indices = test_indices)
    fig_test_features_raw.update_layout(showlegend=False)
    fig_test_features_raw.write_image(f"dev/figures/fig_test_{now}_features_raw.png", width=1100, height= 350, scale = 3)
    fig_test_features_raw.update_layout(title=f"Test set {now}", showlegend=True)
    fig_test_features_raw.show()


test_set = test_data
test_set_meta = test_metadata
#%store test_set test_set_meta

Test set  2025-07-29 16-40-11-948837  indices:  [125, 151, 165, 267, 245, 237, 269]


Test set  2025-07-29 16-40-17-449626  indices:  [129, 121, 123, 117, 162]


Test set  2025-07-29 16-40-22-832458  indices:  [168, 49, 132, 129, 236]


In [12]:
import pandas as pd
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

ml_ana = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(ml_ana)


MachineLearningAnalyses 
  variables: 20 rows, 17 columns
  metadata: 20 rows, 14 columns



In [13]:
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn

scaler = MachineLearningScaleFeaturesScalerSklearn()
scaling_parameters = scaler.parameters
print("Scaling parameters: ", scaling_parameters)
parameters.update(scaling_parameters)

ml_ana = scaler.run(ml_ana)
fig_train_features = ml_ana.plot_data()
fig_train_features.update_layout(title="Train set features")
fig_train_features.show()

Scaling parameters:  {'type': 'MinMaxScaler'}



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [14]:

from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn

iforest = MachineLearningMethodIsolationForestSklearn()
iforest_parameters = iforest.parameters
print("Isolation Forest parameters: ", iforest_parameters)
parameters.update(iforest_parameters)

ml_ana = iforest.run(ml_ana)
ml_ana.train()
fig_train_scores = ml_ana.plot_scores()
fig_train_scores.show()

Isolation Forest parameters:  {'n_estimators': 100, 'max_samples': 'auto', 'contamination': 'auto', 'max_features': 1, 'bootstrap': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [15]:
import os

parameters_df = []
for i in test_dates:
    test_data = pd.read_csv(f"dev/test_{i}_features.csv")
    test_metadata = pd.read_csv(f"dev/test_{i}_metadata.csv")
    ml_ana.predict(test_data, test_metadata)
    outliers_test = ml_ana.test_prediction_outliers()
    
    classified_test_metadata = outliers_test
    classified_test_metadata["date"] = i
    classified_test_metadata.to_csv(f"dev/test_{i}_classified_samples.csv", index=False)
    print("Test set ", i, ": \n", test_metadata["index"].tolist()) 
    print("\n")
    
    test_batch_parameters = parameters.copy()
    test_set = len(classified_test_metadata)
    train_set = len(ml_ana.get_training_scores())
    threshold = classified_test_metadata["threshold"][1]
    num_outliers = sum(classified_test_metadata["class"] == "outlier")
    percent_outliers = (sum(classified_test_metadata["class"] == "outlier")/len(classified_test_metadata["class"]))*100

    fig_test_scores = ml_ana.plot_scores()
    fig_test_scores.write_image(f"dev/figures/fig_test_{i}_scores.png", width=1100, height= 350, scale = 3)
    fig_test_scores.update_layout(title=f"Test set {i}")
    fig_test_scores.show()

    fig_test_features = ml_ana.plot_data()
    fig_test_features.write_image(f"dev/figures/fig_test_{i}_features.png", width=1100, height= 350, scale = 3)
    #fig_test_features.show()

    test_batch_parameters.update(
        {
        "date" : i,
        "train_set" : train_set, 
        "test_set" : test_set,
        "threshold" : threshold,
        "outliers" : num_outliers,
        "outliers_percent" : percent_outliers
        }
    )
    parameters_df.append(test_batch_parameters)

    #optionally add seen normal curves to train set
    #ml_ana.add_prediction()

parameters_df = pd.DataFrame(parameters_df)
if os.path.exists("dev/test_record.csv"):
    old_records = pd.read_csv("dev/test_record.csv")
    parameters_df = pd.concat([old_records, parameters_df])
    parameters_df.drop_duplicates(subset="date", keep="last", inplace=True)
    
parameters_df.to_csv("dev/test_record.csv", index = False)
print("Workflow parameters: \n", parameters_df)

Test set  2025-07-29 16-40-11-948837 : 
 [125, 151, 165, 267, 245, 237, 269]




Test set  2025-07-29 16-40-17-449626 : 
 [129, 121, 123, 117, 162]




Test set  2025-07-29 16-40-22-832458 : 
 [168, 49, 132, 129, 236]




Workflow parameters: 
    period  window_size  bins  crop          type  n_estimators max_samples  \
0      10            7     4     2  MinMaxScaler           100        auto   
1      10            7     4     2  MinMaxScaler           100        auto   
2      10            7     4     2  MinMaxScaler           100        auto   
3      10            7     4     2  MinMaxScaler           100        auto   
4      10            7     4     2  MinMaxScaler           100        auto   
5      10            7     4     2  MinMaxScaler           100        auto   
0      10            7     4     2  MinMaxScaler           100        auto   
1      10            7     4     2  MinMaxScaler           100        auto   
2      10            7     4     2  MinMaxScaler           100        auto   

  contamination  max_features  bootstrap n_jobs random_state  verbose  \
0          auto             1      False    NaN          NaN        0   
1          auto             1      False    NaN   

In [16]:
import plotly.graph_objects as go

test_record = pd.read_csv("dev/test_record.csv") if os.path.exists("dev/test_record.csv") else None
test_record = test_record.sort_values("date")

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:
        result_logs.append(f"dev/test_{date}_classified_samples.csv") if os.path.exists(f"dev/test_{date}_classified_samples.csv") else print(f"No records for {date}")
    
    result_logs = [pd.read_csv(log) for log in result_logs]

    for log in result_logs:
        log["date"] = log["date"].astype(str).str[:19].str.replace("T", " ")

    result_logs = [log.to_string(index=False).replace("\n", "<br>") for log in result_logs]

    #test_record["date"] = pd.to_datetime(test_record["date"], format="%Y-%m-%dT%H-%M-%S-%f").dt.floor("s")
    #width = 600 
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=test_record["date"],
            y=test_record["threshold"],
            mode="lines+markers",
            name="Threshold",
            yaxis="y1",  
            hovertemplate=[
                "<br>Threshold: " + str(test_record["threshold"][i])  
                for i in range(len(test_record))
            ],
            line=dict(color="red", width=2, dash='dash'),
            marker=dict(size=8, symbol="circle")
        )
    )

    fig.add_trace(
        go.Bar(
            x=test_record["date"],
            y=test_record["outliers"],
            name="Outliers",
            yaxis="y2",
            width = 0.15, #width
            marker_color="blue",
            hovertext=result_logs,
            hoverinfo="text"
        )
    )

    # fig.add_trace(
    #     go.Scatter(
    #         x=test_record["date"],
    #         y=test_record["outliers"],
    #         mode="lines+markers",
    #         name="Outliers",
    #         yaxis="y2",
    #         line=dict(color="blue", width=2),
    #         marker=dict(size=8, symbol="diamond", color="blue"),
    #         hovertemplate=[
    #             "<br>Train Set: " + str(test_record["train_set"][i]) +
    #             "<br>Test Set: " + str(test_record["test_set"][i]) +
    #             "<br>Outliers: " + str(test_record["outliers"][i])
    #             for i in range(len(test_record))
    #         ]
    #     )
    # )

    fig.add_trace(
        go.Scatter(
            x=test_record["date"],
            y=test_record["train_set"], 
            mode="lines+markers",
            name="Training curves",
            yaxis="y2",
            hovertemplate=[
                "<br>Training samples: " + str(test_record["train_set"][i]) +
                "<br>Test samples: " + str(test_record["test_set"][i]) +
                "<br>Outliers: " + str(test_record["outliers"][i]) +
                "<br>Outliers %: " + str(test_record["outliers_percent"][i]) 
                for i in range(len(test_record))
            ],
            line=dict(color="green", width=2, dash='solid'),
            marker=dict(size=8, symbol="star")
        )
    )

fig.update_layout(
    title="Detection accuracy over Test Runs and Training Set Size",
    xaxis=dict(
        tickvals=test_record["date"], 
        ticktext=[d[:19].replace("T", " ") for d in test_record["date"]], 
        tickangle=45,
        title="Test Dates"
    ),
    yaxis=dict(  
        title=dict(text="Threshold", font=dict(color="red")),
        tickfont=dict(color="red")
    ),
    yaxis2=dict(#train set and outlier realistic in scale. Uncomment y3 to adjust 
        title=dict(text="Outliers", font=dict(color="blue")),
        tickfont=dict(color="blue"),
        overlaying="y",
        side="right"
    ),
    bargap = 1, 
    template="simple_white",
    legend=dict(
        x=0.5,
        y=1.1,
        xanchor="center",
        yanchor="top",
        orientation="h",  
        bgcolor="rgba(255,255,255,0.5)", 
        borderwidth=1  
    )
)
fig.write_image("dev/figures/fig_threshold_variation.png", width=1100, height= 350, scale= 3)
fig.show()
