Methods showcased in dev_pressure_curves facilitate fine-tuned and automated data transfer to dev_machine_learning 

In [1]:
#%run to execute related notebooks within the scope of the current one to use their variables without the need for individual imports or file creation

#%run dev_pressure_curves.ipynb dev_machine_learning.ipynb

In [2]:
#Here, we import files to run through a simple workflow. %store -r var to retrieve var, %store -d var to delete var, %store -z to clear all variables 

#%store -r eng iso scl
%store -r files

In [3]:
from src.StreamPort.device.analyses import PressureCurvesAnalyses

ana = PressureCurvesAnalyses(files=files)
print("Number of analyses: ", len(ana.data))
print("Methods: ", ana.get_methods())

Number of analyses:  374
Methods:  ['SAA_411_Doc.M', 'SAA_411_Irino.M', 'SAA_411_Gem.M', 'SAA_411_5FU.M', 'SAA_411_Pac.M']


In [4]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

processor = PressureCurvesMethodExtractFeaturesNative()
ana = processor.run(ana)
#keep track of hyperparameters and compare the effectivity of ML with different combinations
parameters = processor.parameters
print("Workflow parameters: ", parameters)

Workflow parameters:  {'period': 10, 'window_size': 7, 'bins': 4, 'crop': 2}


In [5]:
#run before feature extraction to inspect entries with missing values

# for analysis in ana.data:
#     print(analysis["index"],  analysis["method"], len(analysis["time_var"]), len(analysis["pressure_var"])) #, analysis["sample"])

In [6]:
method = "SAA_411_Pac.M"
method_indices = ana.get_method_indices(method)

In [7]:
import datetime
date_threshold = "2021-08-18"
date_threshold = datetime.datetime.strptime(date_threshold, "%Y-%m-%d")

train_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if batch_position > 4 and start_time < date_threshold:
        train_indices.append(i)

train_data = ana.get_features(train_indices)
train_metadata = ana.get_metadata(train_indices)
print("Number of training curves: ", len(train_indices))

Number of training curves:  20


In [8]:
fig_train=ana.plot_pressure_curves(indices = train_indices)
fig_train.update_layout(showlegend=False)
fig_train.show()
#train_indices.extend([239, 245, 266, 117])# 117 is interesting

In [9]:
train_data.describe()

Unnamed: 0,area,pressure_max,pressure_min,pressure_mean,pressure_std,pressure_range,runtime,residual_noise,residual_std,abs_deviation_0.01_1.0,abs_deviation_1.005_1.995,abs_deviation_2.0_2.99,abs_deviation_2.995_3.985
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,215.491688,71.001,36.0085,54.223068,13.165054,34.9925,239.7,26.1319,0.200198,0.714036,0.695143,0.680833,0.693179
std,0.64409,0.259349,0.107276,0.161479,0.055299,0.236128,0.0,1.22639,0.009362,0.082857,0.073199,0.067735,0.072574
min,214.57035,70.47,35.87,53.9885,13.057493,34.55,239.7,23.547442,0.175556,0.590476,0.565238,0.573333,0.546667
25%,215.095069,70.84,35.965,54.123437,13.132156,34.845,239.7,25.386692,0.195515,0.662143,0.639881,0.637381,0.653095
50%,215.3632,70.995,35.98,54.195187,13.173285,34.98,239.7,25.951841,0.201646,0.685952,0.695238,0.670238,0.700952
75%,215.724181,71.15,36.05,54.280931,13.201411,35.1325,239.7,27.069435,0.206173,0.761726,0.72,0.717619,0.719524
max,217.269175,71.51,36.34,54.666362,13.262941,35.41,239.7,27.838527,0.213708,0.863333,0.834286,0.820952,0.874286


In [10]:
rest_indices = list(set(method_indices) - set(train_indices))
print("Total curves: ", len(method_indices), "\t", method_indices, "\n")
print("Train curves: ",  len(train_indices), "\t", train_indices, "\n")
print("Remaining: ", len(rest_indices), "\t", rest_indices, "\n")

Total curves:  93 	 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279] 

Train curves:  20 	 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60] 

Remaining:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247] 



In [11]:
test_source_indices = [117]
for i in rest_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    if  batch_position > 0:
        test_source_indices.append(i)
print("Selected test superset: ", len(test_source_indices), "\t", test_source_indices)

Selected test superset:  74 	 [117, 266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]


In [None]:
import random
for i in [3, 4, 5]:
    test_indices = random.sample(test_source_indices, random.randint(5, 8))
    print("Test set ", i ,  " indices: ", test_indices)

    test_data = ana.get_features(test_indices)
    test_metadata = ana.get_metadata(test_indices)
    test_data.to_csv(f"dev/test{i}_features.csv", index=False)
    test_metadata.to_csv(f"dev/test{i}_metadata.csv", index=False)

    fig_test_curves=ana.plot_pressure_curves(indices = test_indices)
    fig_test_curves.update_layout(showlegend=False)
    fig_test_curves.write_image(f"dev/figures/fig_test{i}_curves.png", width=1000, height= 350, scale = 3)
    #fig_test_curves.update_layout(showlegend=True)
    #fig_test_curves.show()

    fig_test_features=ana.plot_features(indices = test_indices)
    fig_test_features.update_layout(showlegend=False)
    fig_test_features.write_image(f"dev/figures/fig_test{i}_features.png", width=1100, height= 350, scale = 3)
    fig_test_features.update_layout(showlegend=True)
    fig_test_features.show()
    
    fig_test_features_raw = ana.plot_features_raw(indices = test_indices)
    fig_test_features_raw.update_layout(showlegend=False)
    fig_test_features_raw.write_image(f"dev/figures/fig_test{i}_features_raw.png", width=1100, height= 350, scale = 3)
    fig_test_features_raw.update_layout(showlegend=True)
    fig_test_features_raw.show()

Test set  3  indices:  [160, 122, 242, 157, 166, 49, 244, 235]


Test set  4  indices:  [162, 163, 17, 167, 267]


Test set  5  indices:  [150, 128, 277, 17, 123, 132, 158, 270]


In [13]:
import pandas as pd
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

ana = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(ana)


MachineLearningAnalyses 
  variables: 20 rows, 13 columns
  metadata: 20 rows, 14 columns



In [14]:
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn

scaler = MachineLearningScaleFeaturesScalerSklearn()
scaling_parameters = scaler.parameters
print(scaling_parameters)
parameters.update(scaling_parameters)

ana = scaler.run(ana)
fig_train_features = ana.plot_data()
fig_train_features.show()

{'type': 'MinMaxScaler'}



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [15]:

from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn

iforest = MachineLearningMethodIsolationForestSklearn()
iforest_parameters = iforest.parameters
print(iforest_parameters)
parameters.update(iforest_parameters)

ana = iforest.run(ana)
fig_train_scores = ana.plot_scores()
fig_train_scores.show()

{'n_estimators': 100, 'max_samples': 'auto', 'contamination': 'auto', 'max_features': 1, 'bootstrap': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [16]:
def set_class_label(test_metadata, outliers_test):
    
    outliers_test["outlier"] = outliers_test["outlier"].map({True: "outlier", False: "normal"})
    outliers_test["class"] = outliers_test["outlier"]
    outliers_test.drop(columns=["outlier"], inplace = True)

    classified_test_metadata = pd.concat([test_metadata["index"], outliers_test], axis=1)

    return classified_test_metadata

In [None]:
import os

parameters_df = []
for i in [3, 4, 5]:
    test_data = pd.read_csv(f"dev/test{i}_features.csv")
    test_metadata = pd.read_csv(f"dev/test{i}_metadata.csv")
    ana.predict(test_data, test_metadata)
    outliers_test = ana.test_prediction_outliers()
    
    classified_test_metadata = set_class_label(test_metadata, outliers_test)
    classified_test_metadata.to_csv(f"dev/test{i}_classified_samples.csv", index=False)
    print("Test set ", i, ": \n", test_metadata["index"].tolist())
    
    test_batch_parameters = parameters.copy()
    train_set = len(ana.get_training_scores())
    threshold = classified_test_metadata["threshold"][1]
    num_outliers = sum(classified_test_metadata["class"] == "outlier")
    percent_outliers = (sum(classified_test_metadata["class"] == "outlier")/len(classified_test_metadata["class"]))*100

    test_batch_parameters.update(
        {
        "test_batch" : i,
        "train_set" : train_set, 
        "threshold" : threshold,
        "outliers" : num_outliers,
        "outliers %" : percent_outliers
        }
    )
    parameters_df.append(test_batch_parameters)

    fig_test_scores = ana.plot_scores()
    fig_test_scores.write_image(f"dev/figures/fig_test{i}_scores.png", width=1100, height= 350, scale = 3)
    fig_test_scores.show()

    #optionally add seen normal curves to train set
    ana.add_prediction()
    fig_test_features = ana.plot_data()
    fig_test_features.write_image(f"dev/figures/fig_test{i}_features.png", width=1100, height= 350, scale = 3)
    #fig_test_features.show()

parameters_df = pd.DataFrame(parameters_df)
if os.path.exists("dev/test_record.csv"):
    old_records = pd.read_csv("dev/test_record.csv")
    parameters_df = pd.concat([old_records, parameters_df])
    parameters_df.drop_duplicates(inplace=True)
    
parameters_df.to_csv("dev/test_record.csv", index = False)
print(parameters_df)

Test set  3 : 
 [160, 122, 242, 157, 166, 49, 244, 235]


Test set  4 : 
 [162, 163, 17, 167, 267]


Test set  5 : 
 [150, 128, 277, 17, 123, 132, 158, 270]


   period  window_size  bins  crop          type  n_estimators max_samples  \
0      10            7     4     2  MinMaxScaler           100        auto   
1      10            7     4     2  MinMaxScaler           100        auto   
2      10            7     4     2  MinMaxScaler           100        auto   

  contamination  max_features  bootstrap n_jobs random_state  verbose  \
0          auto             1      False   None         None        0   
1          auto             1      False   None         None        0   
2          auto             1      False   None         None        0   

   warm_start  test_batch  train_set  threshold  outliers  outliers %  
0       False           3         20  -0.104285         3        37.5  
1       False           4         25  -0.128897         1        20.0  
2       False           5         29  -0.112722         1        12.5  


In [25]:
import plotly.graph_objects as go
test_record = pd.read_csv("dev/test_record.csv") if os.path.exists("dev/test_record.csv") else None

fig = go.Figure()

if test_record is not None:
    
    fig.add_trace(
        go.Scatter(
            x=test_record["train_set"],
            y=test_record["threshold"],
            mode="lines+markers",
            name="Threshold",
            yaxis="y1",  
            hovertemplate=[
                "<br>Threshold: " + str(test_record["threshold"][i]) +
                "<br>Outliers in test set: " + str(test_record["outliers"][i]) 
                for i in range(len(test_record))
            ],
            line=dict(color="red", width=2, dash='dash'),
            marker=dict(size=8, symbol="circle")
        )
    )

    fig.add_trace(
        go.Scatter(
            x=test_record["train_set"],
            y=test_record["outliers"],
            mode="markers",
            name="Outliers",
            yaxis="y2", 
            hovertemplate=[
                "<br>Train Set: " + str(test_record["train_set"][i]) +
                "<br>Outliers %: " + str(test_record["outliers %"][i]) + "%"
                for i in range(len(test_record))
            ],
            line=dict(color="blue", width=2),
            marker=dict(size=8, symbol="diamond")
        )
    )

fig.update_layout(
    title="Threshold and Outliers vs. Training Set Size",
    xaxis=dict(title="Number of Training Curves"),
    yaxis=dict(  
        title=dict(text="Threshold", font=dict(color="red")),
        tickfont=dict(color="red")
    ),
    yaxis2=dict(  
        title=dict(text="Number of Outliers", font=dict(color="blue")),
        tickfont=dict(color="blue"),
        overlaying="y",
        side="right"
    ),
    template="simple_white",
    legend=dict(
        x=0.5,
        y=1.1,
        xanchor="center",
        yanchor="top",
        orientation="h",  # horizontal legend (optional)
        bgcolor="rgba(255,255,255,0.5)",  # optional background
        borderwidth=1  # optional border
    )
)
fig.write_image("dev/figures/fig_threshold_variation.png", width=1100, height= 350, scale = 3)
fig.show()
