Automate methods showcased in pressure_curves to produce train and test sets for machine_learning

In [1]:
import pandas as pd
from src.StreamPort.device.analyses import PressureCurvesAnalyses
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

%store -r files

ana = PressureCurvesAnalyses(files=files)
processor = PressureCurvesMethodExtractFeaturesNative()
processor.run(ana)

methods = ana.get_methods()
print("Methods: ", methods)

method = "SAA_411_Pac.M"
method_indices = ana.get_method_indices(method)
fig_sel_method = ana.plot_methods(method_indices)

Methods:  ['SAA_411_Irino.M', 'SAA_411_Pac.M', 'SAA_411_Gem.M', 'SAA_411_Doc.M', 'SAA_411_5FU.M']


In [2]:
fig_sel_method.show()

In [3]:
# old_train_metadata = pd.read_csv("dev/train_metadata.csv")
# if method not in old_train_metadata["method"].values:
#     pass
# else:
#     train_indices = old_train_metadata["index"].tolist()
#     method_indices = list(set(method_indices) - set(train_indices))


In [None]:
import random
import datetime
date_threshold = "2021-08-18"
date_threshold = datetime.datetime.strptime(date_threshold, "%Y-%m-%d")

train_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if batch_position > 4 and start_time < date_threshold:
        train_indices.append(i)

train_data = ana.get_features(train_indices)
train_metadata = ana.get_metadata(train_indices)
train_data.to_csv("dev/train_features.csv", index=False)
train_metadata.to_csv("dev/train_metadata.csv", index=False)
print("Number of training curves: ", len(train_indices))

Number of training curves:  20


In [5]:
fig_train=ana.plot_pressure_curves(indices = train_indices)
fig_train.update_layout(showlegend=False)
for trace in fig_train.data:
    trace.line.color = "black"
fig_train.write_image("dev/figures/fig_train.png", width=1100, height= 350, scale = 3)
fig_train.show()

In [7]:
fig_train_features=ana.plot_features(indices = train_indices)
fig_train_features.update_layout(showlegend=False)
for trace in fig_train_features.data:
    trace.line.color = "black"
fig_train_features.write_image("dev/figures/fig_train_features.png", width=1100, height= 350, scale = 3)
fig_train_features.show()

In [8]:
train_data.describe()

Unnamed: 0,area,pressure_max,pressure_min,pressure_mean,pressure_std,pressure_range,runtime,residual_noise,residual_std,abs_deviation_0.01_1.0,abs_deviation_1.005_1.995,abs_deviation_2.0_2.99,abs_deviation_2.995_end
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,215.491688,71.001,36.0085,54.223068,13.165054,34.9925,239.7,26.1319,0.200198,0.822362,0.758612,0.790692,0.827976
std,0.64409,0.259349,0.107276,0.161479,0.055299,0.236128,0.0,1.22639,0.009362,0.080079,0.078631,0.071065,0.090382
min,214.57035,70.47,35.87,53.9885,13.057493,34.55,239.7,23.547442,0.175556,0.655828,0.628415,0.679114,0.690629
25%,215.095069,70.84,35.965,54.123437,13.132156,34.845,239.7,25.386692,0.195515,0.778642,0.706119,0.732179,0.777995
50%,215.3632,70.995,35.98,54.195187,13.173285,34.98,239.7,25.951841,0.201646,0.808974,0.761853,0.786713,0.820233
75%,215.724181,71.15,36.05,54.280931,13.201411,35.1325,239.7,27.069435,0.206173,0.855542,0.801672,0.829586,0.857302
max,217.269175,71.51,36.34,54.666362,13.262941,35.41,239.7,27.838527,0.213708,0.988578,0.963427,0.905851,1.100909


In [None]:
rest_indices = list(set(method_indices) - set(train_indices))
print("Total curves: ", len(method_indices), "\t", method_indices, "\n")
print("Train curves: ",  len(train_indices), "\t", train_indices, "\n")
print("Test curves: ", len(rest_indices), "\t", rest_indices, "\n")

Total curves:  93 	 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279] 

Train curves:  20 	 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60] 

Test curves:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247] 



In [None]:
# date_threshold_min = "2021-08-18"
# date_threshold_min= datetime.datetime.strptime(date_threshold_min, "%Y-%m-%d")

# date_threshold_max = "2021-08-22"
# date_threshold_max= datetime.datetime.strptime(date_threshold_max, "%Y-%m-%d")

# test_indices = [220]
# for i in method_indices:
#     meta = ana.get_metadata(i)
#     batch_position = meta["batch_position"].item()
#     start_time = meta["start_time"].item()
#     if isinstance(start_time, str):
#         start_time = datetime.datetime.fromisoformat(start_time)
#     if start_time < date_threshold_max and start_time > date_threshold_min and batch_position > 0:
#         test_indices.append(i)

# test_data = ana.get_features(test_indices)
# test_metadata = ana.get_metadata(test_indices)
# test_data.to_csv("dev/test_features.csv", index=False)
# test_metadata.to_csv("dev/test_metadata.csv", index=False)
# print("Number of test curves: ", len(test_indices))

# fig_test=ana.plot_pressure_curves(indices = test_indices)
# fig_test.update_layout(showlegend=False)
# #for trace in fig_outliers.data:
# #    trace.line.color = "black"
# fig_test.write_image("dev/figures/fig_test.png", width=550, height= 350, scale = 3)
# fig_test.update_layout(showlegend=True)
# fig_test.show()

In [None]:
import random
for i in [3, 4, 5, 6, 7]:
    test_source_indices = random.sample(rest_indices, 10)
    print("Test set ", i ,  " indices: ", test_source_indices)    
    test_indices = test_source_indices
    test_data = ana.get_features(test_indices)
    test_metadata = ana.get_metadata(test_indices)
    test_data.to_csv(f"dev/test{i}_features.csv", index=False)
    test_metadata.to_csv(f"dev/test{i}_metadata.csv", index=False)
    fig_text_curves=ana.plot_pressure_curves(indices = test_indices)
    fig_text_curves.update_layout(showlegend=False)
    # for trace in fig_text_2_curves.data:
    #     trace.line.color = "black"
    fig_text_curves.write_image(f"dev/figures/fig_test{i}_curves.png", width=1000, height= 350, scale = 3)
    fig_text_curves.update_layout(showlegend=True)
    fig_text_curves.show()
    fig_text_features=ana.plot_features(indices = test_indices)
    fig_text_features.update_layout(showlegend=False)
    fig_text_features.write_image(f"dev/figures/fig_test{i}_features.png", width=1100, height= 350, scale = 3)
    fig_text_features.update_layout(showlegend=True)
    fig_text_features.show()
    fig_test_features_raw = ana.plot_features_raw(indices = test_indices)
    fig_test_features_raw.update_layout(showlegend=False)
    fig_test_features_raw.write_image("dev/figures/fig_test_features_raw_no_fourier.png", width=1100, height= 350, scale = 3)
    fig_test_features_raw.update_layout(showlegend=True)
    fig_test_features_raw.show()

Test curve indices  3 :  [234, 118, 153, 240, 276, 160, 149, 278, 125, 158]


Test curve indices  4 :  [152, 156, 276, 116, 120, 165, 267, 157, 244, 242]


Test curve indices  5 :  [50, 276, 247, 237, 242, 277, 129, 153, 116, 270]


Test curve indices  6 :  [49, 269, 278, 268, 277, 118, 238, 155, 244, 128]


Test curve indices  7 :  [270, 165, 236, 121, 50, 16, 234, 168, 273, 17]
