Automate the methods showcased in pressure_curves to produce the train and test sets for machine_learning.

In [1]:
import pandas as pd
from src.StreamPort.device.analyses import PressureCurvesAnalyses
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

%store -r files

ana = PressureCurvesAnalyses(files=files)
processor = PressureCurvesMethodExtractFeaturesNative()
processor.run(ana)

methods = ana.get_methods()
print("Methods: ", methods)

method = "SAA_411_Irino.M"
method_indices = ana.get_method_indices(method)
fig_sel_method = ana.plot_methods(method_indices)

Methods:  ['SAA_411_Doc.M', 'SAA_411_5FU.M', 'SAA_411_Irino.M', 'SAA_411_Gem.M', 'SAA_411_Pac.M']


In [4]:
# Display the figure returned by ana.plot_methods for the selected method.
fig_sel_method.show()

In [2]:
# Remove curves that were already exported as training data in a previous run,
# but only when that export contains the currently selected method.
old_train_metadata = pd.read_csv("dev/train_metadata.csv")
if method in old_train_metadata["method"].values:
    train_indices = old_train_metadata["index"].tolist()
    # Set difference is spelled `-`; the previous `/` is not defined for sets
    # and raised TypeError. Filtering the list (instead of rebuilding it from
    # a set) also keeps the original order of method_indices.
    previously_used = set(train_indices)
    method_indices = [i for i in method_indices if i not in previously_used]


In [None]:
import random
import datetime
# Only curves started strictly before this date are eligible for training.
date_threshold = datetime.datetime.strptime("2021-09-10", "%Y-%m-%d")

train_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    # start_time may be stored as an ISO string; parse it so it can be
    # compared with the datetime threshold.
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    # Keep only curves with a batch position above 4 ...
    if not batch_position > 4:
        continue
    # ... that were started before the cut-off date.
    if not start_time < date_threshold:
        continue
    train_indices.append(i)

# Fetch features and metadata for the selection, then export both as CSV.
train_data = ana.get_features(train_indices)
train_metadata = ana.get_metadata(train_indices)
train_data.to_csv("dev/train_features.csv", index=False)
train_metadata.to_csv("dev/train_metadata.csv", index=False)

print("Number of training curves: ", len(train_indices))

Number of training curves:  36


In [8]:
# Plot the pressure curves of the training selection: no legend, all traces black.
fig_train = ana.plot_pressure_curves(indices=train_indices)
fig_train.update_layout(showlegend=False)
for trace in fig_train.data:
    trace.line.color = "black"

# Write the figure to disk first, then display it inline.
fig_train.write_image(
    "dev/figures/fig_train.png",
    width=1100,
    height=350,
    scale=3,
)
fig_train.show()

In [7]:
# Manually exclude curve 220 from the training selection
# (presumably an outlier spotted in the curve plot - confirm and document why).
# The membership check makes the cell safe to re-run: list.remove raises
# ValueError when the value is no longer (or was never) in the list.
# NOTE(review): this only changes the in-memory list; train_data,
# train_metadata and the CSV files are not recomputed by this cell.
if 220 in train_indices:
    train_indices.remove(220)

In [11]:
# Plot the extracted features of the training selection: no legend, all traces black.
fig_train_features = ana.plot_features(indices=train_indices)
fig_train_features.update_layout(showlegend=False)
for trace in fig_train_features.data:
    trace.line.color = "black"

# Write the figure to disk first, then display it inline.
fig_train_features.write_image(
    "dev/figures/fig_train_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_train_features.show()

In [13]:
# Summary statistics of the training feature table.
# NOTE(review): the recorded output lists two near-identical columns,
# abs_deviation_3.745_4.985 (count 34) and abs_deviation_3.745_4.98 (count 2);
# presumably the same window named inconsistently by the feature extractor -
# worth checking before the table is used for training.
train_data.describe()

Unnamed: 0,area,pressure_max,pressure_min,pressure_mean,pressure_std,pressure_range,runtime,residual_noise,residual_std,abs_deviation_0.01_1.25,abs_deviation_1.255_2.495,abs_deviation_2.5_3.74,abs_deviation_3.745_4.985,abs_deviation_3.745_4.98
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,34.0,2.0
mean,529.305403,121.450278,67.963056,106.424654,16.627925,53.487222,299.683333,39.174586,0.284968,1.490735,1.23847,1.044184,1.088581,0.851818
std,3.325589,0.660132,0.557739,0.673784,0.263373,0.331478,0.069693,24.590361,0.181108,2.152485,0.427937,0.107324,0.1547,0.066128
min,520.7089,120.12,67.19,104.68887,15.189191,52.86,299.4,31.490302,0.228814,0.875431,0.943916,0.859557,0.791981,0.805058
25%,527.01465,121.0375,67.5,105.956078,16.590871,53.335,299.7,33.110119,0.241624,1.049812,1.095361,0.968776,0.96127,0.828438
50%,529.559363,121.47,67.925,106.469815,16.682057,53.535,299.7,35.520868,0.255079,1.103275,1.162821,1.023543,1.111096,0.851818
75%,531.052563,121.7925,68.2625,106.766557,16.731462,53.66,299.7,36.912558,0.267801,1.229184,1.272372,1.10919,1.211573,0.875198
max,535.31225,122.8,69.01,107.62335,16.864907,54.02,299.7,182.008642,1.337216,14.031375,3.572517,1.303683,1.500303,0.898578


In [None]:
import datetime
# Acquisition window (exclusive on both ends) from which test curves are taken.
date_threshold_min = datetime.datetime.strptime("2021-08-18", "%Y-%m-%d")
date_threshold_max = datetime.datetime.strptime("2021-08-22", "%Y-%m-%d")

# NOTE(review): this window lies before the training cut-off used earlier
# (2021-09-10) and accepts every batch_position > 0, so a curve can end up in
# both the training and the test set. The alternative below would keep the two
# sets disjoint - confirm which behaviour is intended.
#test_indices = list(set(method_indices) - set(train_indices))

test_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    # start_time may be stored as an ISO string; parse it so it can be
    # compared with the datetime thresholds.
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if date_threshold_min < start_time < date_threshold_max and batch_position > 0:
        test_indices.append(i)

# `eng` is never defined in this notebook; the analyses object is `ana`,
# the same object already used for the per-curve metadata lookup above.
test_data = ana.get_features(test_indices)
test_metadata = ana.get_metadata(test_indices)
test_data.to_csv("dev/test_features.csv", index=False)
test_metadata.to_csv("dev/test_metadata.csv", index=False)
print("Number of test curves: ", len(test_indices))

# The exported image has no legend; the legend is switched back on for the
# inline display. Traces keep their default colours.
fig_test = ana.plot_pressure_curves(indices=test_indices)
fig_test.update_layout(showlegend=False)
fig_test.write_image("dev/figures/fig_test.png", width=550, height=350, scale=3)
fig_test.update_layout(showlegend=True)
fig_test.show()

In [None]:
# Feature plot of the test selection. `eng` is never defined in this notebook;
# the analyses object is `ana`, as in the training cells.
# NOTE(review): this rebinds `fig_test`, which previously held the
# pressure-curve figure of the test selection.
fig_test = ana.plot_features(indices=test_indices)

# The exported image has no legend; the legend is switched back on for the
# inline display. Traces keep their default colours.
fig_test.update_layout(showlegend=False)
fig_test.write_image("dev/figures/fig_test_features.png", width=1100, height=350, scale=3)
fig_test.update_layout(showlegend=True)
fig_test.show()

In [None]:
# Raw feature plot of the test selection. `eng` is never defined in this
# notebook; the analyses object is `ana`.
# NOTE(review): plot_features_raw is not called on `ana` anywhere else in this
# notebook - confirm the method exists on PressureCurvesAnalyses.
fig_test_features_raw_no_fourier = ana.plot_features_raw(indices=test_indices)

# The exported image has no legend; the legend is switched back on for the
# inline display.
fig_test_features_raw_no_fourier.update_layout(showlegend=False)
fig_test_features_raw_no_fourier.write_image(
    "dev/figures/fig_test_features_raw_no_fourier.png",
    width=1100,
    height=350,
    scale=3,
)
fig_test_features_raw_no_fourier.update_layout(showlegend=True)
fig_test_features_raw_no_fourier.show()

In [None]:
# DISABLED CELL - kept for reference only.
# When enabled, it draws 10 random curves per iteration from the indices not yet
# used and writes dev/test{i}_features.csv / dev/test{i}_metadata.csv plus
# figures for i in 4..7. The prediction loop further down reads exactly those
# CSV files, so they must already exist on disk for that loop to run.
# NOTE(review) before re-enabling:
#   - `eng` and `test2_indices` are not defined anywhere in this notebook;
#   - random.sample is used without a seed, so the sets are not reproducible.
#
# import random
# for i in [4, 5, 6, 7]:
#     rest_indices = list(set(method_indices) - set(train_indices) - set(test_indices) - set(test2_indices) - set(test_2_indices))
#     test_source_indices = random.sample(rest_indices, 10)
#     print("New test curve indices: ", test_source_indices)
#     test3_indices = test_source_indices
#     test3_data = eng.analyses.get_features(test3_indices)
#     test3_metadata = eng.analyses.get_metadata(test3_indices)
#     test3_data.to_csv(f"dev/test{i}_features.csv", index=False)
#     test3_metadata.to_csv(f"dev/test{i}_metadata.csv", index=False)
#     fig_text3_curves=eng.analyses.plot_pressure_curves(indices = test3_indices)
#     fig_text3_curves.update_layout(showlegend=False)
#     # for trace in fig_text_2_curves.data:
#     #     trace.line.color = "black"
#     fig_text3_curves.write_image(f"dev/figures/fig_test{i}_curves.png", width=1000, height= 350, scale = 3)
#     fig_text3_curves.update_layout(showlegend=True)
#     fig_text3_curves.show()
#     fig_text3_features=eng.analyses.plot_features(indices = test3_indices)
#     fig_text3_features.update_layout(showlegend=False)
#     fig_text3_features.write_image(f"dev/figures/fig_test{i}_features.png", width=1100, height= 350, scale = 3)
#     fig_text3_features.update_layout(showlegend=True)
#     fig_text3_features.show()

In [None]:

# Hand-picked curve indices exported as an additional test set.
# NOTE(review): the variables are named test_2 / fig_text_2 but the files are
# written as test3_* - confirm the intended numbering.
test_2_indices = [21, 116, 117, 153, 26, 16, 17, 47]

# `eng` is never defined in this notebook; the analyses object is `ana`.
test_2_data = ana.get_features(test_2_indices)
test_2_metadata = ana.get_metadata(test_2_indices)
test_2_data.to_csv("dev/test3_features.csv", index=False)
test_2_metadata.to_csv("dev/test3_metadata.csv", index=False)

# The exported image has no legend; the legend is switched back on for the
# inline display. Traces keep their default colours.
fig_text_2_curves = ana.plot_pressure_curves(indices=test_2_indices)
fig_text_2_curves.update_layout(showlegend=False)
fig_text_2_curves.write_image("dev/figures/fig_test3_curves.png", width=1000, height=350, scale=3)
fig_text_2_curves.update_layout(showlegend=True)
fig_text_2_curves.show()

In [None]:
# Feature plot of the hand-picked test set, shown as the cell output.
# `eng` is never defined in this notebook; the analyses object is `ana`.
ana.plot_features(indices=test_2_indices)

In [None]:
# Run the prediction workflow on the exported test sets 4..7.
# NOTE(review): dev/test{i}_features.csv and dev/test{i}_metadata.csv are only
# produced by the commented-out random-sampling cell above, so they must already
# exist on disk. No model training step is visible in this notebook either;
# presumably `ana` is expected to hold a trained model before predict - confirm.
for i in [4, 5, 6, 7]:
    # Load one exported test set; this rebinds test_data / test_metadata
    # from the earlier test-selection cell.
    test_data = pd.read_csv(f"dev/test{i}_features.csv")
    test_metadata = pd.read_csv(f"dev/test{i}_metadata.csv")
    # The calls below act on state held by `ana`, so their order matters.
    ana.predict(test_data, test_metadata)
    outliers_test = ana.test_prediction_outliers()
    print(outliers_test)
    # Save and show the score plot for this test set.
    fig_test_scores = ana.plot_scores()
    fig_test_scores.write_image(f"dev/figures/fig_test{i}_scores.png", width=1100, height= 350, scale = 3)
    fig_test_scores.show()
    # presumably merges the prediction into the data held by `ana` - confirm
    ana.add_prediction()
    print(ana)
    # Save and show the data plot after the prediction was added.
    # NOTE(review): same path that the commented-out sampling cell uses for its
    # feature figure, so one overwrites the other.
    fig_test_features = ana.plot_data()
    fig_test_features.write_image(f"dev/figures/fig_test{i}_features.png", width=1100, height= 350, scale = 3)
    fig_test_features.show()