In [None]:
# Optional: run the related notebooks inside this notebook's namespace so their variables can be used here without separate imports or intermediate files.
#%run dev_pressure_curves.ipynb dev_machine_learning.ipynb

In [None]:
# Restore the stored pressure-curves object `pc`, which the rest of this workflow operates on.
# %store options: -r retrieves a variable, -d deletes a variable, -z clears all stored variables.
# NOTE(review): `pc` must have been saved with `%store pc` in an earlier session — presumably by dev_pressure_curves.ipynb; confirm.
%store -r pc 

In [None]:
# Show the figure returned by pc.plot_methods().
methods_plot = pc.plot_methods()
methods_plot.show()

# Print the methods reported by pc; one of them is used below to select curves.
methods = pc.get_methods()
print("Methods: ", methods)

In [None]:
# Indices of the curves belonging to method 'SAA_411_Pac.M'; `indices` is reused later to build the training set.
indices = pc.get_method_indices('SAA_411_Pac.M')
# Plot those curves with pc.plot_batches (presumably grouped by batch — confirm).
pac_plots = pc.plot_batches(indices)
pac_plots.show()

In [None]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

# Configure the native feature-extraction method.
# NOTE(review): period / bins / window_size are hard-coded; their units are not visible in this notebook.
processor = PressureCurvesMethodExtractFeaturesNative(period=10, bins=6, window_size=7)
# `pc` is rebound to the object returned by run().
# NOTE(review): re-running this cell passes the already-processed `pc` back in — confirm run() is safe to apply twice.
pc = processor.run(pc)

In [None]:
# Keep only the batches whose name contains "Pac".
batches = []
for batch_name in pc.get_batches():
    if "Pac" in batch_name:
        batches.append(batch_name)
print("Batches: ", batches)

# Plain string sort of the batch names.
# NOTE(review): this orders by date only if the names differ solely in their trailing date — confirm.
batches = sorted(batches)
print("Sorted by date: ", batches)

# Everything from the third sorted batch onwards is treated as test material.
tests = batches[2:]

In [None]:
# At this point `tests` (batches[2:]) only supplies the date of the first test batch, which bounds the training set;
# it is replaced further down by a dict of individual test curves collected in order of batch date.
print("test batch dates: ", tests)

In [None]:
# create training set
# The first test batch defines the cut-off: only curves started before it may be used for training.
first_test_group = tests[0]
print("First test group date: ", first_test_group)

# The last two space-separated tokens of the batch name are taken as its date and time;
# ":" is replaced by "-" so the value compares against the normalised start times.
date_tokens = first_test_group.split(" ")[-2:]
first_test = " ".join(date_tokens).replace(":", "-")

date_threshold_min = first_test

In [None]:
# Training curves: those of the selected method whose batch_position is greater
# than 5 and whose start time sorts before the first test batch.
train_indices = []
for curve_index in indices:
    curve_meta = pc.get_metadata(curve_index)
    position_in_batch = curve_meta["batch_position"].item()
    started_at = curve_meta["start_time"].item()

    # Normalise the start time the same way as date_threshold_min (":" -> "-")
    # so that the plain string comparison below is meaningful.
    started_at = (
        started_at.replace(":", "-")
        if isinstance(started_at, str)
        else started_at.strftime("%Y-%m-%d %H-%M-%S")
    )

    if not (position_in_batch > 5 and started_at < date_threshold_min):
        continue
    train_indices.append(curve_index)

train_data = pc.get_features(train_indices)
train_metadata = pc.get_metadata(train_indices)

# Persist the training features and metadata; test data files will include batch date
for frame, csv_path in (
    (train_data, "dev/workflow_train_features.csv"),
    (train_metadata, "dev/workflow_train_metadata.csv"),
):
    frame.to_csv(csv_path, index=False)

train_size = len(train_indices)
print("Number of training curves: ", train_size)

In [None]:
# Plot the pressure curves of the training set with the legend hidden.
fig_train=pc.plot_pressure_curves(train_indices)
fig_train.update_layout(showlegend=False)
fig_train.show()
# Kept for reference: manually adding specific curve indices to the training set.
#train_indices.extend([239, 245, 266, 117])# 117 is interesting

In [None]:
# Plot the features of the training curves as returned by pc.plot_features.
fig_train_features = pc.plot_features(train_indices)
fig_train_features.show()

In [None]:
# Summary statistics of the training features.
# Idea: these could be fed to the model to guide feature selection for splits based on importance,
# e.g. by increasing the weight of less-impactful statistical features such as area.
train_data.describe()

In [None]:
import pandas as pd
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

# Wrap the training features and their metadata in the analyses object that is
# scaled, trained and used for prediction in the cells below.
ml_ana = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(ml_ana)

In [None]:
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn

# Apply the scaling method configured with "StandardScaler"; ml_ana is rebound to the object returned by run().
scaler = MachineLearningScaleFeaturesScalerSklearn(scaler_type = "StandardScaler")
ml_ana = scaler.run(ml_ana)

# Plot the data held by ml_ana after the scaling step.
# NOTE(review): this rebinds fig_train_features, which an earlier cell also assigns.
fig_train_features = ml_ana.plot_data()
fig_train_features.update_layout(title="Train set features")
fig_train_features.show()

In [None]:

from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn

# Isolation-forest method created with its default arguments.
# NOTE(review): no random seed is passed — confirm whether the method fixes one, otherwise scores may differ between runs.
iforest = MachineLearningMethodIsolationForestSklearn()

# ml_ana is rebound to the object returned by run().
ml_ana = iforest.run(ml_ana)


# Train, then plot the scores reported by ml_ana.
ml_ana.train()
fig_train_scores = ml_ana.plot_scores()
fig_train_scores.show()

In [None]:
# new tests sorted by date
# Map "<batch date>_<position in batch>" -> curve index for every curve in a batch
# dated on or after the first test batch. Batches are visited in their sorted order,
# so the dict keeps that order.
tests = {}
for batch in batches:
    # The last two space-separated tokens of the batch name are its date and time.
    batch_date = " ".join(batch.split(" ")[-2:])

    # date_threshold_min was built with ":" replaced by "-"; normalise the batch
    # date the same way so the string comparison is consistent. The threshold is
    # deliberately left unchanged: overwriting it here would alter the result of
    # re-running this cell or the training-set cell.
    if batch_date.replace(":", "-") < date_threshold_min:
        continue

    test_indices = pc.get_batch_indices(batch)
    for position, curve_index in enumerate(test_indices, start=1):
        tests[f"{batch_date}_{position}"] = curve_index

In [None]:
# Keys of `tests` in insertion order, one per test curve.
dates = list(tests)
print(dates)

In [None]:
#for date in dates[:1]:
# Evaluate one test curve at a time; here only the first entry of `dates` is used.
date = dates[0]
test_index = tests[date]

test_data = pc.get_features(test_index)
test_metadata = pc.get_metadata(test_index)

print(f"Test {date}: index:", test_index)

# Add the test curve to the plotted set only once, so that re-running this cell
# does not append (and draw) the same index again.
if test_index not in train_indices:
    train_indices.append(test_index)

# Features of the training curves together with the current test curve, without normalisation.
fig_test_features = pc.plot_features(train_indices, normalize=False)
fig_test_features.show()

# Same selection through pc.plot_features_raw, legend hidden.
fig_test_features_raw = pc.plot_features_raw(train_indices)
fig_test_features_raw.update_layout(showlegend=False)
fig_test_features_raw.show()

In [None]:
# Score the selected test curve with the trained model.
ml_ana.predict(test_data, test_metadata)

# defaults: n_tests = 1, show_scores = False
outliers = ml_ana.test_prediction_outliers()
print(outliers)

# Plot the pressure curve(s) listed in the test metadata's "index" column.
test_curve_indices = test_metadata["index"].tolist()
fig_test_curves = pc.plot_pressure_curves(test_curve_indices)
fig_test_curves.show()

# Scores as reported by ml_ana after the prediction.
fig_test_scores = ml_ana.plot_scores()
score_title = f"Test set {date} final run"
fig_test_scores.update_layout(title=score_title)
fig_test_scores.show()


In [None]:
# Plot the confidences reported by ml_ana.
# Earlier variant that used a separate evaluator object, kept for reference:
#confidence_plot = evaluator.plot_confidences()
confidence_plot = ml_ana.plot_confidences()
confidence_plot.show()

In [None]:
import os

def _append_unique_rows(csv_path, rows):
    """Write `rows` to `csv_path`, merging with the rows already stored there.

    If the file exists, its content is read, `rows` is appended, and only the
    first row per "index" value is kept (so a row already on disk wins over a
    newly appended duplicate). If the file does not exist, `rows` is written
    as-is. The DataFrame index is never written.
    """
    if os.path.exists(csv_path):
        rows = pd.concat([pd.read_csv(csv_path), rows], axis=0)
        rows = rows.drop_duplicates(subset="index", ignore_index=True)
    rows.to_csv(csv_path, index=False)


# NOTE(review): the in-place drop means this cell cannot be re-run as-is — a second
# run raises KeyError because "runtime" is already gone from test_data.
test_data.drop(columns="runtime", inplace=True)
# One row per test curve: metadata columns followed by the remaining feature columns.
data = pd.concat([test_metadata, test_data], axis=1)

# File the curve under outliers or normals according to the first prediction row.
is_outlier = outliers["class"].iloc[0] == "outlier"
_append_unique_rows("dev/outliers.csv" if is_outlier else "dev/normals.csv", data)

ml_ana.add_prediction() # add_data already calls self.train()

In [None]:
# Plot the threshold variation reported by ml_ana.
# Earlier variant that used a separate evaluator object, kept for reference:
#threshold_plot = evaluator.plot_threshold_variation()
threshold_plot = ml_ana.plot_threshold_variation()
threshold_plot.show()
# Optional export of the figure to a PNG file:
#threshold_plot.write_image("dev/figures/fig_threshold_variation_serialized_tests.png", width=1100, height= 350, scale = 3)

In [None]:
# Plot the training time reported by ml_ana.
# Earlier variant that used a separate evaluator object, kept for reference:
#train_time_plot = evaluator.plot_train_time()
train_time_plot = ml_ana.plot_train_time()
train_time_plot.show()