In [None]:
# Run extract.bat "Path\to\SignalExtraction\Tool" "Path\to\method_data\root_directory" to automate SignalExtraction for processing.

# This takes a while and prints its output only once the run completes or is interrupted, unless the subprocess library is used to capture the shell output in real time.
#!src\StreamPort\extract.bat "C:\Users\Sandeep\Desktop\SignalExtraction v.01" "C:\Users\Sandeep\Desktop\Error-LC\Method-Data"

In [None]:
import os
from src.StreamPort.device.analyses import PressureCurvesAnalyses

# Root folder that holds one sub-folder per measurement batch.
path = "C:/Users/Sandeep/Desktop/Error-LC/Method-Data"

# Keep only directories: a stray file in the root folder would otherwise make
# os.listdir(batch) raise NotADirectoryError in the loop below.
batches = [os.path.join(path, entry) for entry in os.listdir(path)]
batches = [batch for batch in batches if os.path.isdir(batch)]

# Collect every entry whose name contains ".D" from each batch folder
# (presumably the per-injection data folders; confirm against the instrument export).
error_lc_files = []
for batch in batches:
    error_lc_files.extend(
        os.path.join(batch, file) for file in os.listdir(batch) if ".D" in file
    )

#%store error_lc_files

In [None]:
# Build the analyses container from the collected paths and report the
# length of its data.
analyses = PressureCurvesAnalyses(files=error_lc_files)
n_analyses = len(analyses.data)
print("Number of analyses: ", n_analyses)

In [None]:
# Overview of all batches known to the analyses object.
# Note: this rebinds `batches`, which earlier held the batch folder paths,
# to whatever analyses.get_batches() returns.
batches = analyses.get_batches()
batch_plot = analyses.plot_batches()
batch_plot.show()
print(batches)

In [None]:
# Re-create and display the batch overview plot (same call as in the cell above).
batch_plot = analyses.plot_batches()
batch_plot.show()

In [None]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

# Feature-extraction settings. Per the original note the defaults are
# (period = 10, window_size = 7, bins = 4, crop = 2); only window_size and
# bins are overridden here.
processor = PressureCurvesMethodExtractFeaturesNative(
    window_size=4,
    bins=7,
)
processor.run(analyses)

# Keep the extraction parameters; later cells merge further settings into them.
parameters = processor.parameters
print("Feature extraction parameters: ", parameters)

In [None]:
# Select a single batch by name and plot only the analyses that belong to it.
batch = '250620_Mix-1_25x100ng-mL 2025-06-20 09-26-34'
batch_indices = analyses.get_batch_indices(batch)
fig_sel_method = analyses.plot_batches(batch_indices)

# This is the first figure export of the notebook: make sure the output folder
# exists, because writing an image into a missing directory fails.
os.makedirs("dev/figures", exist_ok=True)

fig_sel_method.write_image("dev/figures/fig_error_lc_sel_batch.png", width=1100, height= 350, scale = 3)
fig_sel_method.write_image("dev/figures/fig_error_lc_sel_batch_half.png", width=550, height= 350, scale = 3)

fig_sel_method.show()

In [None]:
#batch_indices.extend(analyses.get_batch_indices('250703_Mix-1+IS_20x100ng-mL 2025-07-03 09-29-09'))

In [None]:
# Pressure curves of the selected batch, with every trace drawn in black.
fig_curves_raw = analyses.plot_pressure_curves(batch_indices)
fig_curves_raw.update_layout(showlegend=True)
for curve in fig_curves_raw.data:
    curve.line.color = "black"

fig_curves_raw.write_image(
    "dev/figures/fig_error_lc_curves_raw.png",
    width=1100,
    height=350,
    scale=3,
)
fig_curves_raw.show()

In [None]:
# Report which curves are selected, then plot their features with every
# trace drawn in black.
print("Number of curves: ", len(batch_indices))
print(batch_indices)

fig_features = analyses.plot_features(indices=batch_indices)
fig_features.update_layout(showlegend=True)
for feature_trace in fig_features.data:
    feature_trace.line.color = "black"

fig_features.write_image(
    "dev/figures/fig_error_lc_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_features.show()

In [None]:
# Batch positions whose curves are used for training: 5, 8-11 and the
# contiguous run 18-28 (the range end is exclusive).
TRAIN_BATCH_POSITIONS = frozenset([5, 8, 9, 10, 11]) | frozenset(range(18, 29))

# Selection is by batch position only. A start_time based date cut-off had been
# sketched here but was never enabled, so the unused start_time lookup is gone.
train_indices = []
for i in batch_indices:
    meta = analyses.get_metadata(i)
    batch_position = meta["batch_position"].item()
    if batch_position in TRAIN_BATCH_POSITIONS:
        train_indices.append(i)

In [None]:
# Drop two analyses, identified by index, from the training selection.
excluded_indices = (175, 165)
train_indices = [idx for idx in train_indices if idx not in excluded_indices]

In [None]:
# Pressure curves of the training selection, with every trace drawn in black.
fig_train = analyses.plot_pressure_curves(indices=train_indices)
fig_train.update_layout(showlegend=True)
for train_curve in fig_train.data:
    train_curve.line.color = "black"

fig_train.write_image(
    "dev/figures/fig_error_lc__train.png",
    width=1100,
    height=350,
    scale=3,
)
fig_train.show()

In [None]:
# Exclude analysis 162 by filtering instead of list.remove(): remove() raises
# ValueError when the value is absent, so the cell failed when run a second time.
train_indices = [i for i in train_indices if i != 162]

In [None]:

# Fetch the training features and metadata, then export both as CSV files.
train_data = analyses.get_features(train_indices)
train_metadata = analyses.get_metadata(train_indices)

for frame, csv_path in (
    (train_data, "dev/error_lc_train_features.csv"),
    (train_metadata, "dev/error_lc_train_metadata.csv"),
):
    frame.to_csv(csv_path, index=False)

print("Number of training curves: ", len(train_indices))

In [None]:
# Everything in the selected batch that is not part of the training selection.
# Sorted so the list order does not depend on set iteration order, which is an
# implementation detail; this keeps the printout readable and any sampling from
# the list repeatable when the random generator is seeded.
rest_indices = sorted(set(batch_indices) - set(train_indices))
print("Train indices: ", len(train_indices), " ", train_indices)
print("Test indices: ", len(rest_indices), " ", rest_indices)

In [None]:
import random

# Fixed seed so that a fresh Restart & Run All draws the same test sets (and
# therefore writes the same CSV files and figures). Set to None to draw new
# random test sets on every run.
TEST_SPLIT_SEED = 42
test_rng = random.Random(TEST_SPLIT_SEED)

for i in [1, 2, 3, 4]:
    # 4 or 5 curves per test set, capped at the number of available candidates
    # because sample() raises ValueError when asked for more than exist.
    sample_size = min(test_rng.randint(4, 5), len(rest_indices))
    test_indices = test_rng.sample(rest_indices, sample_size)
    print("Test set ", i ,  " indices: ", test_indices)

    # Persist features and metadata of this test set for the prediction step.
    test_data = analyses.get_features(test_indices)
    test_metadata = analyses.get_metadata(test_indices)
    test_data.to_csv(f"dev/error_lc_test{i}_features.csv", index=False)
    test_metadata.to_csv(f"dev/error_lc_test{i}_metadata.csv", index=False)

    # Pressure curves: exported without legend, not displayed.
    fig_test_curves=analyses.plot_pressure_curves(indices = test_indices)
    fig_test_curves.update_layout(showlegend=False)
    fig_test_curves.write_image(f"dev/figures/fig_error_lc_test{i}_curves.png", width=1000, height= 350, scale = 3)
    #fig_test_curves.update_layout(showlegend=True)
    #fig_test_curves.show()

    # Features: exported without legend, then shown with title and legend.
    fig_test_features=analyses.plot_features(indices = test_indices)
    fig_test_features.update_layout(showlegend=False)
    fig_test_features.write_image(f"dev/figures/fig_error_lc_test{i}_features.png", width=1100, height= 350, scale = 3)
    fig_test_features.update_layout(title=f"Test set {i}", showlegend=True)
    fig_test_features.show()

    # Raw features: same export-then-show pattern as above.
    fig_test_features_raw = analyses.plot_features_raw(indices = test_indices)
    fig_test_features_raw.update_layout(showlegend=False)
    fig_test_features_raw.write_image(f"dev/figures/fig_error_lc_test{i}_features_raw.png", width=1100, height= 350, scale = 3)
    fig_test_features_raw.update_layout(title=f"Test set {i}", showlegend=True)
    fig_test_features_raw.show()

In [None]:
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

# Wrap the training features (as variables) and their metadata in a
# MachineLearningAnalyses object and print its summary.
iso_analyses = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(iso_analyses)

In [None]:
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn 

# Feature scaling step: record the scaler's parameters alongside the
# extraction parameters, run it on the training analyses and show the result.
scl = MachineLearningScaleFeaturesScalerSklearn()
scaling_parameters = scl.parameters
print("Scaling parameters: ", scaling_parameters)
parameters.update(scaling_parameters)

iso_analyses = scl.run(iso_analyses)

# update_layout returns the figure it was called on, so the call can be chained.
fig_train_features = iso_analyses.plot_data().update_layout(title="Train set features")
fig_train_features.show()

In [None]:
from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn 

# Isolation-forest step: record its parameters with the run parameters,
# run it on the (scaled) training analyses and plot the resulting scores.
iso = MachineLearningMethodIsolationForestSklearn()

iforest_parameters = iso.parameters
print("Isolation Forest parameters: ", iforest_parameters)
parameters.update(iforest_parameters)

iso_analyses = iso.run(iso_analyses)

fig_train_scores = iso_analyses.plot_scores()
fig_train_scores.show()

In [None]:
import pandas as pd
import os
import numpy as np
import datetime

def set_class_label(test_metadata, outliers_test):
    """Attach a textual class label to each test sample.

    Parameters
    ----------
    test_metadata : pandas.DataFrame
        Must contain an "index" column identifying each sample.
    outliers_test : pandas.DataFrame
        Must contain a boolean "outlier" column. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The "index" column of ``test_metadata`` followed by every column of
        ``outliers_test`` except "outlier", plus a "class" column holding
        "outlier" (for True) or "normal" (for False). The two inputs are
        joined column-wise, aligned on their index labels.
    """
    # Build a new frame instead of aliasing the argument: the previous version
    # rewrote the caller's DataFrame in place (dropping "outlier"), so calling
    # the function twice on the same frame raised a KeyError.
    labelled = outliers_test.drop(columns=["outlier"])
    labelled["class"] = outliers_test["outlier"].map({True: "outlier", False: "normal"})

    return pd.concat([test_metadata["index"], labelled], axis=1)

#user defined constraints determine that data should be added to the training set if it has been declared an inlier a minimum of x times with a confidence >= y. add_prediction needs to be tweaked 
# def prepare_to_add(confidence : int = None, classified_normal : int = None):
#     #error_lc_test_record holds all the tests conducted. Date is common with the logs of each test : classified samples
#     test_record = pd.read_csv("dev/error_lc_test_record.csv")
#     tests = [f for f in os.listdir(os.getcwd()) if f.endswith(".csv") and test_record["date"].any() in f]
#     print(tests)
#     search_columns = ["index", "confidence", "class"]
#     sample_list = pd.DataFrame()
#     if len(test_record.index) > 3:
#         for file in tests:
#             test = pd.read_csv(file)
#             sample_list = pd.concat([sample_list, test[search_columns]])
#     else:
#         print("Not enough evidence of true inliers! You may still add training data using add_prediction()")
#         return
#     print(sample_list)
    
#     """ FIX this logic       
#     # Total appearances of each index
#     count_all = sample_list["index"].value_counts().rename("total_count")

#     # Appearances where class == "normal"
#     is_normal = sample_list[sample_list["class"] == "normal"]
#     count_normal = is_normal["index"].value_counts().rename("normal_count")

#     # Normal entries with confidence < threshold
#     low_conf_normal = is_normal[is_normal["confidence"] < confidence]
#     count_low_conf = low_conf_normal["index"].value_counts().rename("conf_amount")
#     """
#     # Combine all into one DataFrame
#     summary = pd.concat([count_all, count_normal, count_low_conf], axis=1).fillna(0).astype(int)
#     summary = summary[summary["normal_count"] >= classified_normal]
#     return summary

# summary = prepare_to_add(70, 3)
# print(summary)

In [None]:
# Classify each of the four stored test sets with the trained model, export
# score/feature figures, and collect one parameter record per test set.
#
# NOTE(review): `threshold` starts as "auto" but is rebound to a number inside
# the loop, so from the second iteration on the value passed to
# test_prediction_outliers() is the (possibly raised) threshold left over from
# the previous iteration - confirm this carry-over is intended.
threshold = "auto"
parameters_df = []
for i in [1, 2, 3, 4]:
    # Timestamp used in file names; ":" and "." are replaced by "-".
    now = datetime.datetime.now().isoformat()
    now = now.replace(":", "-").replace(".", "-")

    # Reload the test set written earlier and run the prediction on it.
    test_data = pd.read_csv(f"dev/error_lc_test{i}_features.csv")
    test_metadata = pd.read_csv(f"dev/error_lc_test{i}_metadata.csv")
    iso_analyses.predict(test_data, test_metadata)
    outliers_test = iso_analyses.test_prediction_outliers(threshold)
    
    # Combine sample index, outlier columns and a textual "class" label.
    classified_test_metadata = set_class_label(test_metadata, outliers_test)
    classified_test_metadata["date"] = now

    print("Test set ", i, ": \n", test_metadata["index"].tolist()) 
    print("\n")
    
    # Start this test set's record from a copy of the workflow parameters.
    test_batch_parameters = parameters.copy()
    test_set = len(test_metadata)

    train_scores = iso_analyses.get_training_scores()
    train_set = len(train_scores)

    # NOTE(review): reads the threshold from the SECOND row (.iloc[1]); this
    # raises IndexError for a one-row test set. Presumably every row carries
    # the same threshold, in which case .iloc[0] would be safer - confirm.
    threshold = classified_test_metadata["threshold"].iloc[1]
    num_outliers = sum(classified_test_metadata["class"] == "outlier")
    # Share of samples labelled "outlier", in percent.
    percent_outliers = (sum(classified_test_metadata["class"] == "outlier")/len(classified_test_metadata["class"]))*100

    # The score figure is exported before the title is added, so the saved
    # image has no title while the displayed one does.
    fig_test_scores = iso_analyses.plot_scores(threshold)
    fig_test_scores.write_image(f"dev/figures/fig_error_lc_test{i}_{now}_scores.png", width=1100, height= 350, scale = 3)
    fig_test_scores.update_layout(title=f"Test set {i}")
    fig_test_scores.show()

    fig_test_features = iso_analyses.plot_data()
    fig_test_features.write_image(f"dev/figures/fig_error_lc_test{i}_{now}_features.png", width=1100, height= 350, scale = 3)
    #fig_test_features.show()

    #optionally add seen normal curves to train set
    #iso_analyses.add_prediction()

    # When no sample was flagged, raise the threshold by half a standard
    # deviation of the training scores. The raised value is what gets recorded
    # below and what the next iteration passes to test_prediction_outliers().
    if num_outliers < 1:
        threshold += 0.5*np.std(train_scores)

    test_batch_parameters.update(
        {
        "date" : now,
        "test_batch" : i,
        "train_set" : train_set, 
        "test_set" : test_set,
        "threshold" : threshold,
        "outliers" : num_outliers,
        "outliers_percent" : percent_outliers
        }
    )
    parameters_df.append(test_batch_parameters)

    # NOTE(review): the file name embeds the per-iteration timestamp `now`, so
    # the merge branch below only runs if a file with the identical timestamp
    # already exists - confirm whether a timestamp-free name was intended.
    classification_record = f"dev/error_lc_test{i}_{now}_classified_samples.csv"
    if os.path.exists(classification_record):
        old_records = pd.read_csv(classification_record)
        classified_test_metadata = pd.concat([old_records, classified_test_metadata])
        classified_test_metadata.drop_duplicates(subset=["index", "date"], keep="last", inplace=True)
    classified_test_metadata.to_csv(classification_record, index=False)

In [None]:

# Append this run's per-test-set records to the persistent CSV record,
# keeping only the latest row for each (test_batch, date) pair.
record_path = "dev/error_lc_test_record.csv"

parameters_df = pd.DataFrame(parameters_df)
if os.path.exists(record_path):
    old_records = pd.read_csv(record_path)
    parameters_df = pd.concat([old_records, parameters_df]).drop_duplicates(
        subset=["test_batch", "date"], keep="last"
    )

parameters_df.to_csv(record_path, index = False)
print("Workflow parameters: \n", parameters_df)

In [None]:
import plotly.graph_objects as go
# Load the accumulated test record if one has been written; otherwise the
# figure below stays empty.
record_path = "dev/error_lc_test_record.csv"
if os.path.exists(record_path):
    test_record = pd.read_csv(record_path)
else:
    test_record = None

fig = go.Figure()

if test_record is not None:
    row_numbers = range(len(test_record))

    # Threshold versus training-set size: dashed red line on the left axis.
    threshold_hover = [
        "<br>Threshold: " + str(test_record["threshold"][i]) +
        "<br>Outliers %: " + str(test_record["outliers_percent"][i])
        for i in row_numbers
    ]
    threshold_trace = go.Scatter(
        x=test_record["train_set"],
        y=test_record["threshold"],
        mode="lines+markers",
        name="Threshold",
        yaxis="y1",
        hovertemplate=threshold_hover,
        line={"color": "red", "width": 2, "dash": 'dash'},
        marker={"size": 8, "symbol": "circle"},
    )
    fig.add_trace(threshold_trace)

    # Outlier count versus training-set size: blue bars on the right axis.
    outlier_hover = [
        "<br>Train Set: " + str(test_record["train_set"][i]) +
        "<br>Test Set: " + str(test_record["test_set"][i]) +
        "<br>Outliers: " + str(test_record["outliers"][i])
        for i in row_numbers
    ]
    outlier_trace = go.Bar(
        x=test_record["train_set"],
        y=test_record["outliers"],
        name="Outliers",
        yaxis="y2",
        width=0.05,
        marker_color="blue",
        hovertemplate=outlier_hover,
    )
    fig.add_trace(outlier_trace)

# Left axis (threshold) in red, right axis (outlier count) in blue.
threshold_axis = {
    "title": {"text": "Threshold", "font": {"color": "red"}},
    "tickfont": {"color": "red"},
}
outlier_axis = {
    "title": {"text": "Number of Outliers", "font": {"color": "blue"}},
    "tickfont": {"color": "blue"},
    "overlaying": "y",
    "side": "right",
}
# Horizontal legend centred above the plot area.
legend_style = {
    "x": 0.5,
    "y": 1.1,
    "xanchor": "center",
    "yanchor": "top",
    "orientation": "h",
    "bgcolor": "rgba(255,255,255,0.5)",
    "borderwidth": 1,
}

fig.update_layout(
    title="Threshold and Outliers vs. Training Set Size",
    xaxis={"title": "Number of Training Curves"},
    yaxis=threshold_axis,
    yaxis2=outlier_axis,
    bargap=1,
    template="simple_white",
    legend=legend_style,
)
fig.write_image(
    "dev/figures/fig_error_lc_threshold_variation.png",
    width=1100,
    height=350,
    scale=3,
)
fig.show()
