Methods showcased in dev_pressure_curves facilitate fine-tuned and automated data transfer to dev_machine_learning 

In [None]:
#execute related notebooks within the scope of the current one to use their variables without the need for individual imports or file creation
#%run dev_pressure_curves.ipynb dev_machine_learning.ipynb

In [53]:
#Here, we import pressure curve files to run through a simple workflow. %store -[OPTION] var to -r : retrieve, -d : delete, -z : clear all  
%store -r files

In [54]:
from src.StreamPort.device.analyses import PressureCurvesAnalyses

ana = PressureCurvesAnalyses(files=files)
print("Number of analyses: ", len(ana.data))
print("Methods: ", ana.get_methods())

Number of analyses:  374
Methods:  ['SAA_411_5FU.M', 'SAA_411_Pac.M', 'SAA_411_Doc.M', 'SAA_411_Irino.M', 'SAA_411_Gem.M']


In [55]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative

processor = PressureCurvesMethodExtractFeaturesNative()
ana = processor.run(ana)
#keep track of hyperparameters and compare the effectivity of ML with different combinations
parameters = processor.parameters
print("Feature extraction parameters: ", parameters)

Feature extraction parameters:  {'period': 10, 'window_size': 7, 'bins': 4, 'crop': 2}


In [56]:
method = "SAA_411_Pac.M"
method_indices = ana.get_method_indices(method)
#%store ana

In [57]:
import datetime
date_threshold = "2021-08-18"
date_threshold = datetime.datetime.strptime(date_threshold, "%Y-%m-%d")

train_indices = []
for i in method_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if batch_position > 4 and start_time < date_threshold:
        train_indices.append(i)

train_data = ana.get_features(train_indices)
train_metadata = ana.get_metadata(train_indices)
train_data.to_csv("dev/workflow_train_features.csv", index=False)
train_metadata.to_csv("dev/workflow_train_metadata.csv", index=False)
print("Number of training curves: ", len(train_indices))

Number of training curves:  20


In [58]:
fig_train=ana.plot_pressure_curves(indices = train_indices)
fig_train.update_layout(showlegend=False)
fig_train.show()
#train_indices.extend([239, 245, 266, 117])# 117 is interesting

In [59]:
train_data.describe()

Unnamed: 0,area,pressure_max,pressure_min,pressure_mean,pressure_std,pressure_range,runtime,residual_noise,residual_std,roc_0.0_0.995,abs_deviation_0.0_0.995,roc_1.0_1.995,abs_deviation_1.0_1.995,roc_2.0_2.995,abs_deviation_2.0_2.995,roc_3.0_3.975,abs_deviation_3.0_3.975
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,215.491688,71.001,36.0085,54.214604,13.196363,34.9925,239.7,26.1319,0.200198,0.167,0.714036,-0.2595,0.696429,-0.275,0.680667,0.4615,0.690631
std,0.64409,0.259349,0.107276,0.161953,0.054923,0.236128,0.0,1.22639,0.009362,0.010311,0.082857,0.006863,0.074012,0.00513,0.067338,0.007452,0.074876
min,214.57035,70.47,35.87,53.9825,13.089369,34.55,239.7,23.547442,0.175556,0.15,0.590476,-0.27,0.565238,-0.28,0.573333,0.45,0.546667
25%,215.095069,70.84,35.965,54.115173,13.163849,34.845,239.7,25.386692,0.195515,0.16,0.662143,-0.26,0.639881,-0.28,0.637381,0.46,0.652738
50%,215.3632,70.995,35.98,54.182456,13.204293,34.98,239.7,25.951841,0.201646,0.16,0.685952,-0.26,0.695238,-0.275,0.672143,0.46,0.700952
75%,215.724181,71.15,36.05,54.27342,13.23046,35.1325,239.7,27.069435,0.206173,0.1725,0.761726,-0.2575,0.726429,-0.27,0.71619,0.47,0.719524
max,217.269175,71.51,36.34,54.661332,13.295331,35.41,239.7,27.838527,0.213708,0.19,0.863333,-0.25,0.834286,-0.27,0.820952,0.47,0.874286


In [60]:
rest_indices = list(set(method_indices) - set(train_indices))
print("Total curves: ", len(method_indices), "\t", method_indices, "\n")
print("Train curves: ",  len(train_indices), "\t", train_indices, "\n")
print("Remaining: ", len(rest_indices), "\t", rest_indices, "\n")

Total curves:  93 	 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279] 

Train curves:  20 	 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60] 

Remaining:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247] 



In [61]:
test_source_indices = [ ]
for i in rest_indices:
    meta = ana.get_metadata(i)
    batch_position = meta["batch_position"].item()
    if  batch_position > 0:
        test_source_indices.append(i)
print("Selected test superset: ", len(test_source_indices), "\t", test_source_indices)

Selected test superset:  73 	 [266, 267, 268, 269, 14, 15, 16, 17, 275, 276, 277, 278, 279, 47, 48, 49, 50, 270, 271, 272, 273, 274, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]


In [65]:
test_dates = []
import datetime
import random
for i in [1, 2, 3]:
    now = datetime.datetime.now().isoformat()
    now = now.replace(":", "-").replace(".", "-").replace("T", " ")
    test_dates.append(now)

    test_indices = random.sample(test_source_indices, random.randint(5, 8))
    print("Test set ", now ,  " indices: ", test_indices)

    test_data = ana.get_features(test_indices)
    test_metadata = ana.get_metadata(test_indices)
    test_data.to_csv(f"dev/test_{now}_features.csv", index=False)
    test_metadata.to_csv(f"dev/test_{now}_metadata.csv", index=False)

    fig_test_curves=ana.plot_pressure_curves(indices = test_indices)
    fig_test_curves.update_layout(showlegend=False)
    fig_test_curves.write_image(f"dev/figures/fig_test_{now}_curves.png", width=1000, height= 350, scale = 3)
    fig_test_curves.update_layout(showlegend=True)
    fig_test_curves.show()

    fig_test_features=ana.plot_features(indices = test_indices, normalize = False)
    fig_test_features.update_layout(showlegend=False)
    fig_test_features.write_image(f"dev/figures/fig_test_{now}_features.png", width=1100, height= 350, scale = 3)
    fig_test_features.update_layout(title=f"Test set {now}", showlegend=True)
    fig_test_features.show()
    
    fig_test_features_raw = ana.plot_features_raw(indices = test_indices)
    fig_test_features_raw.update_layout(showlegend=False)
    fig_test_features_raw.write_image(f"dev/figures/fig_test_{now}_features_raw.png", width=1100, height= 350, scale = 3)
    fig_test_features_raw.update_layout(title=f"Test set {now}", showlegend=True)
    fig_test_features_raw.show()


test_set = test_data
test_set_meta = test_metadata
#%store test_set test_set_meta

Test set  2025-08-05 14-28-31-846792  indices:  [239, 246, 245, 152, 14, 234]


Test set  2025-08-05 14-28-40-573333  indices:  [271, 48, 167, 158, 125, 276, 164, 277]


Test set  2025-08-05 14-28-47-775926  indices:  [157, 121, 116, 240, 15, 236]


In [70]:
import pandas as pd
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

ml_ana = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(ml_ana)


MachineLearningAnalyses 
  variables: 20 rows, 17 columns
  metadata: 20 rows, 14 columns



In [71]:
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn

scaler = MachineLearningScaleFeaturesScalerSklearn(scaler_type = "StandardScaler")
scaling_parameters = scaler.parameters
print("Scaling parameters: ", scaling_parameters)
parameters.update(scaling_parameters)

ml_ana = scaler.run(ml_ana)
fig_train_features = ml_ana.plot_data()
fig_train_features.update_layout(title="Train set features")
fig_train_features.show()

Scaling parameters:  {'type': 'StandardScaler'}


In [72]:

from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn

iforest = MachineLearningMethodIsolationForestSklearn()
iforest_parameters = iforest.parameters
print("Isolation Forest parameters: ", iforest_parameters)
parameters.update(iforest_parameters)

ml_ana = iforest.run(ml_ana)
ml_ana.train()
fig_train_scores = ml_ana.plot_scores()
fig_train_scores.show()

Isolation Forest parameters:  {'n_estimators': 100, 'max_samples': 'auto', 'contamination': 'auto', 'max_features': 1, 'bootstrap': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [74]:
import os

parameters_df = []
for i in test_dates:
    test_data = pd.read_csv(f"dev/test_{i}_features.csv")
    test_metadata = pd.read_csv(f"dev/test_{i}_metadata.csv")
    ml_ana.predict(test_data, test_metadata)
    outliers_test = ml_ana.test_prediction_outliers()
    
    classified_test_metadata = outliers_test
    classified_test_metadata["date"] = i
    classified_test_metadata.to_csv(f"dev/test_{i}_classified_samples.csv", index=False)
    print("Test set ", i, ": \n", test_metadata["index"].tolist()) 
    print("\n")
    
    test_batch_parameters = parameters.copy()
    test_set = len(classified_test_metadata)
    train_set = len(ml_ana.get_training_scores())
    threshold = classified_test_metadata["threshold"][1]
    num_outliers = sum(classified_test_metadata["class"] == "outlier")
    percent_outliers = (sum(classified_test_metadata["class"] == "outlier")/len(classified_test_metadata["class"]))*100

    fig_test_scores = ml_ana.plot_scores()
    fig_test_scores.write_image(f"dev/figures/fig_test_{i}_scores.png", width=1100, height= 350, scale = 3)
    fig_test_scores.update_layout(title=f"Test set {i}")
    fig_test_scores.show()

    fig_test_features = ml_ana.plot_data()
    fig_test_features.write_image(f"dev/figures/fig_test_{i}_features.png", width=1100, height= 350, scale = 3)
    #fig_test_features.show()

    test_batch_parameters.update(
        {
        "date" : i,
        "train_set" : train_set, 
        "test_set" : test_set,
        "threshold" : threshold,
        "outliers" : num_outliers,
        "outliers_percent" : percent_outliers
        }
    )
    parameters_df.append(test_batch_parameters)

    #optionally add seen normal curves to train set
    #ml_ana.add_prediction()

parameters_df = pd.DataFrame(parameters_df)
if os.path.exists("dev/test_record.csv"):
    old_records = pd.read_csv("dev/test_record.csv")
    parameters_df = pd.concat([old_records, parameters_df])
    parameters_df.drop_duplicates(subset="date", keep="last", inplace=True)
    
parameters_df.to_csv("dev/test_record.csv", index = False)
print("Workflow parameters: \n", parameters_df)

Test set  2025-08-05 14-28-31-846792 : 
 [239, 246, 245, 152, 14, 234]




Test set  2025-08-05 14-28-40-573333 : 
 [271, 48, 167, 158, 125, 276, 164, 277]




Test set  2025-08-05 14-28-47-775926 : 
 [157, 121, 116, 240, 15, 236]




Workflow parameters: 
     period  window_size  bins  crop            type  n_estimators max_samples  \
0       10            7     4     2    MinMaxScaler           100        auto   
1       10            7     4     2    MinMaxScaler           100        auto   
2       10            7     4     2    MinMaxScaler           100        auto   
3       10            7     4     2    MinMaxScaler           100        auto   
4       10            7     4     2    MinMaxScaler           100        auto   
5       10            7     4     2    MinMaxScaler           100        auto   
6       10            7     4     2    MinMaxScaler           100        auto   
7       10            7     4     2    MinMaxScaler           100        auto   
8       10            7     4     2    MinMaxScaler           100        auto   
9       10            7     4     2    MinMaxScaler           100        auto   
10      10            7     4     2    MinMaxScaler           100        auto   
11   

In [75]:
# this plot can be migrated to native method of EvaluateModelStability
import plotly.graph_objects as go

test_record = pd.read_csv("dev/test_record.csv") if os.path.exists("dev/test_record.csv") else None
test_record = test_record.sort_values("date")

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:
        result_logs.append(f"dev/test_{date}_classified_samples.csv") if os.path.exists(f"dev/test_{date}_classified_samples.csv") else print(f"No records for {date}")
    
    result_logs = [pd.read_csv(log) for log in result_logs]

    for log in result_logs:
        log["date"] = log["date"].astype(str).str[:19].str.replace("T", " ")

    result_logs = [log.to_string(index=False).replace("\n", "<br>") for log in result_logs]

    #test_record["date"] = pd.to_datetime(test_record["date"], format="%Y-%m-%dT%H-%M-%S-%f").dt.floor("s")
    #width = 600 
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=test_record["date"],
            y=test_record["threshold"],
            mode="lines+markers",
            name="Threshold",
            yaxis="y1",  
            hovertemplate=[
                "<br>Threshold: " + str(test_record["threshold"][i])  
                for i in range(len(test_record))
            ],
            line=dict(color="red", width=2, dash='dash'),
            marker=dict(size=8, symbol="circle")
        )
    )

    fig.add_trace(
        go.Bar(
            x=test_record["date"],
            y=test_record["outliers"],
            name="Outliers",
            yaxis="y2",
            width = 0.15, #width
            marker_color="blue",
            hovertext=result_logs,
            hoverinfo="text"
        )
    )

    fig.add_trace(
        go.Scatter(
            x=test_record["date"],
            y=test_record["train_set"], 
            mode="lines+markers",
            name="Training curves",
            yaxis="y2",
            hovertemplate=[
                "<br>Training samples: " + str(test_record["train_set"][i]) +
                "<br>Test samples: " + str(test_record["test_set"][i]) +
                "<br>Outliers: " + str(test_record["outliers"][i]) +
                "<br>Outliers %: " + str(test_record["outliers_percent"][i]) 
                for i in range(len(test_record))
            ],
            line=dict(color="green", width=2, dash='solid'),
            marker=dict(size=8, symbol="star")
        )
    )

fig.update_layout(
    title="Detection accuracy over Test Runs and Training Set Size",
    xaxis=dict(
        tickvals=test_record["date"], 
        ticktext=[d[:19].replace("T", " ") for d in test_record["date"]], 
        tickangle=45,
        title="Test Dates"
    ),
    yaxis=dict(  
        title=dict(text="Threshold", font=dict(color="red")),
        tickfont=dict(color="red")
    ),
    yaxis2=dict(#train set and outlier realistic in scale. Uncomment y3 to adjust 
        title=dict(text="Outliers", font=dict(color="blue")),
        tickfont=dict(color="blue"),
        overlaying="y",
        side="right"
    ),
    bargap = 1, 
    template="simple_white",
    legend=dict(
        x=0.5,
        y=1.1,
        xanchor="center",
        yanchor="top",
        orientation="h",  
        bgcolor="rgba(255,255,255,0.5)", 
        borderwidth=1  
    )
)
fig.write_image("dev/figures/fig_threshold_variation.png", width=1100, height= 350, scale= 3)
fig.show()


In [76]:
# join old train and test data with labels to be passed into knn
old_train_data = pd.read_csv("dev/workflow_train_features.csv")
old_train_metadata = pd.read_csv("dev/workflow_train_metadata.csv")
print("Old train features: ", old_train_data.shape)

old_train_metadata["label"] = "normal"
metadata_columns = old_train_metadata.columns
print("Old train metadata: ", old_train_metadata.shape)

# runtime column exists in both data and metadata
old_train_data.drop(columns="runtime", inplace=True)
old_train_data = pd.concat([old_train_metadata, old_train_data], axis = 1)
print("Labeled Normal data: ", old_train_data.shape) 
#old_train_data.head()
#  
print("Labeled normal metadata: ", old_train_metadata.shape)
#old_train_data[metadata_columns].head()

Old train features:  (20, 17)
Old train metadata:  (20, 15)
Labeled Normal data:  (20, 31)
Labeled normal metadata:  (20, 15)


In [77]:
import os

test_files = []
test_meta = []

#collect IForest test records
path_to_test_records = "dev/test_record.csv"

test_record = pd.read_csv(path_to_test_records) if os.path.exists(path_to_test_records) else None
test_record = test_record.sort_values("date") if test_record is not None else None

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:

        filepath = f"dev/test_{date}_classified_samples.csv"
        result_logs.append(filepath) if os.path.exists(filepath) else print(f"No records for {date}")

        featurepath = f"dev/test_{date}_features.csv"
        test_files.append(featurepath) if os.path.exists(featurepath) else print(f"No records for {date}")
        
        metapath = f"dev/test_{date}_metadata.csv"
        test_meta.append(metapath) if os.path.exists(metapath) else print(f"No records for {date}")
        
else:
    print("Not enough evidence of true inliers! Please run more tests")

In [78]:
summary = pd.DataFrame()
data = pd.DataFrame()
metadata = pd.DataFrame()

# combine data and their labels from tests
for i in range(len(test_files)):
    feature_df = pd.read_csv(test_files[i])
    data = pd.concat([data, feature_df], ignore_index=True)

    log = pd.read_csv(result_logs[i])
    summary = pd.concat([summary, log], ignore_index=True)

    meta = pd.read_csv(test_meta[i])
    metadata = pd.concat([metadata, meta], ignore_index=True)

# data from test records are newly labelled. Classification results over tests will be averaged using EvaluateModelStability

print(data.shape, metadata.shape)
data.drop(columns="runtime", inplace=True)
new_data = pd.concat([metadata, data], axis = 1)# fix bug with double runtime appearance or fix method to handle. watch out for column move 2 cells down
print("New_data: ", new_data.shape)

new_data = new_data.sort_values("index")
new_data.drop_duplicates(subset = "index", inplace=True)
new_data.reset_index(drop=True, inplace=True)

summary = summary.sort_values("index")
summary.reset_index(drop=True, inplace = True)
summary["algorithm"] = "iforest"
summary.to_csv("dev/test_summary.csv", index=False)
print("Test summary: ")
summary.head()

(189, 17) (189, 14)
New_data:  (189, 30)
Test summary: 


Unnamed: 0,index,threshold,score,confidence,class,date,algorithm
0,14,-0.080903,-0.111895,1.38,outlier,2025-07-30 10-59-59-838221,iforest
1,14,-0.079961,-0.148079,1.85,outlier,2025-08-05 12-02-13-508108,iforest
2,14,-0.079269,-0.119173,1.5,outlier,2025-08-05 14-28-31-846792,iforest
3,15,-0.071634,-0.100461,1.4,outlier,2025-08-05 11-56-51-704847,iforest
4,15,-0.078787,-0.114911,1.46,outlier,2025-08-05 11-50-40-877905,iforest


In [79]:
#%%timeit # ~1.9s +- ~30ms current setup
# EvaluateModelStability also prepares data for transfer from unsupervised to supervised learning by assigning true class labels
from src.StreamPort.machine_learning.methods import MachineLearningEvaluateModelStabilityNative

model_eval = MachineLearningEvaluateModelStabilityNative(test_records=summary)
results = model_eval.run()
true_classes = results["true_classes"] 
stability_score = results["stability_score"] 

print("Model stability score: ", stability_score)

confidence_plot = model_eval.plot_confidences()
confidence_plot.write_image("dev/figures/fig_model_stability.png", width=1100, height= 350, scale = 3)
confidence_plot.update_layout(showlegend=True)
confidence_plot.show()

Some samples are unverified. Setting true_classes to majority class
Model stability score:  0.7868348055262132


In [80]:
to_remove_from_tests = [116, 117, 121, 129, 132, 236, 272]
%store to_remove_from_tests

Stored 'to_remove_from_tests' (list)


In [81]:
# sorted. ensure identical column order 
new_data["label"] = true_classes["class_true"]
col = new_data.pop("label")
new_data.insert(14, "label", col)
print("New data: ", new_data.shape) 
new_data[metadata_columns].head()

New data:  (70, 31)


Unnamed: 0,index,name,path,batch,batch_position,idle_time,sample,method,timestamp,detector,pump,start_time,end_time,runtime,label
0,14,210812_Pac,C:/Users/Sandeep/Desktop/ExtractedSignals\2108...,210812_Pac 2021-08-12 10-30-07,1,142.0,Blank,SAA_411_Pac.M,2021-08-12 10:30:12,G7110B,G7110B,2021-08-12 10:32:20,2021-08-12 10:36:21,239.7,outlier
1,15,210812_Pac--002,C:/Users/Sandeep/Desktop/ExtractedSignals\2108...,210812_Pac 2021-08-12 10-30-07,2,39.0,Blank,SAA_411_Pac.M,2021-08-12 10:36:26,G1315C,G7110B,2021-08-12 10:37:00,2021-08-12 10:41:00,239.7,outlier
2,16,210812_Pac--003,C:/Users/Sandeep/Desktop/ExtractedSignals\2108...,210812_Pac 2021-08-12 10-30-07,3,39.0,Blank,SAA_411_Pac.M,2021-08-12 10:41:05,G1315C,G7110B,2021-08-12 10:41:39,2021-08-12 10:45:39,239.7,outlier
3,47,210813_Pac,C:/Users/Sandeep/Desktop/ExtractedSignals\2108...,210813_Pac 2021-08-13 10-37-27,1,5673.0,Blank,SAA_411_Pac.M,2021-08-13 10:37:33,G7110B,G7110B,2021-08-13 10:39:51,2021-08-13 10:43:51,239.7,outlier
4,48,210813_Pac--002,C:/Users/Sandeep/Desktop/ExtractedSignals\2108...,210813_Pac 2021-08-13 10-37-27,2,38.0,Blank,SAA_411_Pac.M,2021-08-13 10:43:56,G1315C,G7116B,2021-08-13 10:44:29,2021-08-13 10:48:30,239.7,normal


In [82]:
#combine all available labeled data, features and metadata included 
old_train_data = old_train_data.reset_index(drop=True)
new_data.reset_index(drop=True, inplace=True)

labeled_data = pd.concat([old_train_data, new_data], ignore_index=True)

# drop unclassified samples
labeled_data = labeled_data[labeled_data["label"] != "not set"]

metadata = labeled_data[metadata_columns]
labels = metadata["label"]
labeled_data = labeled_data.drop(columns= metadata_columns)
print("KNN Training Data: ", labeled_data.shape)

KNN Training Data:  (90, 16)


In [83]:
# create random splits of training and test sets. Shuffling and train-test-splitting with 'stratify' ensure a good distribution of classes so KNN learning is effective
from sklearn.model_selection import train_test_split
features_train, features_test, metadata_train, metadata_test = train_test_split(labeled_data, metadata, test_size=0.3, stratify=metadata["label"])

#%store features_train features_test metadata_train metadata_test

In [84]:

from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses
knn_ana = MachineLearningAnalyses(variables=features_train, metadata=metadata_train)

from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn
scl = MachineLearningScaleFeaturesScalerSklearn(scaler_type="StandardScaler")
knn_ana = scl.run(knn_ana)

from src.StreamPort.machine_learning.methods import MachineLearningMethodNearestNeighboursClassifierSklearn
knn = MachineLearningMethodNearestNeighboursClassifierSklearn(n_neighbors=1) #odd number for binary classification. 1 usually causes overfit, could be suitable for small amount of data
knn_ana = knn.run(knn_ana)

print(knn_ana)


NearestNeighboursAnalyses 
  variables: 63 rows, 16 columns
  metadata: 63 rows, 15 columns



In [85]:
knn_ana.plot_data()

In [86]:
knn_ana.train()
train_classes = knn_ana.get_training_labels()
print(train_classes)

    index    label
58    165   normal
13     54   normal
21     15  outlier
34    123   normal
5      23   normal
..    ...      ...
71    243   normal
25     49  outlier
44    150   normal
17     58   normal
28    117   normal

[63 rows x 2 columns]


In [87]:
train_labels = knn_ana.get_true_labels("train")
print("True labels: ",train_labels.shape)#, " ", train_labels)
print("Metadata labels: ", metadata_train[["index","label"]].shape)#, " ", metadata_train[["index","label"]])
if not train_labels.equals(train_classes):
    print("mismatch")
else:
    print("match")

True labels:  (63, 2)
Metadata labels:  (63, 2)
match


In [91]:
import umap
import numpy as np
import sklearn.preprocessing as scaler
import plotly.graph_objects as go

scaler = scaler.StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)

# UMAP for dimensionality reduction
umap_train_model = umap.UMAP(n_components=2)
features_test_reduced_umap = umap_train_model.fit_transform(features_train_scaled)

# Prepare the reduced data frame
reduced_data_umap = pd.DataFrame(features_test_reduced_umap, columns=['UMAP-1', 'UMAP-2'])
reduced_data_umap["predicted_labels"] = train_classes["label"]
reduced_data_umap["true_labels"] = train_labels["label"]
reduced_data_umap["index"] = train_labels["index"]

# Convert labels to binary values (0 for normal, 1 for outlier)
#reduced_data_umap["predicted_labels"] = np.where(reduced_data_umap["predicted_labels"] == "normal", 0, 1)
#reduced_data_umap["true_labels"] = np.where(reduced_data_umap["true_labels"] == "normal", 0, 1)

# Initialize the UMAP plot
fig_umap = go.Figure()

# Plot true labels (blue circles)
for i in range(len(reduced_data_umap)):
    if reduced_data_umap["true_labels"][i] == "outlier":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="blue",
                    size=10,
                    opacity=0.7,
                    symbol="circle",
                ),
                name="True labels",
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

    elif reduced_data_umap["true_labels"][i] == "normal":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="white",  # Color for normal labels
                    size=10,
                    opacity=0.4,
                    symbol="circle",
                ),
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

# Plot predicted labels (red 'x's)
for i in range(len(reduced_data_umap)):
    if reduced_data_umap["predicted_labels"][i] == "outlier":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="red",
                    size=8,
                    opacity=0.5,
                    symbol="x",
                ),
                name="Predicted Labels",
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

    elif reduced_data_umap["predicted_labels"][i] == "normal":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="white",  # Color for normal predicted labels
                    size=8,
                    opacity=0.4,
                    symbol="x",
                ),
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

# Update layout for the UMAP plot
fig_umap.update_layout(
    title="KNN Train Data Classification using UMAP reduction",
    xaxis_title="UMAP-1",
    yaxis_title="UMAP-2",
    template="plotly_dark",
    showlegend=False,
    hovermode="closest"
)

fig_umap.show()

In [88]:
knn_ana.predict(features_test, metadata_test)
classes = knn_ana.get_prediction_labels()
classes.reset_index(drop=True, inplace=True)
print("Predicted classes: ", classes.shape)#, " ", classes)

Predicted classes:  (27, 2)


In [89]:
true_labels = knn_ana.get_true_labels()
true_labels.reset_index(drop=True, inplace=True)
print("True classes: ", true_labels.shape)#, " ", true_labels)

True classes:  (27, 2)


In [90]:
# probabilities will sub for kNN where confidence was used for iForest 
probs = knn_ana.get_prediction_probabilities()
# print("Prediction probabilities: ", probs) # n_neighbours = 1 is not ideal but provides best results for small datasets. Use GridSearchCV (see dev_model_comparison) to find best params

In [None]:
# from sklearn.manifold import TSNE


# n_samples = len(features_test_scaled)
# tsne = TSNE(
#     perplexity=n_samples - 1 if 30 > n_samples else 30,
#     n_components=2, 
#     random_state=42
#     )
# features_test_reduced = tsne.fit_transform(features_test_scaled)

# print(f"Scaled Data NaNs: {pd.isna(features_test_scaled).sum()}")

# reduced_data = pd.DataFrame(features_test_reduced, columns=['TSNE-1', 'TSNE-2'])

# reduced_data["predicted_labels"] = classes["label"]
# reduced_data["true_labels"] = true_labels["label"]
# reduced_data["index"] = true_labels["index"]
# print(reduced_data)


# reduced_data["predicted_labels"] = np.where(reduced_data["predicted_labels"] == "normal", 0, 1)
# reduced_data["true_labels"] = np.where(reduced_data["true_labels"] == "normal", 0, 1)
# print(f"Classes: {classes.shape}")
# print(f"True labels: {true_labels.shape}")
# print(f"Reduced data: {reduced_data.shape}")

# fig = go.Figure()

# for i in range(len(reduced_data)):
#     if reduced_data["true_labels"][i] == 1:
#         fig.add_trace(
#             go.Scatter(
#                 x=[reduced_data["TSNE-1"][i]],  
#                 y=[reduced_data["TSNE-2"][i]],  
#                 mode="markers",
#                 marker=dict(
#                     color="blue",  
#                     size=10,
#                     opacity=0.7,
#                     symbol="circle",
#                 ),
#                 name="True labels",
#                 hovertext=f"Index: {reduced_data['index'][i]}<br>Predicted label: {reduced_data['predicted_labels'][i]}<br>True label: {reduced_data['true_labels'][i]}"
#             )
#         )

#     if reduced_data["predicted_labels"][i] == 1:
#         fig.add_trace(
#             go.Scatter(
#                 x=[reduced_data["TSNE-1"][i]], 
#                 y=[reduced_data["TSNE-2"][i]], 
#                 mode="markers",
#                 marker=dict(
#                     color="red",  
#                     size=8,
#                     opacity=0.5,
#                     symbol="x",
#                 ),
#                 name="Predicted Labels",
#                 hovertext=f"Index: {reduced_data['index'][i]}<br>Predicted label: {reduced_data['predicted_labels'][i]}<br>True label: {reduced_data['true_labels'][i]}"
#             )
#         )

#     else:
#          fig.add_trace(
#             go.Scatter(
#                 x=[reduced_data["TSNE-1"][i]],  # Only plot the current row
#                 y=[reduced_data["TSNE-2"][i]],  # Only plot the current row
#                 mode="markers",
#                 marker=dict(
#                     color="white",  # White color for true labels with value 0
#                     size=10,
#                     opacity=0.4,
#                     symbol="circle",
#                 ),
#                 hovertext=f"Index: {reduced_data['index'][i]}<br>Predicted label: {reduced_data['predicted_labels'][i]}<br>True label: {reduced_data['true_labels'][i]}"
#             )
#         )        

# fig.update_layout(
#     title = "KNN classification results using t-SNE reduction",
#     xaxis_title = "TSNE-1",
#     yaxis_title = "TSNE-2",
#     template = "plotly_dark",
#     showlegend = False,
#     hovermode = "closest"
# )

# fig.show()

In [None]:
# UMAP for dimensionality reduction while maintaining local relationships
umap_test_model = umap.UMAP(n_components=2)
features_test_scaled = scaler.fit_transform(features_test)
features_test_reduced_umap = umap_test_model.fit_transform(features_test_scaled)

# Prepare the reduced data frame
reduced_data_umap = pd.DataFrame(features_test_reduced_umap, columns=['UMAP-1', 'UMAP-2'])
reduced_data_umap["predicted_labels"] = classes["label"]
reduced_data_umap["true_labels"] = true_labels["label"]
reduced_data_umap["index"] = true_labels["index"]

# Convert labels to binary values (0 for normal, 1 for outlier)
#reduced_data_umap["predicted_labels"] = np.where(reduced_data_umap["predicted_labels"] == "normal", 0, 1)
#reduced_data_umap["true_labels"] = np.where(reduced_data_umap["true_labels"] == "normal", 0, 1)

# Initialize the UMAP plot
fig_umap = go.Figure()

# Plot true labels (blue circles)
for i in range(len(reduced_data_umap)):
    if reduced_data_umap["true_labels"][i] == "outlier":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="blue",
                    size=10,
                    opacity=0.7,
                    symbol="circle",
                ),
                name="True labels",
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

    elif reduced_data_umap["true_labels"][i] == "normal":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="white",  # Color for normal labels
                    size=10,
                    opacity=0.4,
                    symbol="circle",
                ),
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

# Plot predicted labels (red 'x's)
for i in range(len(reduced_data_umap)):
    if reduced_data_umap["predicted_labels"][i] == "outlier":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="red",
                    size=8,
                    opacity=0.5,
                    symbol="x",
                ),
                name="Predicted Labels",
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

    elif reduced_data_umap["predicted_labels"][i] == "normal":
        fig_umap.add_trace(
            go.Scatter(
                x=[reduced_data_umap["UMAP-1"][i]],
                y=[reduced_data_umap["UMAP-2"][i]],
                mode="markers",
                marker=dict(
                    color="white",  # Color for normal predicted labels
                    size=8,
                    opacity=0.4,
                    symbol="x",
                ),
                hovertext=f"Index: {reduced_data_umap['index'][i]}<br>Predicted label: {reduced_data_umap['predicted_labels'][i]}<br>True label: {reduced_data_umap['true_labels'][i]}"
            )
        )

# Update layout for the UMAP plot
fig_umap.update_layout(
    title="KNN Test Classification Results using UMAP reduction",
    xaxis_title="UMAP-1",
    yaxis_title="UMAP-2",
    template="plotly_dark",
    showlegend=False,
    hovermode="closest"
)

fig_umap.show()
#fig_umap.write_image("dev/figures/fig_knn_test_predictions_umap.png", width=1100, height= 350, scale = 3)

In [None]:
from sklearn.decomposition import PCA

# PCA for dimensionality reduction
pca_model = PCA(n_components=2)
features_test_reduced_pca = pca_model.fit_transform(features_test_scaled)

covariance = pca_model.get_covariance()
# Prepare the reduced data frame for PCA plot
reduced_data_pca = pd.DataFrame(features_test_reduced_pca, columns=['PCA-1', 'PCA-2'])
reduced_data_pca["predicted_labels"] = classes["label"]
reduced_data_pca["true_labels"] = true_labels["label"]
reduced_data_pca["index"] = true_labels["index"]

# Initialize the PCA plot
fig_pca = go.Figure()

for i in range(len(reduced_data_pca)):
    if reduced_data_pca["true_labels"][i] == "outlier":
        fig_pca.add_trace(
            go.Scatter(
                x=[reduced_data_pca["PCA-1"][i]],
                y=[reduced_data_pca["PCA-2"][i]],
                mode="markers",
                marker=dict(
                    color="blue",
                    size=10,
                    opacity=0.8,
                    symbol="circle",
                ),
                name="True labels",
                hovertext=f"Index: {reduced_data_pca['index'][i]}<br>Predicted label: {reduced_data_pca['predicted_labels'][i]}<br>True label: {reduced_data_pca['true_labels'][i]}"
            )
        )

    if reduced_data_pca["predicted_labels"][i] == "outlier":
        fig_pca.add_trace(
            go.Scatter(
                x=[reduced_data_pca["PCA-1"][i]],
                y=[reduced_data_pca["PCA-2"][i]],
                mode="markers",
                marker=dict(
                    color="red",
                    size=8,
                    opacity=0.5,
                    symbol="x",
                ),
                name="Predicted Labels",
                hovertext=f"Index: {reduced_data_pca['index'][i]}<br>Predicted label: {reduced_data_pca['predicted_labels'][i]}<br>True label: {reduced_data_pca['true_labels'][i]}"
            )
        )
    else:
        fig_pca.add_trace(
            go.Scatter(
                x=[reduced_data_pca["PCA-1"][i]],
                y=[reduced_data_pca["PCA-2"][i]],
                mode="markers",
                marker=dict(
                    color="white",  # White color for true labels with value 0
                    size=10,
                    opacity=0.4,
                    symbol="circle",
                ),
                hovertext=f"Index: {reduced_data_pca['index'][i]}<br>Predicted label: {reduced_data_pca['predicted_labels'][i]}<br>True label: {reduced_data_pca['true_labels'][i]}"
            )
        )

# Update the layout
fig_pca.update_layout(
    title="KNN classification results using PCA reduction",
    xaxis_title="PCA-1",
    yaxis_title="PCA-2",
    template="plotly_dark",
    showlegend=False,
    hovermode="closest"
)

fig_pca.show()
#fig_pca.write_image("dev/figures/fig_knn_test_predictions_pca.png", width=1100, height= 350, scale = 3)

In [None]:
#print("covariance:", covariance)
covariance_matrix = []
for i in range(len(covariance)):
    covariance_matrix.append(pd.DataFrame(covariance[i], columns=[features_test.columns[i]], index =features_test.columns ))

covariance_matrix = pd.concat(covariance_matrix, axis = 1)

fig = go.Figure(data=go.Heatmap(
    z=covariance_matrix.values,
    x=covariance_matrix.columns,
    y=covariance_matrix.index,
    colorscale='Viridis',
    colorbar=dict(title='Covariance'),
    zmin=np.min(covariance_matrix.values),
    zmax=np.max(covariance_matrix.values)
))

fig.update_layout(
    title='PCA Covariance Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    width=700,
    height=700
)

fig.show()