In [1]:
import os
import pandas as pd
# evaluate stability also prepares data for transfer from unsupervised to supervised learning by assigning true class labels
from src.StreamPort.machine_learning.methods import MachineLearningEvaluateModelStabilityNative

# join old train and test data with labels to be passed into knn
old_train_data = pd.read_csv("dev/train_features.csv")
old_train_metadata = pd.read_csv("dev/train_metadata.csv")
old_train_metadata["label"] = "normal"
metadata_columns = old_train_metadata.columns
old_train_data = pd.concat([old_train_metadata, old_train_data], axis = 1)

test_files = []
test_meta = []

path_to_test_records = "dev/test_record.csv"

test_record = pd.read_csv(path_to_test_records) if os.path.exists(path_to_test_records) else None
test_record = test_record.sort_values("date") if test_record is not None else None

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:

        filepath = f"dev/test_{date}_classified_samples.csv"
        result_logs.append(filepath) if os.path.exists(filepath) else print(f"No records for {date}")

        featurepath = f"dev/test_{date}_features.csv"
        test_files.append(featurepath) if os.path.exists(featurepath) else print(f"No records for {date}")
        
        metapath = f"dev/test_{date}_metadata.csv"
        test_meta.append(metapath) if os.path.exists(metapath) else print(f"No records for {date}")
        
else:
    print("Not enough evidence of true inliers! Please run more tests")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

summary = pd.DataFrame()
data = pd.DataFrame()
metadata = pd.DataFrame()

for i in range(len(test_files)):
    feature_df = pd.read_csv(test_files[i])
    data = pd.concat([data, feature_df], ignore_index=True)

    log = pd.read_csv(result_logs[i])
    summary = pd.concat([summary, log], ignore_index=True)

    meta = pd.read_csv(test_meta[i])
    metadata = pd.concat([metadata, meta], ignore_index=True)

new_data = pd.concat([metadata, data], axis = 1)

new_data = new_data.sort_values("index")
new_data.drop_duplicates(subset = "index", inplace=True)
new_data.reset_index(drop=True, inplace=True)

summary = summary.sort_values("index")
summary.reset_index(drop=True, inplace = True)

In [3]:
print(summary)
summary.to_csv("dev/test_summary.csv", index=False)

    index  threshold     score  confidence    class  \
0      47  -0.061153 -0.093758        1.53  outlier   
1      47  -0.061153 -0.093758        1.53  outlier   
2      49  -0.060696 -0.070741        1.17  outlier   
3      50  -0.061153 -0.016271        0.27   normal   
4     117  -0.061153 -0.030526        0.50   normal   
5     117  -0.061153 -0.030526        0.50   normal   
6     117  -0.060696 -0.062700        1.03  outlier   
7     119  -0.061153 -0.064955        1.06  outlier   
8     121  -0.061153 -0.031383        0.51   normal   
9     121  -0.060696 -0.020562        0.34   normal   
10    121  -0.061689 -0.045886        0.74   normal   
11    122  -0.061153 -0.000478        0.01   normal   
12    123  -0.060696 -0.023892        0.39   normal   
13    123  -0.061153 -0.043823        0.72   normal   
14    125  -0.060696 -0.020570        0.34   normal   
15    126  -0.061153 -0.001754        0.03   normal   
16    127  -0.061689 -0.067913        1.10  outlier   
17    128 

In [4]:
model_eval = MachineLearningEvaluateModelStabilityNative(test_records=summary)
true_classes, stability_score = model_eval.run()

print("True classes: ", true_classes)
print("Model stability score: ", stability_score)

if (true_classes["class_true"] == "not_set").any():
    print("Classification Complete")
else:
    print("Some samples are unverified")

Model performance Summary:     index  threshold     score  confidence    class  \
0     47  -0.061153 -0.093758        1.53  outlier   
1     47  -0.061153 -0.093758        1.53  outlier   
2     49  -0.060696 -0.070741        1.17  outlier   
3     50  -0.061153 -0.016271        0.27   normal   
4    117  -0.061153 -0.030526        0.50   normal   

                         date class_true  stability_score  
0  2025-07-29 16-35-41-271047    not set         1.000000  
1  2025-07-29 16-35-46-848140    not set         1.000000  
2  2025-07-29 16-40-22-832458    not set         0.545455  
3  2025-07-29 16-35-41-271047    not set         0.545455  
4  2025-07-29 16-35-41-271047    not set         0.833333  
True classes:        class_true majority_class  confidence_consistency
index                                                  
47       not set        outlier                1.000000
49       not set        outlier                0.090909
50       not set         normal                0

In [None]:
# sorted. Minor edits to make column order the same
true_classes.reset_index(drop=True, inplace=True)
new_data["label"] = true_classes["class_true"]
col = new_data.pop("label")
new_data.insert(14, "label", col)

In [8]:
from src.StreamPort.machine_learning.methods import MachineLearningMethodNearestNeighboursClassifierSklearn
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses
from sklearn.model_selection import train_test_split

old_train_data = old_train_data.reset_index(drop=True)
new_data.reset_index(drop=True, inplace=True)
data = pd.concat([old_train_data, new_data], ignore_index=True)
# drop unclassified samples
data = data[data["label"] != "not_set"]
# shuffle dataframe randomly by rows and keep 100% (frac = 1)
data = data.sample(frac=1).reset_index(drop=True)

metadata = data[metadata_columns]
labels = metadata["label"]
data = data.drop(columns= metadata_columns)

features_train, features_test, metadata_train, metadata_test = train_test_split(data, metadata, test_size=0.3, stratify=metadata)

ana = MachineLearningAnalyses(variables=features_train, metadata=metadata_train)

scl = MachineLearningScaleFeaturesScalerSklearn()
ana = scl.run(ana)

knn = MachineLearningMethodNearestNeighboursClassifierSklearn()
ana = knn.run(ana)

print(ana)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
ana.plot_data()

In [None]:
ana.train()
train_classes = ana.get_training_scores()
print(train_classes)

In [None]:
ana.predict(features_test, metadata_test)
classes = ana.get_prediction_classes()
print(classes)

In [None]:
%store -r test_set test_set_meta

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

for i in range(len(true_classes)):
    current_index = true_classes.index[i]
    this_group_in_summary = summary[summary["index"] == current_index]
    print("this Index: ", current_index, " and group: ", this_group_in_summary)
    if this_group_in_summary.empty:
        continue

    # X axis is repetition count: 1, 2, ..., N
    x_vals = list(range(1, len(this_group_in_summary) + 1))

    # Map 'class' to 0/1
    y1 = this_group_in_summary["class"].map({"normal": 0, "outlier": 1}).tolist()
    y2 = this_group_in_summary["confidence"].tolist()

    # Class trace (left Y axis)
    fig.add_trace(go.Scatter(
        x=x_vals,
        y=y1,
        mode="lines+markers",
        yaxis="y1",
        marker=dict(symbol="circle", color="red"),
        line=dict(dash="dot"),
        hovertext=this_group_in_summary["class"],
        hoverinfo="text"
    ))

    # Confidence trace (right Y axis)
    fig.add_trace(go.Scatter(
        x=x_vals,
        y=y2,
        mode="lines+markers",
        yaxis="y2",
        marker=dict(symbol="square", color="blue"),
        line=dict(dash="solid"),
        hovertemplate="Confidence: %{y}<extra></extra>"
    ))

# Layout setup
fig.update_layout(
    title="True Class (0/1) and Confidence by Test Repetition",
    xaxis=dict(title="Test Repetition Number"),
    yaxis=dict(
        title="Class (0=Normal, 1=Outlier)",
        tickvals=[0, 1],
        ticktext=["Normal", "Outlier"],
        tickfont=dict(color="red")
    ),
    yaxis2=dict(
        title="Confidence Score",
        overlaying="y",
        side="right",
        range=[0.4, 1.6],
        tickfont=dict(color="blue")
    ),
    template="simple_white",
    height=500,
    width=1000,
    legend=dict(
        x=0.5,
        y=1.1,
        xanchor="center",
        yanchor="top",
        orientation="h"
    ),
    showlegend = False
)

fig.show()
