In [1]:
import pandas as pd

# join old train and test data with labels to be passed into knn
old_train_data = pd.read_csv("dev/train_features.csv")
old_train_metadata = pd.read_csv("dev/train_metadata.csv")
old_train_metadata["label"] = "normal"
metadata_columns = old_train_metadata.columns
old_train_data = pd.concat([old_train_metadata, old_train_data], axis = 1)
print("Normal data: ", old_train_data[metadata_columns])

Normal data:      index             name                                               path  \
0      18  210812_Pac--005  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
1      19  210812_Pac--006  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
2      20  210812_Pac--007  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
3      21  210812_Pac--008  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
4      22  210812_Pac--009  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
5      23  210812_Pac--010  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
6      24  210812_Pac--011  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
7      25  210812_Pac--012  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
8      26  210812_Pac--013  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
9      27  210812_Pac--014  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
10     51  210813_Pac--005  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
11     52  210

In [2]:
import os

test_files = []
test_meta = []

path_to_test_records = "dev/test_record.csv"

test_record = pd.read_csv(path_to_test_records) if os.path.exists(path_to_test_records) else None
test_record = test_record.sort_values("date") if test_record is not None else None

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:

        filepath = f"dev/test_{date}_classified_samples.csv"
        result_logs.append(filepath) if os.path.exists(filepath) else print(f"No records for {date}")

        featurepath = f"dev/test_{date}_features.csv"
        test_files.append(featurepath) if os.path.exists(featurepath) else print(f"No records for {date}")
        
        metapath = f"dev/test_{date}_metadata.csv"
        test_meta.append(metapath) if os.path.exists(metapath) else print(f"No records for {date}")
        
else:
    print("Not enough evidence of true inliers! Please run more tests")

In [3]:

summary = pd.DataFrame()
data = pd.DataFrame()
metadata = pd.DataFrame()

for i in range(len(test_files)):
    feature_df = pd.read_csv(test_files[i])
    data = pd.concat([data, feature_df], ignore_index=True)

    log = pd.read_csv(result_logs[i])
    summary = pd.concat([summary, log], ignore_index=True)

    meta = pd.read_csv(test_meta[i])
    metadata = pd.concat([metadata, meta], ignore_index=True)

new_data = pd.concat([metadata, data], axis = 1)

new_data = new_data.sort_values("index")
new_data.drop_duplicates(subset = "index", inplace=True)
new_data.reset_index(drop=True, inplace=True)

summary = summary.sort_values("index")
summary.reset_index(drop=True, inplace = True)

In [4]:
print(summary)
summary.to_csv("dev/test_summary.csv", index=False)

    index  threshold     score  confidence    class  \
0      14  -0.080903 -0.111895        1.38  outlier   
1      47  -0.061153 -0.093758        1.53  outlier   
2      47  -0.061153 -0.093758        1.53  outlier   
3      49  -0.060696 -0.070741        1.17  outlier   
4      50  -0.061153 -0.016271        0.27   normal   
..    ...        ...       ...         ...      ...   
65    272  -0.061153 -0.038800        0.63   normal   
66    272  -0.061153 -0.038800        0.63   normal   
67    275  -0.061153 -0.037287        0.61   normal   
68    275  -0.061689 -0.061361        0.99   normal   
69    278  -0.061153 -0.051743        0.85   normal   

                          date  
0   2025-07-30 10-59-59-838221  
1   2025-07-29 16-35-41-271047  
2   2025-07-29 16-35-46-848140  
3   2025-07-29 16-40-22-832458  
4   2025-07-29 16-35-41-271047  
..                         ...  
65  2025-07-29 16-35-35-763234  
66  2025-07-29 16-35-46-848140  
67  2025-07-29 16-35-35-763234  
68  2025-

In [5]:
# evaluate stability also prepares data for transfer from unsupervised to supervised learning by assigning true class labels
from src.StreamPort.machine_learning.methods import MachineLearningEvaluateModelStabilityNative
model_eval = MachineLearningEvaluateModelStabilityNative(test_records=summary)
true_classes, stability_score = model_eval.run()

print("True classes: ", true_classes)
print("Model stability score: ", stability_score)

if (true_classes["class_true"] == "not_set").any():
    print("Classification Complete")
else:
    print("Some samples are unverified. Setting true_classes to majority class")
    mask = true_classes["class_true"] == "not set"
    true_classes.loc[mask, "class_true"] = true_classes.loc[mask, "majority_class"]

  from .autonotebook import tqdm as notebook_tqdm


Model performance Summary:     index  threshold     score  confidence    class  \
0     14  -0.080903 -0.111895        1.38  outlier   
1     47  -0.061153 -0.093758        1.53  outlier   
2     47  -0.061153 -0.093758        1.53  outlier   
3     49  -0.060696 -0.070741        1.17  outlier   
4     50  -0.061153 -0.016271        0.27   normal   

                         date class_true  stability_score  
0  2025-07-30 10-59-59-838221    not set         0.545455  
1  2025-07-29 16-35-41-271047    not set         1.000000  
2  2025-07-29 16-35-46-848140    not set         1.000000  
3  2025-07-29 16-40-22-832458    not set         0.545455  
4  2025-07-29 16-35-41-271047    not set         0.545455  
True classes:        class_true majority_class  confidence_consistency
index                                                  
14       not set        outlier                0.090909
47       not set        outlier                1.000000
49       not set        outlier                0

In [6]:
# sorted. Minor edits to ensure identical column order 
true_classes.reset_index(drop=True, inplace=True)
new_data["label"] = true_classes["class_true"]
col = new_data.pop("label")
new_data.insert(14, "label", col)
print("New data: ", new_data[metadata_columns])

New data:      index             name                                               path  \
0      14       210812_Pac  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
1      47       210813_Pac  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
2      49  210813_Pac--003  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
3      50  210813_Pac--004  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
4     116       210819_Pac  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
5     117  210819_Pac--002  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
6     118  210819_Pac--003  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
7     119  210819_Pac--004  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
8     120  210819_Pac--005  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
9     121  210819_Pac--006  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
10    122  210819_Pac--007  C:/Users/Sandeep/Desktop/ExtractedSignals\2108...   
11    123  210819

In [7]:
old_train_data = old_train_data.reset_index(drop=True)
new_data.reset_index(drop=True, inplace=True)

data = pd.concat([old_train_data, new_data], ignore_index=True)
print("KNN Training Data: ", data.shape)

# drop unclassified samples
data = data[data["label"] != "not set"]

# shuffle dataframe randomly by rows and keep 100% (frac = 1)
data = data.sample(frac=1).reset_index(drop=True)
print("Shuffled: ", data.shape)

metadata = data[metadata_columns]
labels = metadata["label"]
data = data.drop(columns= metadata_columns)


KNN Training Data:  (67, 32)
Shuffled:  (67, 32)


In [8]:
# create random splits of training and test sets. Shuffling and train-test-splitting ensure a good distribution of classes
from sklearn.model_selection import train_test_split
features_train, features_test, metadata_train, metadata_test = train_test_split(data, metadata, test_size=0.3, stratify=metadata["label"])

In [9]:
# automate test parameter comparison using GridSearchCV
from src.StreamPort.machine_learning.methods import MachineLearningAutomateTestParametersGridSearchSklearn
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsClassifier

iso = IsolationForest
nbr = KNeighborsClassifier
grid = MachineLearningAutomateTestParametersGridSearchSklearn(model=nbr, train_data=features_train, train_metadata=metadata_train)

grid = grid.run()

scaler = grid[0]
scaled_test = scaler.transform(features_test)
features_test = pd.DataFrame(
    scaled_test,
    columns= features_test.columns,
    index = features_test.index
)

search = grid[1]
print("Best params: ", search.best_params_)

best_knn = search.best_estimator_
test_accuracy = best_knn.score(features_test, metadata_test["label"])

print("Test accuracy:", test_accuracy)

is_classifier




TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

In [None]:

from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses
ana = MachineLearningAnalyses(variables=features_train, metadata=metadata_train)

from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn
scl = MachineLearningScaleFeaturesScalerSklearn(scaler_type="StandardScaler")
ana = scl.run(ana)

from src.StreamPort.machine_learning.methods import MachineLearningMethodNearestNeighboursClassifierSklearn
knn = MachineLearningMethodNearestNeighboursClassifierSklearn(n_neighbors=1) #odd number for binary classification. 1 usually causes overfit, could be suitable for small amount of data
ana = knn.run(ana)

print(ana)

In [None]:
ana.plot_data()

In [None]:
ana.train()
train_classes = ana.get_training_scores()
print(train_classes)

In [None]:
ana.predict(features_test, metadata_test)
classes = ana.get_prediction_classes()
print(classes)

In [None]:
probs = ana.get_prediction_probabilities()
print(probs)

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

for i in range(len(true_classes)):
    current_index = true_classes.index[i]
    this_group_in_summary = summary[summary["index"] == current_index]
    print("this Index: ", current_index, " and group: ", this_group_in_summary)
    if this_group_in_summary.empty:
        continue

    # X axis is repetition count: 1, 2, ..., N
    x_vals = list(range(1, len(this_group_in_summary) + 1))

    # Map 'class' to 0/1
    y1 = this_group_in_summary["class"].map({"normal": 0, "outlier": 1}).tolist()
    y2 = this_group_in_summary["confidence"].tolist()

    # Class trace (left Y axis)
    fig.add_trace(go.Scatter(
        x=x_vals,
        y=y1,
        mode="lines+markers",
        yaxis="y1",
        marker=dict(symbol="circle", color="red"),
        line=dict(dash="dot"),
        hovertext=this_group_in_summary["class"],
        hoverinfo="text"
    ))

    # Confidence trace (right Y axis)
    fig.add_trace(go.Scatter(
        x=x_vals,
        y=y2,
        mode="lines+markers",
        yaxis="y2",
        marker=dict(symbol="square", color="blue"),
        line=dict(dash="solid"),
        hovertemplate="Confidence: %{y}<extra></extra>"
    ))

# Layout setup
fig.update_layout(
    title="True Class (0/1) and Confidence by Test Repetition",
    xaxis=dict(title="Test Repetition Number"),
    yaxis=dict(
        title="Class (0=Normal, 1=Outlier)",
        tickvals=[0, 1],
        ticktext=["Normal", "Outlier"],
        tickfont=dict(color="red")
    ),
    yaxis2=dict(
        title="Confidence Score",
        overlaying="y",
        side="right",
        range=[0.4, 1.6],
        tickfont=dict(color="blue")
    ),
    template="simple_white",
    height=500,
    width=1000,
    legend=dict(
        x=0.5,
        y=1.1,
        xanchor="center",
        yanchor="top",
        orientation="h"
    ),
    showlegend = False
)

fig.show()
