In [None]:
import os
import pandas as pd
# evaluate stability also prepares data for transfer from unsupervised to supervised learning by assigning true class labels
from src.StreamPort.machine_learning.methods import MachineLearningEvaluateModelStabilityNative

# join old train and test data with labels to be passed into knn
old_train_data = pd.read_csv("dev/error_lc_train_features.csv")
old_train_metadata = pd.read_csv("dev/error_lc_train_metadata.csv")
old_train_metadata["label"] = "normal"
metadata_columns = old_train_metadata.columns
old_train_data = pd.concat([old_train_metadata, old_train_data], axis = 1)

test_files = []
test_meta = []

path_to_test_records = "dev/error_lc_test_record.csv"

test_record = pd.read_csv(path_to_test_records) if os.path.exists(path_to_test_records) else None
test_record = test_record.sort_values("date") if test_record is not None else None

if test_record is not None:
    result_logs = []
    for date in test_record["date"]:

        filepath = f"dev/error_lc_test_{date}_classified_samples.csv"
        result_logs.append(filepath) if os.path.exists(filepath) else print(f"No records for {date}")

        featurepath = f"dev/error_lc_test_{date}_features.csv"
        test_files.append(featurepath) if os.path.exists(featurepath) else print(f"No records for {date}")
        
        metapath = f"dev/error_lc_test_{date}_metadata.csv"
        test_meta.append(metapath) if os.path.exists(metapath) else print(f"No records for {date}")
        
else:
    print("Not enough evidence of true inliers! Please run more tests")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

summary = pd.DataFrame()
data = pd.DataFrame()
metadata = pd.DataFrame()

for i in range(len(test_files)):
    feature_df = pd.read_csv(test_files[i])
    data = pd.concat([data, feature_df], ignore_index=True)

    log = pd.read_csv(result_logs[i])
    summary = pd.concat([summary, log], ignore_index=True)

    meta = pd.read_csv(test_meta[i])
    metadata = pd.concat([metadata, meta], ignore_index=True)

new_data = pd.concat([metadata, data], axis = 1)

new_data = new_data.sort_values("index")
new_data.drop_duplicates(subset = "index", inplace=True)
new_data.reset_index(drop=True, inplace=True)

summary = summary.sort_values("index")

In [3]:
model_eval = MachineLearningEvaluateModelStabilityNative(test_records=summary)
true_classes, stability_score = model_eval.run()

print("True classes: ", true_classes)
print("Model stability score: ", stability_score)

if (true_classes["class_true"] == "not_set").any():
    print("Classification Complete")
else:
    print("Some samples are unverified")

Model performance Summary:     index  threshold     score  confidence    class  \
0    582  -0.053762 -0.017555        0.33   normal   
1    582  -0.053762 -0.017555        0.33   normal   
2    582  -0.053762 -0.017555        0.33   normal   
3    587  -0.053762 -0.002876        0.05   normal   
4    588  -0.053762 -0.060119        1.12  outlier   

                         date class_true  stability_score  
0  2025-07-25 15-59-15-127053     normal         1.000000  
1  2025-07-25 15-59-21-099517                    1.000000  
2  2025-07-25 15-59-26-772089                    1.000000  
3  2025-07-25 15-59-26-772089    not set         0.545455  
4  2025-07-25 15-59-21-099517    outlier         1.000000  
True classes:        class_true majority_class  confidence_consistency
index                                                  
582       normal         normal                1.000000
587      not set         normal                0.090909
588      outlier        outlier                1

In [4]:
# sorted
true_classes.reset_index(drop=True, inplace=True)
new_data["label"] = true_classes["class_true"]
col = new_data.pop("label")

new_data.insert(14, "label", col)

print("labeled data: ", new_data)

labeled data:      index                     name  \
0     582         001-D2F-B1-Flush   
1     587    006-D2F-A1-Mix 1 + IS   
2     588    007-D2F-A1-Mix 1 + IS   
3     592         011-D2F-B2-Blank   
4     594    013-D2F-A1-Mix 1 + IS   
5     605  024-D2F-B3-Matrix Blank   
6     606         025-D2F-B2-Blank   
7     607    026-D2F-A1-Mix 1 + IS   
8     608    027-D2F-A1-Mix 1 + IS   
9     610    029-D2F-A1-Mix 1 + IS   
10    611    030-D2F-A1-Mix 1 + IS   

                                                 path  \
0   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
1   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
2   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
3   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
4   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
5   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
6   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
7   C:/Users/Sandeep/Desktop/Error-LC/Method-Data\...   
8   C:/Users/Sa

In [5]:
from src.StreamPort.machine_learning.methods import MachineLearningMethodNearestNeighboursClassifierSklearn
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses
from sklearn.model_selection import train_test_split

old_train_data = old_train_data.reset_index(drop=True)
new_data.reset_index(drop=True, inplace=True)
data = pd.concat([old_train_data, new_data], ignore_index=True)
# drop unclassified samples
data = data[data["label"] != "not_set"]
# shuffle dataframe randomly by rows and keep 100% (frac = 1)
data = data.sample(frac=1).reset_index(drop=True)

metadata = data[metadata_columns]
labels = metadata["label"]
data = data.drop(columns= metadata_columns)

features_train, features_test, metadata_train, metadata_test = train_test_split(data, metadata, test_size=0.3, stratify=metadata)

ana = MachineLearningAnalyses(variables=features_train, metadata=metadata_train)

scl = MachineLearningScaleFeaturesScalerSklearn()
ana = scl.run(ana)

knn = MachineLearningMethodNearestNeighboursClassifierSklearn()
ana = knn.run(ana)

print(ana)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
ana.plot_data()

In [None]:
ana.train()
train_classes = ana.get_training_scores()
print(train_classes)

['outlier' 'outlier' 'outlier' 'outlier' 'outlier' 'outlier' 'outlier'
 'outlier' 'outlier' 'outlier' 'outlier' 'outlier' 'outlier' 'outlier'
 'outlier' 'outlier' 'outlier' 'outlier' 'outlier' 'outlier']


In [None]:
ana.predict(features_test, metadata_test)
classes = ana.get_prediction_classes()
print(classes)

None


In [None]:
%store -r test_set test_set_meta