In [None]:
import os
import pandas as pd
from src.StreamPort.machine_learning.methods import MachineLearningEvaluateModelStabilityNative

# Minimum-evidence rule: more than rest_indices // test_indices recorded test
# runs are required before the per-run classification logs are aggregated.
rest_indices = 20
test_indices = 4

path_to_test_records = "dev/error_lc_test_record.csv"

# Always defined, so the aggregation below also works on a fresh kernel when
# there is no (or not enough) recorded test data.
result_logs = []

test_record = pd.read_csv(path_to_test_records) if os.path.exists(path_to_test_records) else None
if test_record is not None:
    test_record = test_record.sort_values("date")

if test_record is not None and len(test_record) > rest_indices // test_indices:
    for date in test_record["date"]:
        log_path = f"dev/error_lc_test_{date}_classified_samples.csv"
        if os.path.exists(log_path):
            result_logs.append(log_path)
        else:
            print(f"No records for {date}")
else:
    print("Not enough evidence of true inliers! Please run more tests for more data")

# Read every available log once and concatenate in a single call instead of
# growing the frame inside a loop. With no logs, `summary` stays an empty frame
# (sorting an empty frame by "index" would raise KeyError).
if result_logs:
    summary = pd.concat([pd.read_csv(log_path) for log_path in result_logs], ignore_index=True)
    summary = summary.sort_values("index")
else:
    summary = pd.DataFrame()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run the project's stability evaluation on the aggregated classification logs.
# run() yields two values, unpacked here as (true_classes, stability_score).
model_eval = MachineLearningEvaluateModelStabilityNative(test_records=summary)
evaluation = model_eval.run()
true_classes, stability_score = evaluation

print("True classes: ", true_classes)
print("Model stability score: ", stability_score)

Model performance Summary:      index  threshold     score  confidence    class  \
0     158  -0.053316 -0.097339        1.83  outlier   
1     158  -0.066785 -0.078174        1.17  outlier   
2     158  -0.066785 -0.078174        1.17  outlier   
3     158  -0.066785 -0.078174        1.17  outlier   
4     158  -0.046520 -0.078519        1.69  outlier   
..    ...        ...       ...         ...      ...   
68    190  -0.056991 -0.043293        0.76   normal   
69    190  -0.053316 -0.054654        1.03  outlier   
70    190  -0.066785 -0.040508        0.61   normal   
71    190  -0.066785 -0.040508        0.61   normal   
72    190  -0.046520 -0.043293        0.93   normal   

                          date class_true  stability_score  
0   2025-07-23 12-34-26-872389    outlier         0.905672  
1   2025-07-23 14-49-28-169722                    0.905672  
2   2025-07-23 14-49-20-912694                    0.905672  
3   2025-07-23 15-36-48-666559                    0.905672  
4   20

In [None]:
# Classification is complete only when no sample is still marked as unset.
# Both spellings of the sentinel are accepted: this notebook's own
# get_model_stability() emits "not set", while this check originally looked for
# "not_set" (NOTE(review): confirm which spelling the library class emits).
unverified = true_classes["class_true"].isin(["not_set", "not set"])
if unverified.any():
    print("Some samples are unverified")
else:
    print("Classification Complete")

In [None]:

# Models have internal stochastic variability due to the way in which they
# classify/regress. Compare detection results across tests with default
# parameters to find stability in performance.
def get_model_stability(confidence_buffer : float = 0.1, times_classified : int = 2, test_records = None):
    """Score how consistently repeated test runs classify each sample.

    Parameters
    ----------
    confidence_buffer : float
        Half-width of the ambiguous zone around confidence == 1 (the threshold
        value). An "outlier" row only counts as a vote when its confidence is
        > 1 + confidence_buffer; a "normal" row only when its confidence is
        < 1 - confidence_buffer.
    times_classified : int
        A sample gets a true class only when the winning class has strictly
        more than this many votes (and more votes than the other class).
    test_records : pandas.DataFrame, optional
        One row per (sample, test run) with at least the columns "index",
        "class" and "confidence". When omitted, the notebook-level `summary`
        frame is used. The input frame is never modified.

    Returns
    -------
    tuple or None
        (records, overall_score): `records` is a copy of the input with a
        "class_true" column ("outlier", "normal" or "not set" on the first row
        of each sample, "" on repeated rows) and a per-sample
        "stability_score" column; `overall_score` is the mean per-sample
        stability score. None is returned when there are no records.

    Raises
    ------
    KeyError
        If a required column is missing from the records.
    """
    # Read the notebook-level frame without ever assigning to the name
    # `summary` here: assigning it would make it a local variable and break the
    # first read with UnboundLocalError.
    records = globals().get("summary") if test_records is None else test_records
    if records is None or len(records) == 0:
        return None

    missing_columns = {"index", "class", "confidence"} - set(records.columns)
    if missing_columns:
        raise KeyError(f"test records are missing required columns: {sorted(missing_columns)}")

    # Work on a copy; dropping earlier result columns keeps the function
    # idempotent when its own output is passed back in.
    records = records.drop(columns=["class_true", "stability_score"], errors="ignore").reset_index(drop=True)

    # If non-deterministic model, data should only be trusted once it has been
    # classified the same way often enough, outside the ambiguous buffer zone
    # (confidence 1 == threshold value; anomaly > 1, normal < 1).
    votes = pd.DataFrame(
        {
            "index": records["index"],
            "outlier": (records["class"] == "outlier") & (records["confidence"] > 1 + confidence_buffer),
            "normal": (records["class"] == "normal") & (records["confidence"] < 1 - confidence_buffer),
        }
    ).groupby("index")[["outlier", "normal"]].sum()

    class_true = pd.Series("not set", index=votes.index, name="class_true")
    class_true = class_true.mask(
        (votes["outlier"] > times_classified) & (votes["outlier"] > votes["normal"]), "outlier"
    )
    class_true = class_true.mask(
        (votes["normal"] > times_classified) & (votes["normal"] > votes["outlier"]), "normal"
    )
    class_results = class_true.reset_index()

    # Attach the true class to the first row of every sample only.
    annotated = records.merge(class_results, on="index", how="left")
    annotated.loc[annotated.duplicated(subset="index"), "class_true"] = ""

    # Agreement between results per test index: share of runs that produced the
    # most frequent label (1.0 means always the same label).
    label_counts = records.groupby(["index", "class"]).size().unstack(fill_value=0)
    agreement_ratio = label_counts.max(axis=1) / label_counts.sum(axis=1)

    # Majority class per index; ties resolve to the first column.
    majority_class = label_counts.idxmax(axis=1)

    # Coefficient of variation (std / mean) of the confidence of the
    # majority-class rows per index. An undefined value (single observation,
    # zero mean) is replaced with a large number to simulate instability.
    majority_rows = records[records["class"] == records["index"].map(majority_class)]
    confidence_stats = (
        majority_rows.groupby("index")["confidence"].agg(["mean", "std"]).reindex(label_counts.index)
    )
    mean_confidence = confidence_stats["mean"]
    confidence_variation = (confidence_stats["std"] / mean_confidence.where(mean_confidence != 0)).fillna(10)

    # consistency = 1 / (1 + cv): ~1 means stable (low variation), ~0 unstable.
    confidence_consistency = 1 / (1 + confidence_variation)

    # Combined stability score (average of agreement and confidence consistency).
    combined_stability = (agreement_ratio + confidence_consistency) / 2

    true_classes = pd.concat(
        [
            class_results.set_index("index"),
            majority_class.rename("majority_class"),
            confidence_consistency.rename("classification_consistency"),
        ],
        axis=1,
    )
    true_classes["confidence"] = true_classes["classification_consistency"].apply(
        lambda x: "high" if x >= 0.8 else "mid" if x > 0.65 else "low"
    )
    print("True classes: ", true_classes)

    stability_df = combined_stability.reset_index(name="stability_score")
    annotated = annotated.merge(stability_df, on="index", how="left")

    return (annotated, combined_stability.mean())

# Usage: compute the stability summary and report it, or say that nothing is
# available yet when the function returned no result.
summary_with_stability = get_model_stability()
if summary_with_stability is None:
    print("No results yet.")
else:
    print("Stability summary: ", summary_with_stability[0].head(), "\nModel stability: ", summary_with_stability[1])

