In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import os
import io
import re
from scipy.io.wavfile import read
from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.cm as cmx
from datetime import datetime, timezone
import datetime
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from pathlib import Path
from sklearn.cluster import KMeans
import copy
from scipy.stats import mode


In [None]:
def load_waveform(filepath):
    """Load one waveform export file into a DataFrame.

    The file must contain a JSON object, optionally surrounded by other text:
    everything from the first ``{`` to the last ``}`` is parsed as JSON and its
    ``"data"`` entry supplies the rows.

    Parameters
    ----------
    filepath : str or path-like
        File to read.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of ``"data"`` with the columns ``peak_amplitude``,
        ``integral``, ``phase_angle``, ``cycle_number``, ``rise_time`` and
        ``pulse_width``. ``None`` when ``"data"`` is missing or empty.

    Raises
    ------
    json.JSONDecodeError
        If the text between the first ``{`` and the last ``}`` is not valid JSON
        (this includes files that contain no braces at all).
    """
    with open(filepath) as dataFile:
        data = dataFile.read()

    # Repair missing commas: a brace-free {...} object that contains a quoted
    # string gets a comma appended whenever the next brace after it is another '{'.
    # NOTE(review): an object that is already followed by a comma also matches and
    # receives a second one - confirm the input files never contain that layout.
    pattern = r'(\{[^{}]*"[^"]*"[^{}]*\})(?=[^{}]*\{)'
    data = re.sub(pattern, r'\1,', data)

    # Drop any wrapper text around the outermost JSON object before parsing
    obj = data[data.find('{') : data.rfind('}')+1]
    jsonObj = json.loads(obj)

    # .get() so that a file without a "data" entry is treated like an empty one
    rows = jsonObj.get('data')
    if not rows:
        return None
    return pd.DataFrame(rows, columns=["peak_amplitude","integral","phase_angle","cycle_number","rise_time","pulse_width"])


In [None]:
def load_dataframes(dirpath):
    """Load every ``.js`` waveform file in a folder.

    Parameters
    ----------
    dirpath : str or path-like
        Folder to scan (not recursive).

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per file that contains data, in filename order. Each
        dataframe carries the name of its source file in a ``filename`` column.
        Files without data are skipped.
    """
    dataframes = []
    # Sorted so the result order is reproducible; os.listdir() order is arbitrary
    for filename in sorted(os.listdir(dirpath)):
        if not filename.endswith(".js"):
            continue
        waveform = load_waveform(os.path.join(dirpath, filename))
        # load_waveform returns None (not an empty frame) when a file has no data
        if waveform is None or waveform.empty:
            continue
        waveform['filename'] = filename # Add filename as a column
        dataframes.append(waveform)
    return dataframes

In [None]:
# Training folders: positive, negative and unclassified example files
dirpath1 = "ptest"
dirpath2 = "ntest"
dirpath3 = "utest"
# One list of per-file dataframes per folder
pdf, ndf, udf = (load_dataframes(folder) for folder in (dirpath1, dirpath2, dirpath3))

In [None]:
# Map every filename to its class label: 1 is positive, 2 is negative, 0 is unclassified.
# The mapping is used later to attach the dependent variable to the per-file rows
# of the transformed dataframe.
dependent_vars = {}
for frames, y in zip([pdf, ndf, udf], [1, 2, 0]):
    # Walk the frames directly: pd.concat() raises on an empty list, which would
    # abort this cell whenever one of the folders yields no usable file
    for frame in frames:
        for filename in frame['filename'].unique():
            dependent_vars[filename] = y

In [None]:
# Merge the three per-class lists into a single list of per-file dataframes
# (the class of each file stays available through dependent_vars)
dataframes = [*pdf, *ndf, *udf]

In [None]:
# Sanity check: one (filename, shape) pair per loaded file rather than a dump of every dataframe
[(frame['filename'].iloc[0], frame.shape) for frame in dataframes]

In [None]:
# Raw scatter plot (peak amplitude against phase angle) for every loaded file
for dataframe in dataframes:
    fig, ax = plt.subplots()
    ax.scatter(dataframe['phase_angle'], dataframe['peak_amplitude'], s=8)
    ax.set(title=dataframe['filename'].iloc[0], xlabel='Phase angle', ylabel='Peak amplitude')
    plt.show()

In [None]:
def determine_baseline(bucket_data, kmeans, predicted_clusters, threshold=5):
    """Choose the baseline amplitude of one bucket from its k-means result.

    Parameters
    ----------
    bucket_data : pandas.DataFrame
        Rows of the bucket; only the ``peak_amplitude`` column is read.
    kmeans : object
        Fitted clustering model; its ``cluster_centers_`` rows 0 and 1 are compared.
    predicted_clusters : array-like
        Cluster index of every row of ``bucket_data``, in the same order.
    threshold : number, default 5
        Centroid distance above which the two clusters count as far apart.

    Returns
    -------
    number
        Far-apart clusters: the largest ``peak_amplitude`` in the cluster with the
        lower centroid. Otherwise: the value of the higher centroid.
    """
    centers = kmeans.cluster_centers_
    gap = abs(centers[1] - centers[0])

    if not gap > threshold:
        # Clusters close together - Outcome 2: take the higher centroid itself
        return centers[np.argmax(centers)][0]

    # Clusters far apart - Outcome 1: top of the lower cluster
    in_lower_cluster = predicted_clusters == np.argmin(centers)
    return max(bucket_data[in_lower_cluster]['peak_amplitude'])

In [None]:
def denoise_dataframes(dataframes):
    """Flag, in every dataframe, the points that lie at or above its noise baseline.

    For each dataframe the phase angles are cut into 10-degree buckets, every
    bucket with at least two points is split into two k-means clusters on
    ``peak_amplitude``, and ``determine_baseline`` turns each split into a bucket
    baseline. The most frequent bucket baseline becomes the file's baseline.

    Each dataframe is modified in place and gains three columns:

    * ``bucket``   - index of the phase-angle bucket (NaN outside 0..360)
    * ``cluster``  - k-means cluster index within the bucket (NaN if not clustered)
    * ``filtered`` - True where ``peak_amplitude`` >= the file's baseline

    Parameters
    ----------
    dataframes : list of pandas.DataFrame
        Frames with ``phase_angle`` and ``peak_amplitude`` columns.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    # NOTE: this silences warnings for the rest of the session, not only for this
    # call. It is kept because later cells may depend on the suppression.
    warnings.filterwarnings('ignore')

    bins = range(0, 361, 10)
    for df in dataframes:
        # include_lowest=True puts a phase angle of exactly 0 into the first
        # bucket; without it that value falls outside every bin.
        df['bucket'] = pd.cut(df['phase_angle'], bins=bins, labels=False, include_lowest=True)
        df['cluster'] = np.nan

        baselines = []
        for i in range(len(bins) - 1):
            mask = df['bucket'] == i
            bucket_data = df[mask]
            if len(bucket_data) < 2:
                continue  # two clusters need at least two points

            amplitudes = bucket_data['peak_amplitude'].values.reshape(-1, 1)
            kmeans = KMeans(n_clusters=2, random_state=0, n_init=10)
            kmeans.fit(amplitudes)
            predicted_clusters = kmeans.predict(amplitudes)

            df.loc[mask, 'cluster'] = predicted_clusters
            baselines.append(determine_baseline(bucket_data, kmeans, predicted_clusters))

        if baselines:
            # scipy.stats.mode returns an array or a scalar depending on the SciPy
            # version; atleast_1d makes the lookup work for both.
            basemode = np.atleast_1d(mode(baselines).mode)[0]
            df['filtered'] = df['peak_amplitude'] >= basemode
        else:
            # No bucket could be clustered, so no baseline exists: keep every point
            df['filtered'] = True

    return dataframes

In [None]:
# Add the 'bucket', 'cluster' and 'filtered' columns to every dataframe (the frames are modified in place)
dataframes = denoise_dataframes(dataframes)

In [None]:
# Spot check: preview one de-noised dataframe with its 'bucket', 'cluster' and 'filtered' columns
dataframes[2].head()

In [None]:
# Remove all data points below the baseline (the noise). .copy() makes every result an
# independent dataframe, so later cells can add columns to it without touching `dataframes`.
filtered_dataframes = [df[df['filtered'] == True].copy() for df in dataframes]
# Spot check: 'filtered' must be True for every remaining row
filtered_dataframes[1].head()

In [None]:
# Scatter plot of every file after the below-baseline points have been removed
for dataframe in filtered_dataframes:
    if dataframe.empty:
        continue  # nothing survived the filter, so there is no filename or point to draw
    fig, ax = plt.subplots()
    ax.scatter(dataframe['phase_angle'], dataframe['peak_amplitude'], s=8)
    ax.set_title(dataframe['filename'].iloc[0])
    ax.set_xlabel('Phase angle')
    ax.set_ylabel('Peak amplitude')
    plt.show()
    plt.close(fig)  # release the figure so a long file list does not pile up open figures


In [None]:
# Cluster the remaining points of every file with DBSCAN and annotate each row with
# its cluster label and the bounding-box features of that cluster
feature_columns = ['cluster_length', 'cluster_height', 'cluster_gradient_tr', 'cluster_gradient_tl']
for df in filtered_dataframes:
    # Points are clustered in the (phase angle, peak amplitude) plane
    X = df[['phase_angle', 'peak_amplitude']].values
    dbscan = DBSCAN(eps=7, min_samples=8)
    labels = dbscan.fit(X).labels_

    # Build the new columns on a copy first, then write them back to df
    df_copy = df.copy()
    for column in feature_columns:
        df_copy[column] = np.nan
    df_copy['cluster'] = labels

    for cluster in set(labels):
        if cluster == -1:
            continue  # -1 is DBSCAN noise: those rows keep NaN features

        members = labels == cluster
        phase = X[members, 0]
        amplitude = X[members, 1]

        # Extent of the cluster along each axis and the slopes of its two diagonals
        cluster_length = phase.max() - phase.min()
        cluster_height = amplitude.max() - amplitude.min()
        cluster_gradient_tr = (amplitude.max() - amplitude.min()) / (phase.max() - phase.min())
        cluster_gradient_tl = (amplitude.max() - amplitude.min()) / (phase.min() - phase.max())

        cluster_values = (cluster_length, cluster_height, cluster_gradient_tr, cluster_gradient_tl)
        for column, value in zip(feature_columns, cluster_values):
            df_copy.loc[members, column] = value

    # Copy the label and feature columns onto the dataframe held in the list
    for column in ['cluster'] + feature_columns:
        df.loc[:, column] = df_copy[column]
    
    # # Plot the clustered data
    # plt.scatter(df['phase_angle'], df['peak_amplitude'], c=df['cluster'], cmap='viridis', s=8)
    # plt.title(df['filename'].iloc[0])
    # plt.xlabel('Phase angle')
    # plt.ylabel('Peak amplitude')
    # plt.show()


In [None]:
# def plot_cluster_bounds(cluster_points):
#     x_min, x_max = cluster_points[:, 0].min(), cluster_points[:, 0].max()
#     y_min, y_max = cluster_points[:, 1].min(), cluster_points[:, 1].max()
#     plt.plot([x_min, x_max], [y_min, y_min], 'k-', linewidth=2)
#     plt.plot([x_max, x_max], [y_min, y_max], 'k-', linewidth=2)
#     plt.plot([x_max, x_min], [y_max, y_max], 'k-', linewidth=2)
#     plt.plot([x_min, x_min], [y_max, y_min], 'k-', linewidth=2)


# for df in filtered_dataframes:
#     # Extract the relevant columns
#     X = df[['phase_angle', 'peak_amplitude']].values
    
#     # Apply DBSCAN to the data
#     dbscan = DBSCAN(eps=10, min_samples=10)
#     dbscan.fit(X)
#     labels = dbscan.labels_
    
#     # Add the cluster labels to the dataframe
#     df['cluster'] = labels
    
#     # Create new columns for each cluster label
#     dummies = pd.get_dummies(df['cluster'], prefix='cluster')
#     df = pd.concat([df, dummies], axis=1)
    
#     # Access cluster information
#     clusters = set(labels)
#     for cluster in clusters:
#         if cluster != -1:
#             # Get the points belonging to the cluster
#             cluster_points = X[labels == cluster]
            
#             # Calculate cluster features
#             cluster_length = cluster_points[:, 0].max() - cluster_points[:, 0].min()
#             cluster_height = cluster_points[:, 1].max() - cluster_points[:, 1].min()
#             cluster_gradient_tr = (cluster_points[:, 1].max() - cluster_points[:, 1].min()) / (cluster_points[:, 0].max() - cluster_points[:, 0].min())
#             cluster_gradient_tl = (cluster_points[:, 1].max() - cluster_points[:, 1].min()) / (cluster_points[:, 0].min() - cluster_points[:, 0].max())
            
#             # Print cluster information
#             print(f"Cluster {cluster}: Length={cluster_length}, Height={cluster_height}, GradientTR={cluster_gradient_tr}, GradientTL={cluster_gradient_tl}")
            
#             # Plot the boundaries of the cluster
#             plot_cluster_bounds(cluster_points)
    
#     # Plot the clustered data
#     plt.scatter(df['phase_angle'], df['peak_amplitude'], c=df['cluster'], cmap='viridis', s=8)
#     plt.title(df['filename'].iloc[0])
#     plt.xlabel('Phase angle')
#     plt.ylabel('Peak amplitude')
#     plt.show()


In [None]:
# Preview one clustered file. Integers in the 'cluster' column name the cluster a point
# belongs to; -1 marks an outlier that is not part of any cluster.
filtered_dataframes[0].head()

In [None]:
# Stack the per-file filtered dataframes into one long dataframe (rows get a fresh 0..n-1 index)
big_df = pd.concat(filtered_dataframes, ignore_index=True)

In [None]:
big_df.isna().sum() # number of missing values in each column

In [None]:
big_df = big_df.fillna(0)  # replace missing values with 0
big_df.isna().sum()  # check again: every count should now be zero


In [None]:
# # Calculate additional clustering features based on the cluster labels
# clusters = []
# for filename, file_df in big_df.groupby('filename'):
#     # Assume the cluster labels are stored in the 'cluster' column
#     labels = file_df['cluster'].values
#     unique_labels = np.unique(labels)
#     cluster_count = len(unique_labels)
#     if cluster_count > 0:
#         # Calculate additional clustering features based on the cluster labels
#         # For example, cluster_length, cluster_height, etc.
#         # Append the calculated features to the clusters list
#         cluster_lengths = []
#         cluster_heights = []
#         cluster_gradient_trs = []
#         cluster_gradient_tls = []
#         for label in unique_labels:
#             cluster_points = file_df[file_df['cluster'] == label][['phase_angle', 'peak_amplitude']].values
#             if len(cluster_points) > 1:
#                 cluster_length = cluster_points[:, 0].max() - cluster_points[:, 0].min()
#                 cluster_height = cluster_points[:, 1].max() - cluster_points[:, 1].min()
#                 cluster_gradient_tr = (cluster_points[:, 1].max() - cluster_points[:, 1].min()) / (cluster_points[:, 0].max() - cluster_points[:, 0].min())
#                 cluster_gradient_tl = (cluster_points[:, 1].max() - cluster_points[:, 1].min()) / (cluster_points[:, 0].min() - cluster_points[:, 0].max())
#                 cluster_lengths.append(cluster_length)
#                 cluster_heights.append(cluster_height)
#                 cluster_gradient_trs.append(cluster_gradient_tr)
#                 cluster_gradient_tls.append(cluster_gradient_tl)
#         clusters.append({'filename': filename, 'cluster_count': cluster_count, 'cluster_lengths': cluster_lengths, 'cluster_heights': cluster_heights, 'cluster_gradient_trs': cluster_gradient_trs, 'cluster_gradient_tls': cluster_gradient_tls})

# # Convert the clusters list to a dataframe
# clusters_df = pd.DataFrame(clusters)

# final_df = clusters_df

# final_df['anomaly'] = final_df['filename'].apply(lambda x: dependent_vars.get(x, 0)) # call dictionary with filename keys mapped to dependent variables respective to each file

# print(final_df)


In [None]:
# Build one summary row per file: the number of distinct cluster labels plus the
# bounding-box features of the first four labels.
clusters = []
for filename, file_df in big_df.groupby('filename'):
    # NOTE(review): np.unique also returns -1 (DBSCAN noise) when it is present, so noise
    # is counted and, being the smallest label, fills the first slot - confirm this is intended.
    unique_labels = np.unique(file_df['cluster'].values)
    cluster_count = len(unique_labels)

    cluster_lengths = []
    cluster_heights = []
    cluster_gradient_trs = []
    cluster_gradient_tls = []
    for i in range(4):
        # Every slot contributes exactly one value per feature. Zeros stand in for a
        # missing or degenerate cluster, so slot i means the same thing in every file.
        cluster_length = cluster_height = 0
        cluster_gradient_tr = cluster_gradient_tl = 0
        if i < cluster_count:
            label = unique_labels[i]
            cluster_points = file_df[file_df['cluster'] == label][['phase_angle', 'peak_amplitude']].values
            if len(cluster_points) > 1:
                cluster_length = cluster_points[:, 0].max() - cluster_points[:, 0].min()
                cluster_height = cluster_points[:, 1].max() - cluster_points[:, 1].min()
                # A cluster with zero width has no finite slope; keep 0 instead of inf/nan,
                # which the classifier cannot handle.
                if cluster_length != 0:
                    cluster_gradient_tr = cluster_height / cluster_length
                    cluster_gradient_tl = cluster_height / -cluster_length
        cluster_lengths.append(cluster_length)
        cluster_heights.append(cluster_height)
        cluster_gradient_trs.append(cluster_gradient_tr)
        cluster_gradient_tls.append(cluster_gradient_tl)
    clusters.append({'filename': filename, 'cluster_count': cluster_count, 'cluster_lengths': cluster_lengths, 'cluster_heights': cluster_heights, 'cluster_gradient_trs': cluster_gradient_trs, 'cluster_gradient_tls': cluster_gradient_tls})

# Convert the clusters list to a dataframe (explicit columns keep an empty result well-formed)
clusters_df = pd.DataFrame(clusters, columns=['filename', 'cluster_count', 'cluster_lengths', 'cluster_heights', 'cluster_gradient_trs', 'cluster_gradient_tls'])

final_df = clusters_df

# Attach the dependent variable recorded for each filename (0 when the file is unknown)
final_df['anomaly'] = final_df['filename'].apply(lambda x: dependent_vars.get(x, 0))

final_df


In [None]:
# Save the per-file summary (its feature columns still hold lists at this point)
final_df.to_csv('test1.csv', index=False)

In [None]:
df = final_df
# Spread every list-valued feature column over numbered columns
# (cluster_length_0, cluster_length_1, ...) and keep the remaining columns in front
list_column_prefixes = {
    'cluster_lengths': 'cluster_length_',
    'cluster_heights': 'cluster_height_',
    'cluster_gradient_trs': 'cluster_gradient_tr_',
    'cluster_gradient_tls': 'cluster_gradient_tl_',
}
pieces = [df.drop(list(list_column_prefixes), axis=1)]
pieces += [
    df[column].apply(pd.Series).add_prefix(prefix)
    for column, prefix in list_column_prefixes.items()
]
expanded_df = pd.concat(pieces, axis=1)

# Slots that a file did not fill become 0
expanded_df = expanded_df.fillna(0)

# be sure to include this in your model training code so that the filename and anomalies are not calculated as features!
# Define the features and target variables:
# features = expanded_df.columns.tolist()
# features.remove('filename')
# features.remove('anomaly')
# target = 'anomaly'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Model inputs are all columns except the file identifier and the label
target = 'anomaly'
features = expanded_df.columns.tolist()
for excluded in ('filename', target):
    features.remove(excluded)

# Hold out 20% of the files for evaluation
train_dfs, test_dfs = train_test_split(expanded_df, test_size=0.2, random_state=42)

# Fit the decision tree on the training files
DecisionTreeClassifierModel = DecisionTreeClassifier(random_state=42).fit(train_dfs[features], train_dfs[target])

# Score the held-out files
predictions = DecisionTreeClassifierModel.predict(test_dfs[features])
y_true = test_dfs[target]

accuracy = accuracy_score(y_true, predictions)
print(f"Accuracy: {accuracy}")

report = classification_report(y_true, predictions)
print(report)


In [None]:
# Show the flattened per-file feature table and save it as CSV
print(expanded_df)
expanded_df.to_csv('test1.1.csv', index=False)

In [None]:
def _label_clusters(frame, eps, min_samples):
    """Return a copy of ``frame`` whose 'cluster' column holds DBSCAN labels (-1 = noise)."""
    points = frame[['phase_angle', 'peak_amplitude']].values
    labelled = frame.copy()
    labelled['cluster'] = DBSCAN(eps=eps, min_samples=min_samples).fit(points).labels_
    return labelled


def _bounding_box_features(points):
    """Return (length, height, gradient_tr, gradient_tl) of the bounding box of 2-D ``points``."""
    if len(points) < 2:
        return 0.0, 0.0, 0.0, 0.0
    length = points[:, 0].max() - points[:, 0].min()
    height = points[:, 1].max() - points[:, 1].min()
    if length == 0:
        # Zero width: no finite slope exists, report 0 rather than inf/nan
        return length, height, 0.0, 0.0
    return length, height, height / length, height / -length


def preprocess_data(dataframes, eps=7, min_samples=8, max_clusters=4):
    """Turn raw per-file waveform dataframes into one feature row per file.

    Steps: de-noise every dataframe with ``denoise_dataframes`` (which adds columns
    to the given frames in place), keep the rows flagged ``filtered``, cluster the
    remaining (phase_angle, peak_amplitude) points with DBSCAN, and describe the
    first ``max_clusters`` cluster labels of every file by their bounding box.

    Parameters
    ----------
    dataframes : list of pandas.DataFrame
        Frames with ``phase_angle``, ``peak_amplitude`` and ``filename`` columns.
    eps, min_samples : number, int
        DBSCAN parameters (defaults 7 and 8).
    max_clusters : int, default 4
        Number of cluster slots exported per file. Unused slots are filled with 0.

    Returns
    -------
    pandas.DataFrame
        One row per filename, ordered by filename, with the columns ``filename``,
        ``cluster_count`` and, for every slot ``i``, ``cluster_length_i``,
        ``cluster_height_i``, ``cluster_gradient_tr_i``, ``cluster_gradient_tl_i``
        (grouped by feature, then slot). Files with no row left after de-noising
        are omitted. An empty dataframe with these columns is returned when
        nothing remains.
    """
    feature_names = ('cluster_length', 'cluster_height', 'cluster_gradient_tr', 'cluster_gradient_tl')
    columns = ['filename', 'cluster_count'] + [
        f'{name}_{slot}' for name in feature_names for slot in range(max_clusters)
    ]

    # Remove all data points below the baseline (the noise)
    denoised = denoise_dataframes(dataframes)
    kept = [frame[frame['filtered'] == True] for frame in denoised]

    # DBSCAN cannot be fitted on zero points, so empty frames are skipped
    labelled = [_label_clusters(frame, eps, min_samples) for frame in kept if not frame.empty]
    if not labelled:
        return pd.DataFrame(columns=columns)

    big_df = pd.concat(labelled, ignore_index=True).fillna(0)

    rows = []
    for filename, file_df in big_df.groupby('filename'):
        # NOTE(review): np.unique also returns -1 (DBSCAN noise) when it is present, so noise
        # is counted and fills the first slot - confirm this is intended.
        unique_labels = np.unique(file_df['cluster'].values)
        row = {'filename': filename, 'cluster_count': len(unique_labels)}
        for slot in range(max_clusters):
            # Every slot always gets a value for every feature, so slot i means the
            # same thing in every file and the column set never varies.
            if slot < len(unique_labels):
                members = file_df[file_df['cluster'] == unique_labels[slot]][['phase_angle', 'peak_amplitude']].values
                values = _bounding_box_features(members)
            else:
                values = (0.0, 0.0, 0.0, 0.0)
            for name, value in zip(feature_names, values):
                row[f'{name}_{slot}'] = value
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

def getFeatures(final_df):
    """Return the names of the model-input columns of a per-file feature table.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Feature table; it is not modified.

    Returns
    -------
    list of str
        All column names in their original order, except the ``filename``
        identifier and, when present, the ``anomaly`` label, so the same call
        works for labelled training tables and unlabelled tables alike.
    """
    non_features = {'filename', 'anomaly'}
    return [column for column in final_df.columns.tolist() if column not in non_features]

In [None]:
# A trained model can only score data that has the same feature columns it was fitted on.
# New files therefore go through the same pipeline (de-noising, DBSCAN clustering, feature
# expansion) before they are passed to the model with predictions = model.predict(new_data[features]).

dirpath = "test" # name of the folder containing the files to classify

# Load every file of the folder into a dataframe, then reduce each file to one row of
# cluster features (noise removal, cluster extraction, conversion to the relevant features)
to_be_predicted_df = preprocess_data(load_dataframes(dirpath))



In [None]:
f = getFeatures(to_be_predicted_df)
X = to_be_predicted_df[f]
predictions = DecisionTreeClassifierModel.predict(X)
# predictions[i] belongs to row i of to_be_predicted_df, so show each one next to its
# filename (labels: 1 is positive, 2 is negative, 0 is unclassified)
pd.DataFrame({'filename': to_be_predicted_df['filename'].values, 'prediction': predictions})


In [None]:
# Plot every test file with its predicted class in the title.
# Predictions are looked up by filename: their order follows the rows of
# to_be_predicted_df, which need not match the order in which the files are loaded here.
prediction_by_file = dict(zip(to_be_predicted_df['filename'], predictions))
placeholder_df = load_dataframes(dirpath)
for dataframe in placeholder_df:
    filename = dataframe['filename'].iloc[0]
    # 'n/a' marks a file that has no row in to_be_predicted_df
    prediction = prediction_by_file.get(filename, 'n/a')
    fig, ax = plt.subplots()
    ax.scatter(dataframe['phase_angle'], dataframe['peak_amplitude'], s=8)
    ax.set_title(f"{filename} (Prediction: {prediction})")
    ax.set_xlabel('Phase angle')
    ax.set_ylabel('Peak amplitude')
    plt.show()
    plt.close(fig)  # release the figure before drawing the next file