In [None]:
import os
from src.StreamPort.core import Engine
from src.StreamPort.device.analyses import PressureCurvesAnalyses

# Root folder holding one sub-folder per batch; each batch holds the ".D" analysis entries.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = "C:/Users/Sandeep/Desktop/ExtractedSignals"

# os.listdir returns entries in arbitrary order. Sorting makes the file list
# deterministic, which matters because later cells address analyses by position
# (e.g. eng.analyses.data[50]) — presumably the analyses keep the order of
# `files`; TODO confirm.
batches = [os.path.join(path, entry) for entry in sorted(os.listdir(path))]
# Skip stray files in the root folder: calling os.listdir on them would raise.
batches = [batch for batch in batches if os.path.isdir(batch)]

files = []
for batch in batches:
    files.extend(
        os.path.join(batch, file)
        for file in sorted(os.listdir(batch))
        if ".D" in file
    )

eng = Engine(analyses = PressureCurvesAnalyses(files=files))

print("Number of analyses: ", len(eng.analyses.data))

fig_methods = eng.analyses.plot_methods()

# The export below fails when the target folder is missing, so create it up front.
os.makedirs("dev/figures", exist_ok=True)
#Kaleido requires Google Chrome to be installed!!!
fig_methods.write_image("dev/figures/fig_methods.png", width=1100, height= 350, scale = 3)

fig_methods

In [None]:
# Analysis at position 50 serves as the worked example for the baseline-correction cells below.
example_analysis = eng.analyses.data[50]
pressure_vector = example_analysis['pressure_var']
time = example_analysis['time_var']
import numpy as np
import plotly.graph_objects as go
# Apply baseline correction. Comparing multiple algorithms to choose the best one.

In [None]:

"""
1. Simple Moving Average
- smoothed version of the original signal, with each point replaced by the average of itself and its <window_size - 1> nearest neighbors
- this smoothed vector is then subtracted from the original pressure vector to obtain the baseline corrected vector while retaining noise
"""
# window_size = self.parameters["period"] if self.parameters["period"] % 2 != 0 else self.parameters["period"] - 1
window_size = 10
edges = window_size // 2
smoothed_vector = np.convolve(pressure_vector, np.ones(window_size) / window_size, mode='same')
# baseline correction by subtracting the smoothed vector from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector
# Remove the elements from the beginning and end to avoid edge effects. Typically <window_size // 2>
baseline_corrected_vector = baseline_corrected_vector[edges:-edges]
# The trimmed vector is shorter than `time`. Slice the time axis identically so every
# corrected value is drawn at its own time stamp instead of being shifted by `edges` samples.
time_trimmed = np.asarray(time)[edges:-edges]
# featrawi["pressure_baseline_corrected"] = baseline_corrected_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time_trimmed, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_simple_moving_average.png", width=1100, height= 350, scale = 3)

In [None]:

# """
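# 2. Polynomial Least Squares Fitting
# - fit a polynomial of degree <degree> to the original signal, to simulate a blank chromatogram and subtract it from the original signal
# - NOTE: not implemented yet; the disabled code below is still a copy of the moving-average cell (np.convolve), not a polynomial fit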
# """
# # window_size = self.parameters["period"]
# smoothed_vector = np.convolve(pressure_vector, np.ones(window_size) / window_size, mode='same')
# # baseline correction by subtracting the smoothed vector from the original pressure vector
# baseline_corrected_vector = pressure_vector - smoothed_vector
# # Remove the elements from the beginning and end to avoid edge effects. Typically <window_size // 2>
# baseline_corrected_vector = baseline_corrected_vector[1:-1]
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
# fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
# fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
# fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
# fig.show()
# fig.write_image("dev/figures/fig_baseline_correction_polynomial_least_squares_fitting.png", width=1100, height= 350, scale = 3)


In [None]:
"""
3. Savitzky-Golay Filter
- applies a polynomial smoothing filter to the data, which is particularly effective for preserving features of the data while reducing noise
- uses a sliding window to fit a polynomial to the data points within the window, and then replaces the central point with the value of the polynomial at that point
"""
from scipy.signal import savgol_filter
window_size = 11  # Must be odd
poly_order = 2  # Polynomial order
edges = window_size // 2
smoothed_vector = savgol_filter(pressure_vector, window_size, poly_order)
baseline_corrected_vector = pressure_vector - smoothed_vector
# Remove the elements from the beginning and end to avoid edge effects: <window_size // 2>
# samples per side, the same rule as the moving-average cell.
baseline_corrected_vector = baseline_corrected_vector[edges:-edges]
# Trim the time axis identically; plotting the shortened vector against the full
# `time` would shift every corrected value along the x axis.
time_trimmed = np.asarray(time)[edges:-edges]
# featrawi["pressure_baseline_corrected"] = baseline_corrected_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time_trimmed, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_savitzky_golay_filter.png", width=1100, height= 350, scale = 3)


In [None]:

"""
4. Asymmetric Least Squares Smoothing
- estimate a baseline in data by minimizing the sum of squared differences between the data and a smooth curve
- does this by applying different penalties to deviations above and below the curve.
- This asymmetry allows the smoother to better fit the baseline while accommodating peaks or other features in the data
"""
from scipy import sparse
from scipy.sparse.linalg import spsolve
# def baseline_als(pressure_vector, smoothness, asymmetry, n_iterations=10):
smoothness = 1e5
asymmetry = 0.1
n_iterations = 10
L = len(pressure_vector)
# Second-order difference operator of shape (L, L - 2), built directly in sparse form.
# It holds the same entries as np.diff(np.eye(L), 2) without allocating a dense L x L matrix.
D = sparse.diags([1.0, -2.0, 1.0], [0, -1, -2], shape=(L, L - 2), format="csc")
# The smoothness penalty does not depend on the weights, so compute it once.
penalty = smoothness * D.dot(D.transpose())
w = np.ones(L)
for i in range(n_iterations):
    W = sparse.spdiags(w, 0, L, L)
    Z = sparse.csc_matrix(W + penalty)
    baseline = spsolve(Z, w*pressure_vector)
    # Points above the current baseline get the small weight, points below the large one.
    w = asymmetry * (pressure_vector > baseline) + (1-asymmetry) * (pressure_vector < baseline)

baseline_corrected_vector = pressure_vector - baseline
# Remove the elements from the beginning and end to avoid edge effects. Typically <window_size // 2>
# baseline_corrected_vector = baseline_corrected_vector[1:-1]
# featrawi["pressure_baseline_corrected"] = baseline_corrected_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
# Own file name: the previous name overwrote the moving-average figure.
fig.write_image("dev/figures/fig_baseline_correction_asymmetric_least_squares.png", width=1100, height= 350, scale = 3)


In [None]:
"""
5. airPLS (adaptive iteratively reweighted Penalized Least Squares)
- iteratively re-solves a penalized least squares smoothing problem, re-weighting the points that lie above the current baseline estimate in every iteration, and subtracts the final smooth curve from the original signal
"""
def logistic(x, m, s):
    """Logistic (sigmoid) function of `x`, centred at `m` with scale `s`."""
    scaled_offset = (x - m) / s
    return 1.0 / (1.0 + np.exp(-scaled_offset))

smoothness = 1e5
n_iterations = 50
L = len(pressure_vector)
# Second-order difference operator of shape (L, L - 2), built directly in sparse form.
# It holds the same entries as np.diff(np.eye(L), 2) without allocating a dense L x L matrix.
D = sparse.diags([1.0, -2.0, 1.0], [0, -1, -2], shape=(L, L - 2), format="csc")
# Neither the penalty nor the signal magnitude changes between iterations.
penalty = smoothness * D.dot(D.transpose())
abs_y = np.abs(pressure_vector).sum()
w = np.ones(L)
for i in range(n_iterations):
    W = sparse.spdiags(w, 0, L, L)
    Z = sparse.csc_matrix(W + penalty)
    smoothed_vector = spsolve(Z, w * pressure_vector)
    d = pressure_vector - smoothed_vector
    d_neg = d[d < 0]
    abs_d_neg = np.abs(d_neg).sum()
    # Stop once the part of the signal lying below the estimate is negligible.
    if abs_d_neg < 0.001 * abs_y:
        break
    if len(d_neg) == 0:
        m_d, s_d = 0, 1
    else:
        m_d = np.mean(d_neg)
        spread = np.std(d_neg)
        s_d = spread if spread > 0 else 1  # avoid dividing by zero in logistic()
    w = np.ones(L)
    mask = pressure_vector > smoothed_vector
    # NOTE(review): for points above the estimate the residual is positive and m_d is
    # non-positive, so logistic() yields weights in (0.5, 1) that GROW with the residual.
    # Published airPLS/arPLS schemes down-weight such points — confirm this is intended.
    w[mask] = logistic(pressure_vector[mask] - smoothed_vector[mask], m_d, s_d)
    # w[~mask] = 1  # already set by np.ones(L)
# baseline correction by subtracting the smoothed vector from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector
# Remove the elements from the beginning and end to avoid edge effects. Typically <window_size // 2>
# baseline_corrected_vector = baseline_corrected_vector[1:-1]
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
# Own file name: the previous name overwrote the moving-average figure.
fig.write_image("dev/figures/fig_baseline_correction_airpls.png", width=1100, height= 350, scale = 3)


In [None]:
"""
6. arPLS (asymmetrically reweighted Penalized Least Squares)
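- iteratively estimates a smooth baseline by penalized least squares, re-weighting every point with a logistic function of its residual (scaled by the mean and standard deviation of the negative residuals), and subtracts that baseline from the original signal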
"""
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve
from scipy.linalg import cholesky, solve

def baseline_arPLS(y, lam=1e5, ratio=1e-6, max_iter=50):
    """
    Asymmetrically reweighted Penalized Least Squares (arPLS) baseline estimation.

    Repeatedly solves (W + H) z = W y, where H is the second-order difference
    penalty scaled by `lam` and W is the diagonal matrix of the current weights,
    then recomputes the weights from the residuals y - z.

    Parameters:
        y : array_like
            Input signal (1D).
        lam : float
            Smoothness parameter (lambda).
        ratio : float
            Convergence threshold on the relative change of the weights.
        max_iter : int
            Maximum number of iterations.
    Returns:
        z : np.ndarray
            Estimated baseline, same length as `y`.
    """
    y = np.asarray(y, dtype=float)
    N = y.size
    if N < 3:
        # A second-order difference needs at least three samples; nothing to smooth.
        return y.copy()
    # Second-order difference operator of shape (N - 2, N). D.T @ D is the N x N
    # penalty that matches W; the previous product had shape (N - 2, N - 2).
    D = sparse.diags([1.0, -2.0, 1.0], [0, 1, 2], shape=(N - 2, N), format="csc")
    H = lam * (D.T @ D)
    w = np.ones(N)
    z = y.copy()
    for _ in range(max_iter):
        W = sparse.spdiags(w, 0, N, N)
        # Solve (W + H) * z = w * y; the system is sparse and banded.
        z = spsolve(sparse.csc_matrix(W + H), w * y)
        d = y - z
        dn = d[d < 0]
        if dn.size > 0:
            m = dn.mean()
            s = dn.std()
        else:
            m, s = 0.0, 1.0
        if not s > 0:
            # All negative residuals are identical: fall back to unit scale
            # instead of dividing by zero.
            s = 1.0
        # Clip the exponent so np.exp cannot overflow for large residuals.
        exponent = np.clip(2 * (d - (2 * s - m)) / s, -700, 700)
        wt = 1.0 / (1.0 + np.exp(exponent))
        # Check exit condition
        if np.linalg.norm(w - wt) / np.linalg.norm(w) < ratio:
            break
        w = wt
    return z

baseline = baseline_arPLS(pressure_vector, lam=1e5, ratio=1e-6)
# baseline correction by subtracting the estimated baseline from the original pressure vector
baseline_corrected_vector = pressure_vector - baseline
# Remove the elements from the beginning and end to avoid edge effects. Typically <window_size // 2>
# baseline_corrected_vector = baseline_corrected_vector[1:-1]
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
# Plot the baseline computed in this cell; `smoothed_vector` is a leftover from an earlier cell.
fig.add_trace(go.Scatter(x=time, y=baseline, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
# Own file name: the previous name overwrote the moving-average figure.
fig.write_image("dev/figures/fig_baseline_correction_arpls.png", width=1100, height= 350, scale = 3)

In [None]:
"""
7. Rolling Ball
- estimate the baseline as the morphological opening of the signal: a running minimum followed by a running maximum over <window_size> points
- smooth that envelope with a moving average of the same width and subtract it from the original signal
"""
from scipy.ndimage import maximum_filter1d, minimum_filter1d, uniform_filter1d

# `self.parameters["period"]` is not defined in the notebook namespace, so the window is
# set explicitly. 20 is the period passed to the feature-extraction method further down;
# it is reduced to the nearest odd value so the window is centred on each sample.
period = 20
window_size = period if period % 2 != 0 else period - 1

pressure_array = np.asarray(pressure_vector, dtype=float)
# Opening = erosion (running minimum) followed by dilation (running maximum).
eroded = minimum_filter1d(pressure_array, size=window_size, mode='nearest')
opened = maximum_filter1d(eroded, size=window_size, mode='nearest')
# Moving average over the same window smooths the steps left by the opening.
smoothed_vector = uniform_filter1d(opened, size=window_size, mode='nearest')

# baseline correction by subtracting the estimated baseline from the original pressure vector
baseline_corrected_vector = pressure_array - smoothed_vector

# The filters return vectors of the input length, so no trimming is required and the
# corrected vector stays aligned with `time`.

fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction Example', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
# Own file name: the previous name overwrote the moving-average figure.
fig.write_image("dev/figures/fig_baseline_correction_rolling_ball.png", width=1100, height= 350, scale = 3)

In [None]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative
from src.StreamPort.device.methods import PressureCurvesMethodScaleFeaturesScalerSklearn

# Reset the workflow so it holds a single PressureCurvesMethodExtractFeaturesNative step, then run it.
eng.workflow.clear()
feature_extraction = PressureCurvesMethodExtractFeaturesNative(period=20)
eng.workflow.append(feature_extraction)
eng.run()

# Restrict the following cells to the analyses of one method.
method = "SAA_411_Pac.M"
method_indices = eng.analyses.get_method_indices(method)
fig_sel_method = eng.analyses.plot_methods(method_indices)

# Export the figure at full and at half width.
for image_path, image_width in (
    ("dev/figures/fig_sel_method.png", 1100),
    ("dev/figures/fig_sel_method_half.png", 550),
):
    fig_sel_method.write_image(image_path, width=image_width, height=350, scale=3)

fig_sel_method.show()

In [None]:
# Pressure curves of the selected method, every trace in black and without a legend.
fig_curves_raw = eng.analyses.plot_pressure_curves(indices=method_indices)
for curve in fig_curves_raw.data:
    curve.line.color = "black"
fig_curves_raw.layout.showlegend = False

# Write the static image first, then render inline.
fig_curves_raw.write_image("dev/figures/fig_curves_raw.png", width=1100, height=350, scale=3)
fig_curves_raw.show()

In [None]:
print("Number of curves: ", len(method_indices))

# Feature plot for the selected method, every trace in black and without a legend.
fig_features = eng.analyses.plot_features(indices=method_indices)
for curve in fig_features.data:
    curve.line.color = "black"
fig_features.layout.showlegend = False

# Write the static image first, then render inline.
fig_features.write_image("dev/figures/fig_features.png", width=1100, height=350, scale=3)
fig_features.show()

In [None]:
import datetime
# Training set: analyses of the selected method with batch_position > 4 that
# started before the cut-off date.
date_threshold = datetime.datetime.strptime("2021-08-18", "%Y-%m-%d")

train_indices = []
for index in method_indices:
    metadata_row = eng.analyses.get_metadata(index)
    position_in_batch = metadata_row["batch_position"].item()
    started_at = metadata_row["start_time"].item()
    if isinstance(started_at, str):
        # start times stored as text are parsed as ISO-8601
        started_at = datetime.datetime.fromisoformat(started_at)
    if position_in_batch > 4:
        if started_at < date_threshold:
            train_indices.append(index)

# Persist features and metadata of the training analyses.
train_data = eng.analyses.get_features(train_indices)
train_metadata = eng.analyses.get_metadata(train_indices)
train_data.to_csv("dev/train_features.csv", index=False)
train_metadata.to_csv("dev/train_metadata.csv", index=False)
print("Number of training curves: ", len(train_indices))

In [None]:
# Pressure curves of the training analyses, every trace in black and without a legend.
fig_train = eng.analyses.plot_pressure_curves(indices=train_indices)
for curve in fig_train.data:
    curve.line.color = "black"
fig_train.layout.showlegend = False
fig_train.write_image("dev/figures/fig_train.png", width=1100, height=350, scale=3)
fig_train.show()

In [None]:
# Feature plot of the training analyses, every trace in black and without a legend.
fig_train_features = eng.analyses.plot_features(indices=train_indices)
for curve in fig_train_features.data:
    curve.line.color = "black"
fig_train_features.layout.showlegend = False
fig_train_features.write_image("dev/figures/fig_train_features.png", width=1100, height=350, scale=3)
fig_train_features.show()

In [None]:
import datetime
# Test set: analyses of the selected method that started strictly inside the
# date window and have batch_position > 0.
date_threshold_min = datetime.datetime.strptime("2021-08-18", "%Y-%m-%d")
date_threshold_max = datetime.datetime.strptime("2021-08-22", "%Y-%m-%d")

test_indices = []
for index in method_indices:
    metadata_row = eng.analyses.get_metadata(index)
    position_in_batch = metadata_row["batch_position"].item()
    started_at = metadata_row["start_time"].item()
    if isinstance(started_at, str):
        # start times stored as text are parsed as ISO-8601
        started_at = datetime.datetime.fromisoformat(started_at)
    if not (started_at < date_threshold_max):
        continue
    if not (started_at > date_threshold_min):
        continue
    if position_in_batch > 0:
        test_indices.append(index)

# Persist features and metadata of the test analyses.
test_data = eng.analyses.get_features(test_indices)
test_metadata = eng.analyses.get_metadata(test_indices)
test_data.to_csv("dev/test_features.csv", index=False)
test_metadata.to_csv("dev/test_metadata.csv", index=False)
print("Number of test curves: ", len(test_indices))

# Traces keep their default colours here. The image is exported without a legend,
# the inline figure is shown with one.
fig_test = eng.analyses.plot_pressure_curves(indices=test_indices)
fig_test.layout.showlegend = False
fig_test.write_image("dev/figures/fig_test.png", width=550, height=350, scale=3)
fig_test.layout.showlegend = True
fig_test.show()

In [None]:
# Feature plot of the test analyses: exported without a legend, shown inline with one.
# Traces keep their default colours.
fig_test = eng.analyses.plot_features(indices=test_indices)
fig_test.layout.showlegend = False
fig_test.write_image("dev/figures/fig_test_features.png", width=1100, height=350, scale=3)
fig_test.layout.showlegend = True
fig_test.show()

In [None]:
import datetime
# Second test set: start from the analyses of the selected method that started
# strictly inside the date window and have batch_position > 0.
date_threshold_min = datetime.datetime.strptime("2021-09-09", "%Y-%m-%d")
date_threshold_max = datetime.datetime.strptime("2021-09-12", "%Y-%m-%d")

test2_indices = []
for index in method_indices:
    metadata_row = eng.analyses.get_metadata(index)
    position_in_batch = metadata_row["batch_position"].item()
    started_at = metadata_row["start_time"].item()
    if isinstance(started_at, str):
        # start times stored as text are parsed as ISO-8601
        started_at = datetime.datetime.fromisoformat(started_at)
    if not (started_at < date_threshold_max):
        continue
    if not (started_at > date_threshold_min):
        continue
    if position_in_batch > 0:
        test2_indices.append(index)

# Keep only the first match and add four hand-picked analyses by position.
test2_indices = test2_indices[:1] + [47, 17, 116, 16]

# Persist features and metadata of the selected analyses.
test2_data = eng.analyses.get_features(test2_indices)
test2_metadata = eng.analyses.get_metadata(test2_indices)
test2_data.to_csv("dev/test2_features.csv", index=False)
test2_metadata.to_csv("dev/test2_metadata.csv", index=False)
print("Number of test curves: ", len(test2_indices))

# Traces keep their default colours here. The image is exported without a legend,
# the inline figure is shown with one.
fig_test2 = eng.analyses.plot_pressure_curves(indices=test2_indices)
fig_test2.layout.showlegend = False
fig_test2.write_image("dev/figures/fig_test2.png", width=1100, height=350, scale=3)
fig_test2.layout.showlegend = True
fig_test2.show()

In [None]:
# Feature plot of the second test set: exported without a legend, shown inline with one.
# Traces keep their default colours.
fig_test2 = eng.analyses.plot_features(indices=test2_indices)
fig_test2.layout.showlegend = False
fig_test2.write_image("dev/figures/fig_test2_features.png", width=1100, height=350, scale=3)
fig_test2.layout.showlegend = True
fig_test2.show()

In [None]:

# Hand-picked analyses, addressed by position.
test_2_indices = [21, 116, 117, 153]

# Persist their features and metadata.
test_2_data = eng.analyses.get_features(test_2_indices)
test_2_metadata = eng.analyses.get_metadata(test_2_indices)
test_2_data.to_csv("dev/test_2_features.csv", index=False)
test_2_metadata.to_csv("dev/test_2_metadata.csv", index=False)

# Pressure curves in default colours: exported without a legend, shown inline with one.
fig_text_2_curves = eng.analyses.plot_pressure_curves(indices=test_2_indices)
fig_text_2_curves.layout.showlegend = False
fig_text_2_curves.write_image("dev/figures/fig_text_2_curves.png", width=1000, height=350, scale=3)
fig_text_2_curves.layout.showlegend = True
fig_text_2_curves.show()

In [None]:
# Feature plot for the hand-picked test_2_indices; as the cell's last expression
# the returned figure is displayed inline.
eng.analyses.plot_features(indices = test_2_indices)

In [None]:
# Raw feature plot for the hand-picked test_2_indices: exported without a legend,
# shown inline with one.
fig_test2_features_raw_no_fourier = eng.analyses.plot_features_raw(indices=test_2_indices)
fig_test2_features_raw_no_fourier.layout.showlegend = False
fig_test2_features_raw_no_fourier.write_image(
    "dev/figures/fig_test2_features_raw_no_fourier.png", width=1100, height=350, scale=3
)
fig_test2_features_raw_no_fourier.layout.showlegend = True
fig_test2_features_raw_no_fourier.show()