In [1]:
import os
from src.StreamPort.core import Engine
from src.StreamPort.device.analyses import PressureCurvesAnalyses

# Root folder that holds one sub-folder per batch. The default is the original
# local path; set STREAMPORT_DATA_DIR to run the notebook on another machine.
path = os.environ.get("STREAMPORT_DATA_DIR", "C:/Users/Sandeep/Desktop/ExtractedSignals")

# Keep directories only: a stray file in the root (e.g. desktop.ini) would make
# the os.listdir(batch) call below raise NotADirectoryError.
batches = [os.path.join(path, name) for name in os.listdir(path)]
batches = [batch for batch in batches if os.path.isdir(batch)]

# Collect every entry whose name contains ".D" from each batch folder.
files = []
for batch in batches:
    files.extend(os.path.join(batch, name) for name in os.listdir(batch) if ".D" in name)

eng = Engine(analyses=PressureCurvesAnalyses(files=files))

print("Number of analyses: ", len(eng.analyses.data))

fig_methods = eng.analyses.plot_methods()

# write_image does not create missing parent folders, so make sure it exists.
os.makedirs("dev/figures", exist_ok=True)
fig_methods.write_image("dev/figures/fig_methods.png", width=1100, height=350, scale=3)

# Last expression of the cell: displayed as the cell output.
fig_methods

Number of analyses:  374


In [None]:
# Analysis 116 is the worked example used by every baseline-correction cell below.
example_analysis = eng.analyses.data[116]
pressure_vector = example_analysis['pressure_var']
time = example_analysis['time_var']
import numpy as np
import plotly.graph_objects as go
# Apply baseline correction. Comparing multiple algorithms to choose the best one. 
# Selected algorithms 1.SMA and 3.SGF are implemented in the methods.py file.

In [None]:

"""
1. Simple Moving Average
- smoothed version of the original signal, with each point replaced by the average of itself and its <window_size - 1> nearest neighbors
- this smoothed vector is then subtracted from the original pressure vector to obtain the baseline corrected vector while retaining noise
"""
# window_size = self.parameters["period"] if self.parameters["period"] % 2 != 0 else self.parameters["period"] - 1
window_size = 10
edges = window_size // 2
# Moving average: convolve with a flat kernel of length window_size ('same' keeps the input length).
smoothed_vector = np.convolve(pressure_vector, np.ones(window_size) / window_size, mode='same')
# baseline correction by subtracting the smoothed vector from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector
# Drop <window_size // 2> samples at both ends to avoid edge effects. The same
# slice is applied to the time axis so the corrected trace stays aligned with
# the raw curve instead of being shifted left by `edges` samples.
trimmed = slice(edges, len(pressure_vector) - edges)
baseline_corrected_vector = baseline_corrected_vector[trimmed]
time_corrected = time[trimmed]
# featrawi["pressure_baseline_corrected"] = baseline_corrected_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time_corrected, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction - Simple Moving Average', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_simple_moving_average.png", width=1100, height= 350, scale = 3)

In [None]:

"""
2. Polynomial Least Squares Fitting
- fit a polynomial of degree <degree> to the original signal, to simulate a blank chromatogram and subtract it from the original signal 
"""
# Degree of the fitted polynomial, named so the code and its comment cannot drift apart.
degree = 8
coefficients = np.polyfit(time, pressure_vector, degree)
smoothed_vector = np.polyval(coefficients, time)
# baseline correction by subtracting the fitted polynomial from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector
# The fit covers the whole time range, so no samples are trimmed here.
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction - Polynomial Least Squares Fitting', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_polynomial_least_squares_fitting.png", width=1100, height= 350, scale = 3)


In [None]:
"""
3. Savitzky-Golay Filter
- applies a polynomial smoothing filter to the data, which is particularly effective for preserving features of the data while reducing noise
- uses a sliding window to fit a polynomial to the data points within the window, and then replaces the central point with the value of the polynomial at that point
"""
from scipy.signal import savgol_filter

window_size = 13  # Must be odd
poly_order = 2  # Polynomial order

# Smooth with the Savitzky-Golay filter, then subtract the smooth curve from the raw signal.
smoothed_vector = savgol_filter(pressure_vector, window_length=window_size, polyorder=poly_order)
baseline_corrected_vector = pressure_vector - smoothed_vector

# One trace per curve, all drawn against the same time axis.
fig = go.Figure()
for curve_name, curve_values in (
    ('Raw Curve', pressure_vector),
    ('Smoothed Curve', smoothed_vector),
    ('Baseline Corrected Vector', baseline_corrected_vector),
):
    fig.add_trace(go.Scatter(x=time, y=curve_values, mode='lines', name=curve_name))
fig.update_layout(
    title='Baseline Correction - Savitzky-Golay Filter',
    xaxis_title='Time',
    yaxis_title='Pressure',
)
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_savitzky_golay_filter.png", width=1100, height=350, scale=3)


In [None]:
# Split the baseline-corrected vector (left behind by the most recently run
# baseline-correction cell) into equal-sized bins and plot the amplitude
# features of each bin as its own trace.
bins = 4
num_bins = np.array_split(baseline_corrected_vector, bins)
fig = go.Figure()
feats = ["max_amplitude", "min_amplitude", "amplitude_range"]
for i, vector_bin in enumerate(num_bins):
    if len(vector_bin) == 0:
        continue
    max_amplitude = np.max(vector_bin)
    min_amplitude = np.min(vector_bin)
    amplitude_range = max_amplitude - min_amplitude
    # Fresh list per bin: a list shared across bins keeps growing, so later
    # traces would carry values from earlier bins against only three x labels.
    y = [max_amplitude, min_amplitude, amplitude_range]
    fig.add_trace(go.Scatter(x=feats, y=y, mode='lines', name=f'Bin {i + 1}'))
fig.update_layout(title='Binned Amplitudes of Baseline Corrected Curves', xaxis_title='Features', yaxis_title='Values')
fig.show()

In [None]:

"""
4. Asymmetric Least Squares Smoothing
- estimate a baseline in data by minimizing the sum of squared differences between the data and a smooth curve
- does this by applying different penalties to deviations above and below the curve.
- This asymmetry allows the smoother to better fit the baseline while accommodating peaks or other features in the data
"""
from scipy import sparse
from scipy.sparse.linalg import spsolve
# def baseline_als(pressure_vector, smoothness, asymmetry, n_iterations=10):
smoothness = 1e4
asymmetry = 0.1
n_iterations=10
L = len(pressure_vector)
# Second-order difference operator of shape (L, L - 2), built directly in sparse
# form; same entries as np.diff(np.eye(L), 2) without the dense L x L intermediate.
D = sparse.diags([1.0, -2.0, 1.0], [0, -1, -2], shape=(L, L - 2), format='csc')
# The smoothness penalty does not depend on the weights, so compute it once.
penalty = smoothness * D.dot(D.transpose())
w = np.ones(L)
for i in range(n_iterations):
    W = sparse.spdiags(w, 0, L, L)
    Z = W + penalty
    # Weighted penalized least squares fit for the current weights.
    baseline = spsolve(Z, w*pressure_vector)
    # Points above the fit get weight `asymmetry`, points below get `1 - asymmetry`.
    w = asymmetry * (pressure_vector > baseline) + (1-asymmetry) * (pressure_vector < baseline)

baseline_corrected_vector = pressure_vector - baseline
# featrawi["pressure_baseline_corrected"] = baseline_corrected_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction - Asymmetric Least Squares Smoothing', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_AsLS_smoothing.png", width=1100, height= 350, scale = 3)


In [None]:
"""
5. airPLS (adaptive iteratively reweighted Penalized Least Squares)
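- iteratively re-fits a penalized least squares smoother, recomputing the per-point weights from the residuals of the previous fit, and stops early once the negative residuals are negligible relative to the signal; the final fit is subtracted from the original signal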
"""
def logistic(x, m, s):
    """Logistic sigmoid of ``x`` centred at ``m`` with scale ``s``.

    Evaluates ``1 / (1 + exp(-(x - m) / s))``. The arithmetic is element-wise,
    so ``x`` may be a scalar or a NumPy array.
    """
    scaled = (x - m) / s
    return 1.0 / (1.0 + np.exp(-scaled))

smoothness = 1e5
n_iterations = 50
L = len(pressure_vector)
D = sparse.csc_matrix(np.diff(np.eye(L), 2))
# Neither the penalty term nor the signal magnitude changes between iterations.
penalty = smoothness * D.dot(D.transpose())
abs_y = np.abs(pressure_vector).sum()
w = np.ones(L)
for i in range(n_iterations):
    W = sparse.spdiags(w, 0, L, L)
    Z = W + penalty
    smoothed_vector = spsolve(Z, w * pressure_vector)
    d = pressure_vector - smoothed_vector
    d_neg = d[d < 0]
    abs_d_neg = np.abs(d_neg).sum()
    # Stop once the part of the signal below the fit is negligible.
    if abs_d_neg < 0.001 * abs_y:
        break
    # Location and scale of the negative residuals, with neutral fallbacks.
    if len(d_neg) > 0:
        m_d = np.mean(d_neg)
        spread = np.std(d_neg)
        s_d = spread if spread > 0 else 1
    else:
        m_d, s_d = 0, 1
    # Points on or below the fit keep weight 1; points above it are re-weighted.
    # NOTE(review): this logistic weight grows with the residual, so large
    # positive deviations keep a weight close to 1 - confirm this is intended.
    mask = pressure_vector > smoothed_vector
    w = np.ones(L)
    w[mask] = logistic(d[mask], m_d, s_d)

# baseline correction by subtracting the smoothed vector from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector

fig = go.Figure()
for curve_name, curve_values in (
    ('Raw Curve', pressure_vector),
    ('Smoothed Curve', smoothed_vector),
    ('Baseline Corrected Vector', baseline_corrected_vector),
):
    fig.add_trace(go.Scatter(x=time, y=curve_values, mode='lines', name=curve_name))
fig.update_layout(
    title='Baseline Correction - adaptive iteratively reweighted Penalized Least Squares',
    xaxis_title='Time',
    yaxis_title='Pressure',
)
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_airPLS.png", width=1100, height=350, scale=3)


In [None]:
"""
6. arPLS (asymmetrically reweighted Penalized Least Squares)
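- iteratively fits a penalized least squares smoother and recomputes the per-point weights with a logistic function of the residuals, so that points lying well above the current fit are down-weighted; iteration stops once the weights no longer change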
"""
import numpy as np
from scipy.linalg import cholesky, solve

smoothness=1e6
ratio=1e-5
max_iter=30
N = len(pressure_vector)
# Second-order difference operator and the (weight-independent) smoothness penalty.
D = np.diff(np.eye(N), 2).T
H = smoothness * D.T @ D
w = np.ones(N)
for _ in range(max_iter):
    # Dense diagonal weight matrix; built with NumPy so this cell does not rely
    # on `sparse` having been imported by an earlier cell.
    W = np.diag(w)
    # Cholesky decomposition
    C = cholesky(W + H, lower=True)
    # Solve (W + H) * z = w * y
    z = solve(C.T, solve(C, w * pressure_vector))
    d = pressure_vector - z
    dn = d[d < 0]
    m = dn.mean() if len(dn) > 0 else 0
    s = dn.std() if len(dn) > 0 else 1
    # A zero spread would divide by zero in the weight update below.
    if not s > 0:
        s = 1
    wt = 1.0 / (1.0 + np.exp(2 * (d - (2 * s - m)) / s))
    # Check exit condition
    if np.linalg.norm(w - wt) / np.linalg.norm(w) < ratio:
        break
    w = wt

baseline = z
# baseline correction by subtracting the fitted baseline from the original pressure vector
baseline_corrected_vector = pressure_vector - baseline
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
# Plot the baseline computed in this cell, not `smoothed_vector` left over from another cell.
fig.add_trace(go.Scatter(x=time, y=baseline, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction - asymmetrically reweighted Penalized Least Squares', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_arPLS.png", width=1100, height= 350, scale = 3)

In [None]:
"""
7. SNIP(Statistical Non-linear Iterative Peak)
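- estimates the baseline on the log-log-square-root (LLS) transformed signal by clipping each point to the minimum of itself and the mean of its two neighbours at distance m, then transforms the result back and subtracts it from the original signal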
"""
# LLS transform log(log(sqrt(y + 1) + 1) + 1), built up in two steps.
sqrt_term = np.sqrt(pressure_vector + 1)
lls_vector = np.log(np.log(sqrt_term + 1) + 1)
# Define a function to compute the minimum filter
def min_filter(lls_vector, m):
    """Apply one SNIP clipping pass with half-width ``m``.

    Every interior point ``i`` (``m <= i < len - m``) is replaced by
    ``min(v[i], (v[i - m] + v[i + m]) / 2)``. The first and last ``m`` points
    are returned unchanged, and the input is never modified. The pass is
    vectorized with NumPy slices, so NaNs propagate through ``np.minimum``.

    Args:
        lls_vector: one-dimensional sequence or array of values.
        m: non-negative half-width of the clipping window.

    Returns:
        A new array of the same length holding the clipped values.

    Raises:
        ValueError: if ``m`` is negative.
    """
    if m < 0:
        raise ValueError("m must be non-negative")
    values = np.asarray(lls_vector)
    lls_filtered = np.copy(values)
    n = len(values)
    # Window wider than the signal: there are no interior points to clip.
    if n - 2 * m <= 0:
        return lls_filtered
    # Mean of the neighbours m samples to the left and right of each interior point.
    neighbour_mean = (values[:n - 2 * m] + values[2 * m:]) / 2
    lls_filtered[m:n - m] = np.minimum(values[m:n - m], neighbour_mean)
    return lls_filtered

# Apply successive clipping passes with growing half-width m = 1..4. Each pass
# works on the output of the previous one; filtering the original vector every
# time would discard all but the last pass. (m = 0 is a no-op and is skipped.)
lls_filtered = np.copy(lls_vector)
for m in range(1, 5):
    lls_filtered = min_filter(lls_filtered, m)

# Inverse of the LLS transform: back to the original pressure scale.
smoothed_vector = (np.exp(np.exp(lls_filtered) - 1) - 1) ** 2 - 1
# baseline correction by subtracting the estimated baseline from the original pressure vector
baseline_corrected_vector = pressure_vector - smoothed_vector
fig = go.Figure()
fig.add_trace(go.Scatter(x=time, y=pressure_vector, mode='lines', name='Raw Curve'))
fig.add_trace(go.Scatter(x=time, y=smoothed_vector, mode='lines', name='Smoothed Curve'))
fig.add_trace(go.Scatter(x=time, y=baseline_corrected_vector, mode='lines', name='Baseline Corrected Vector'))
fig.update_layout(title='Baseline Correction - Statistical Non-linear Iterative Peak', xaxis_title='Time', yaxis_title='Pressure')
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_SNIP.png", width=1100, height= 350, scale = 3)

In [None]:
"""
8. Smoothing Spline Baseline Correction
- fit a smoothing spline to the original signal, allowing some deviation from the data to obtain a smooth baseline
"""
from scipy.interpolate import UnivariateSpline

# Smoothing factor passed as `s` to the spline (higher s = smoother baseline); tune as needed.
smoothing_factor = 1e4
spline = UnivariateSpline(time, pressure_vector, s=smoothing_factor)

# Evaluate the spline on the original time axis and subtract it from the raw signal.
baseline = spline(time)
baseline_corrected_vector = pressure_vector - baseline

fig = go.Figure()
for curve_name, curve_values in (
    ('Raw Curve', pressure_vector),
    ('Smoothed Curve', baseline),
    ('Baseline Corrected Vector', baseline_corrected_vector),
):
    fig.add_trace(go.Scatter(x=time, y=curve_values, mode='lines', name=curve_name))
fig.update_layout(
    title='Baseline Correction - Smoothing Spline',
    xaxis_title='Time',
    yaxis_title='Pressure',
)
fig.show()
fig.write_image("dev/figures/fig_baseline_correction_smoothing_spline.png", width=1100, height=350, scale=3)

In [2]:
from src.StreamPort.device.methods import PressureCurvesMethodExtractFeaturesNative
#from src.StreamPort.device.methods import PressureCurvesMethodScaleFeaturesScalerSklearn

# Start from an empty workflow, register the feature-extraction step and run it.
eng.workflow.clear()
feature_extraction = PressureCurvesMethodExtractFeaturesNative(period=20)
eng.workflow.append(feature_extraction)
eng.run()

# Restrict the rest of the notebook to the analyses recorded with this method.
method = "SAA_411_Pac.M"
method_indices = eng.analyses.get_method_indices(method)
fig_sel_method = eng.analyses.plot_methods(method_indices)

# Export the same figure at full and half width.
for image_path, image_width in (
    ("dev/figures/fig_sel_method.png", 1100),
    ("dev/figures/fig_sel_method_half.png", 550),
):
    fig_sel_method.write_image(image_path, width=image_width, height=350, scale=3)

fig_sel_method.show()

Running the workflow
Processing method PressureCurvesMethodExtractFeaturesNative (1 / 1)


In [3]:
# Raw pressure curves of the selected method, drawn in uniform black without a legend.
fig_curves_raw = eng.analyses.plot_pressure_curves(indices=method_indices)
fig_curves_raw.update_layout(showlegend=False)
for curve_trace in fig_curves_raw.data:
    curve_trace.line.color = "black"

fig_curves_raw.write_image(
    "dev/figures/fig_curves_raw.png",
    width=1100,
    height=350,
    scale=3,
)
fig_curves_raw.show()

In [4]:
print("Number of curves: ", len(method_indices))

# Extracted features of the selected method, drawn in uniform black without a legend.
fig_features = eng.analyses.plot_features(indices=method_indices)
fig_features.update_layout(showlegend=False)
for feature_trace in fig_features.data:
    feature_trace.line.color = "black"

fig_features.write_image(
    "dev/figures/fig_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_features.show()

Number of curves:  93


In [5]:
import datetime
# Training set: curves recorded before this date with a batch position above 4.
date_threshold = datetime.datetime.strptime("2021-08-18", "%Y-%m-%d")

train_indices = []
for idx in method_indices:
    meta = eng.analyses.get_metadata(idx)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    # Start times stored as text are parsed before comparing.
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    if not (batch_position > 4 and start_time < date_threshold):
        continue
    train_indices.append(idx)

# Persist features and metadata of the training curves.
train_data = eng.analyses.get_features(train_indices)
train_metadata = eng.analyses.get_metadata(train_indices)
train_data.to_csv("dev/train_features.csv", index=False)
train_metadata.to_csv("dev/train_metadata.csv", index=False)
print("Number of training curves: ", len(train_indices))

Number of training curves:  20


In [6]:
# Pressure curves of the training set, in uniform black without a legend.
fig_train = eng.analyses.plot_pressure_curves(indices=train_indices)
fig_train.update_layout(showlegend=False)
for curve_trace in fig_train.data:
    curve_trace.line.color = "black"
fig_train.write_image(
    "dev/figures/fig_train.png",
    width=1100,
    height=350,
    scale=3,
)
fig_train.show()

In [7]:
# Features of the training set, in uniform black without a legend.
fig_train_features = eng.analyses.plot_features(indices=train_indices)
fig_train_features.update_layout(showlegend=False)
for feature_trace in fig_train_features.data:
    feature_trace.line.color = "black"
fig_train_features.write_image(
    "dev/figures/fig_train_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_train_features.show()

In [8]:
import datetime
# First test set: curves recorded strictly between these two dates.
date_threshold_min = datetime.datetime.strptime("2021-08-18", "%Y-%m-%d")
date_threshold_max = datetime.datetime.strptime("2021-08-22", "%Y-%m-%d")

test_indices = []
for idx in method_indices:
    meta = eng.analyses.get_metadata(idx)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    # Start times stored as text are parsed before comparing.
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    in_window = start_time < date_threshold_max and start_time > date_threshold_min
    if in_window and batch_position > 0:
        test_indices.append(idx)

# Persist features and metadata of the test curves.
test_data = eng.analyses.get_features(test_indices)
test_metadata = eng.analyses.get_metadata(test_indices)
test_data.to_csv("dev/test_features.csv", index=False)
test_metadata.to_csv("dev/test_metadata.csv", index=False)
print("Number of test curves: ", len(test_indices))

# The exported image has no legend; the legend is switched back on for the inline view.
fig_test = eng.analyses.plot_pressure_curves(indices=test_indices)
fig_test.update_layout(showlegend=False)
fig_test.write_image("dev/figures/fig_test.png", width=550, height=350, scale=3)
fig_test.update_layout(showlegend=True)
fig_test.show()

Number of test curves:  17


In [9]:
# Features of the first test set. Note that `fig_test` is rebound here and no
# longer refers to the pressure-curve figure.
fig_test = eng.analyses.plot_features(indices=test_indices)
# Export without a legend, then switch it back on for the inline view.
fig_test.update_layout(showlegend=False)
fig_test.write_image(
    "dev/figures/fig_test_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_test.update_layout(showlegend=True)
fig_test.show()

In [10]:
import datetime
# Second test set: starts from curves recorded strictly between these two dates.
date_threshold_min = datetime.datetime.strptime("2021-09-09", "%Y-%m-%d")
date_threshold_max = datetime.datetime.strptime("2021-09-12", "%Y-%m-%d")

test2_indices = []
for idx in method_indices:
    meta = eng.analyses.get_metadata(idx)
    batch_position = meta["batch_position"].item()
    start_time = meta["start_time"].item()
    # Start times stored as text are parsed before comparing.
    if isinstance(start_time, str):
        start_time = datetime.datetime.fromisoformat(start_time)
    in_window = start_time < date_threshold_max and start_time > date_threshold_min
    if in_window and batch_position > 0:
        test2_indices.append(idx)

# Keep only the first match from the date window and add four hand-picked curves.
test2_indices = test2_indices[:1] + [47, 17, 116, 16]

# Persist features and metadata of the selected curves.
test2_data = eng.analyses.get_features(test2_indices)
test2_metadata = eng.analyses.get_metadata(test2_indices)
test2_data.to_csv("dev/test2_features.csv", index=False)
test2_metadata.to_csv("dev/test2_metadata.csv", index=False)
print("Number of test curves: ", len(test2_indices))

# The exported image has no legend; the legend is switched back on for the inline view.
fig_test2 = eng.analyses.plot_pressure_curves(indices=test2_indices)
fig_test2.update_layout(showlegend=False)
fig_test2.write_image("dev/figures/fig_test2.png", width=1100, height=350, scale=3)
fig_test2.update_layout(showlegend=True)
fig_test2.show()

Number of test curves:  5


In [11]:
# Features of the second test set. Note that `fig_test2` is rebound here and no
# longer refers to the pressure-curve figure.
fig_test2 = eng.analyses.plot_features(indices=test2_indices)
# Export without a legend, then switch it back on for the inline view.
fig_test2.update_layout(showlegend=False)
fig_test2.write_image(
    "dev/figures/fig_test2_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_test2.update_layout(showlegend=True)
fig_test2.show()

In [12]:

# Hand-picked set of curves (distinct from `test2_indices` defined in another cell).
test_2_indices = [21, 116, 117, 153]

# Persist features and metadata of the selected curves.
test_2_data = eng.analyses.get_features(test_2_indices)
test_2_metadata = eng.analyses.get_metadata(test_2_indices)
test_2_data.to_csv("dev/test_2_features.csv", index=False)
test_2_metadata.to_csv("dev/test_2_metadata.csv", index=False)

# The exported image has no legend; the legend is switched back on for the inline view.
fig_text_2_curves = eng.analyses.plot_pressure_curves(indices=test_2_indices)
fig_text_2_curves.update_layout(showlegend=False)
fig_text_2_curves.write_image(
    "dev/figures/fig_text_2_curves.png",
    width=1000,
    height=350,
    scale=3,
)
fig_text_2_curves.update_layout(showlegend=True)
fig_text_2_curves.show()

In [13]:
# Feature plot for the hand-picked `test_2_indices`; as the cell's final
# expression, the returned figure is what the notebook displays.
eng.analyses.plot_features(indices = test_2_indices)

In [14]:
# Raw feature plot for the hand-picked `test_2_indices`.
fig_test2_features_raw_no_fourier = eng.analyses.plot_features_raw(indices=test_2_indices)
# Export without a legend, then switch it back on for the inline view.
fig_test2_features_raw_no_fourier.update_layout(showlegend=False)
fig_test2_features_raw_no_fourier.write_image(
    "dev/figures/fig_test2_features_raw_no_fourier.png",
    width=1100,
    height=350,
    scale=3,
)
fig_test2_features_raw_no_fourier.update_layout(showlegend=True)
fig_test2_features_raw_no_fourier.show()

In [15]:
import random
# Draw a reproducible random sample from the curves not used by any earlier set.
# A fixed seed plus a sorted candidate list make "Restart & Run All" pick the
# same curves every time; an unseeded sample over set-ordered data does not.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10
used_indices = set(train_indices) | set(test_indices) | set(test2_indices) | set(test_2_indices)
rest_indices = sorted(set(method_indices) - used_indices)
# Never ask for more curves than are left, which would raise ValueError.
test_source_indices = random.Random(SAMPLE_SEED).sample(rest_indices, min(SAMPLE_SIZE, len(rest_indices)))
print("New test curve indices: ", test_source_indices)

New test curve indices:  [238, 162, 154, 236, 150, 273, 247, 158, 234, 278]


In [16]:
# Third test set: the randomly sampled curves.
test3_indices = test_source_indices

# Persist features and metadata of the sampled curves.
test3_data = eng.analyses.get_features(test3_indices)
test3_metadata = eng.analyses.get_metadata(test3_indices)
test3_data.to_csv("dev/test3_features.csv", index=False)
test3_metadata.to_csv("dev/test3_metadata.csv", index=False)

# Pressure curves: exported without a legend, shown inline with one.
fig_text3_curves = eng.analyses.plot_pressure_curves(indices=test3_indices)
fig_text3_curves.update_layout(showlegend=False)
fig_text3_curves.write_image(
    "dev/figures/fig_text3_curves.png",
    width=1000,
    height=350,
    scale=3,
)
fig_text3_curves.update_layout(showlegend=True)
fig_text3_curves.show()

# Features: same export-then-show sequence.
fig_text3_features = eng.analyses.plot_features(indices=test3_indices)
fig_text3_features.update_layout(showlegend=False)
fig_text3_features.write_image(
    "dev/figures/fig_text3_features.png",
    width=1100,
    height=350,
    scale=3,
)
fig_text3_features.update_layout(showlegend=True)
fig_text3_features.show()