In [None]:
# %load ../source_stability.py
""" For our analysis we distinguished the source operation into two
modes: stable operation and unstable operation. 

We make this distinction based on rolling windows over the BCT25 current's
mean and variance. If the mean falls below a minimum or the variance exceeds
a maximum threshold, we consider the current as unstable.
"""

import pandas as pd
import numpy as np


def calculate_source_running(source_current):
    """ Flag the samples at which the source was running.

    The source counts as running wherever `source_current` (typically the
    BCT05 current) is strictly above 0.004mA.

    Parameters:
        source_current (Series): The current values to check (typically BCT05)

    Returns:
        np.ndarray: Boolean array with one entry per input value, `True` where the current was above 0.004mA and `False` elsewhere.
    """

    running_threshold = 0.004
    above_threshold = source_current > running_threshold

    # Start from "not running" everywhere and switch on the matching samples.
    is_running = np.zeros(source_current.size, dtype=bool)
    is_running[above_threshold] = True
    return is_running


def stability_mean_variance_classification(
    df,
    value_column,
    weight_column,
    sliding_window_size_mean=500,
    sliding_window_size_std=1000,
    minimum_mean=0.025,
    maximum_variance=0.00005,
):
    """ Classifies all points in the data frame into the categories source stable/unstable, based on a weighted rolling mean and a weighted rolling variance.

    A point is stable if the weighted mean of the preceding mean-window is
    above `minimum_mean` AND the weighted variance of the preceding
    variance-window is below `maximum_variance`. Both windows are time based
    and exclude the point itself (`closed="left"`), so the index of `df` has
    to support time-offset rolling windows (e.g. a sorted DatetimeIndex).

    The input frame is not modified.

    Parameters:
        df (DataFrame): The data input loaded as a DataFrame
        value_column (string): name of the column that contains the beam current we are interested in, typically BCT25
        weight_column (string): name of the column that contains the weight of every data point
        sliding_window_size_mean (int): size in seconds of the sliding window used for the weighted mean
        sliding_window_size_std (int): size in seconds of the sliding window used for the weighted variance
        minimum_mean (double): minimal intensity of the beam in the sliding window for it to be considered stable
        maximum_variance (double): maximum variance of intensity of the beam in the sliding window for it to be considered stable

    Returns:
        Series: A series named "result" that for every data point indicates if the source was running stable or not (1 is stable, 0 is unstable)
    """

    values = df[value_column]
    weights = df[weight_column]

    mean_window = "{}s".format(sliding_window_size_mean)
    variance_window = "{}s".format(sliding_window_size_std)

    # Work on temporary frames instead of adding helper columns to `df`, so
    # the caller's frame (and any column it already has) stays untouched.
    mean_sums = (
        pd.DataFrame({"wvalue": values * weights, "weight": weights})
        .rolling(mean_window, closed="left")
        .sum()
    )
    wmean = mean_sums["wvalue"] / mean_sums["weight"]

    # Weighted squared deviation of every point from the rolling mean that
    # precedes it, summed over the (separately sized) variance window.
    variance_sums = (
        pd.DataFrame(
            {"wdeviation": weights * (values - wmean) ** 2, "weight": weights}
        )
        .rolling(variance_window, closed="left")
        .sum()
    )
    wvar = variance_sums["wdeviation"] / (variance_sums["weight"] - 1)

    # Comparisons against NaN (e.g. empty leading windows) are False, hence
    # those points are classified as unstable.
    is_stable = (wmean > minimum_mean) & (wvar < maximum_variance)
    result = is_stable.astype("int64")
    result.name = "result"
    return result


In [None]:
# %load ../voltage_breakdowns.py
""" Sometimes, the high voltage of the source can break down.
This disturbs the beam and should be avoided. Hence we wrote
the `voltage_breakdowns.py` module to count how often it happens. Furthermore, as during
these breakdowns the HT current makes a spark, we want to exclude
data points that belong to a breakdown from the analysis, to not
induce noise into the results.

During a spark the following happens: First, the voltage breaks down,
from around 20000V during operation to <1000V. This can also be seen in
the HT current, that typically rapidly drops to zero A, shoots up to three A,
and then normalizes again. Shortly after this is registered by the system,
the extraction solenoid current is ramped down to around 850A.

The `voltage_breakdowns.py` module provides two tools: 

1. The first one, `detect_breakdowns` finds
periods where the HT current variance exceeds a threshold in a short window.
Typically, the current has a low variance, and hence the sparks above can be
found reliably with this method. It marks the whole window as a breakdown,
so that all these data points can be ignored in the future analysis.

2. The second one, `detect_sparks`, detects where exactly the voltage
broke down. If two breakdowns happened shortly after each other, method 1
would count only one breakdown, but we are interested in the exact number.
This method counts the local minima of the HT voltage that are below a
certain threshold.

It is important to understand the difference between sparks and breakdowns. For
this library a breakdown always marks a (short) window of very high instability of the
HT current, while a spark is a precise moment in time where the voltage tripped.
"""

import pandas as pd
import numpy as np

from scipy import signal


def classify_using_var_threshold(values, threshold):
    """ Tell whether the variance of the given values reaches a threshold

    Parameters:
        values (np.ndarray): Values for which the variance is to be computed.
        threshold (float): Threshold against which the variance is checked.

    Returns:
        int: `1` if the variance is greater than or equal to the threshold, `0` otherwise.
    """

    reaches_threshold = np.var(values) >= threshold
    return 1 if reaches_threshold else 0


def detect_breakdowns(df, ht_current_column, window_size=40, threshold=0.5):
    """ Detection of high voltage breakdowns based on the variance of the HT current exceeding a certain threshold that has to be determined by experiments.

    A window of `window_size` consecutive samples is slid over the column one
    sample at a time. Every window whose variance reaches `threshold` is
    marked as belonging to a breakdown. Overlapping marked windows are merged
    into one breakdown that carries the timestamp of its first window.

    Parameters:
        df (DataFrame): The frame containing the data. Its index is converted to int64 and scaled by 1e-9 to obtain the timestamps (i.e. seconds for a nanosecond DatetimeIndex).
        ht_current_column (string): High voltage current, typically this should be 'IP.NSRCGEN:SOURCEHTAQNI'
        window_size (int): Size of the rolling window in samples. Once a breakdown is detected, every value in this window will be marked.
        threshold (double): Threshold for the variance.

    Returns:
        np.array: For each data point that lies inside of a breakdown window, this array contains the timestamp of the start of the window,
        otherwise it is zero. So for each value greater than zero, all data points with the same value were in the same breakdown window.

    Raises:
        ValueError: If `ht_current_column` is not a column of `df`.
    """

    if ht_current_column not in df:
        raise ValueError("Error: The column cannot be found in the dataframe.")

    values = df[ht_current_column].values
    times = (df.index.astype("int64") * 1e-9).values
    result = np.zeros(len(values))

    current_breakdown = 0
    # `+ 1` so that the last full window (the one ending on the final sample)
    # is examined as well; without it a breakdown at the very end of the data
    # would never be detected.
    for start in range(len(values) - window_size + 1):
        stop = start + window_size
        if not classify_using_var_threshold(values[start:stop], threshold):
            continue

        # An unmarked start sample means this window does not overlap the
        # previous breakdown, so a new breakdown begins here.
        if not result[start]:
            current_breakdown = times[start]

        result[start:stop] = current_breakdown

    return result


def detect_sparks(ht_voltage, breakdowns, threshold=1000):
    """ Locate all sparks, i.e. the downward peaks of the HT voltage that fall below a certain threshold.

    Parameters
    ----------
        ht_voltage (Series): The HT voltage. Its index is converted to int64 and scaled by 1e-9 to obtain the timestamps.
        breakdowns (np.array): An array where the breakdown windows are marked (output of `detect_breakdowns`).
                            Only peaks in these windows are counted as sparks.
        threshold (float): Maximum value of the HT voltage for a downward peak to be counted as a spark.
                            The peak additionally needs a prominence of at least half of this value.

    Returns
    -------
        np.array: An int64 array that holds, at each point where a spark occurred, the spark's timestamp
        (truncated to an integer), otherwise zero.
    """

    outside_breakdown = breakdowns == 0

    # Work on a copy so the caller's series is left alone, and lift everything
    # outside of a breakdown window just above the threshold so that it can
    # never qualify as a spark.
    masked_voltage = ht_voltage.copy()
    masked_voltage[outside_breakdown] = threshold + 1

    timestamps = (masked_voltage.index.astype("int64") * 1e-9).values

    # Minima of the voltage are maxima of the negated voltage.
    inverted = -masked_voltage.values
    spark_positions, _ = signal.find_peaks(
        inverted, height=-threshold, prominence=threshold / 2
    )

    sparks = np.zeros(len(masked_voltage.index), dtype="int64")
    sparks[spark_positions] = timestamps[spark_positions]
    return sparks
