# Cleaning function to generate windows free of corrupted signals

In [None]:
# returns the total number of samples that belong to runs of at least min_const_num successive constant values in the given signals

# num_sig = number of signals to consider
# min_const_num = minimum number of successive constant samples to consider a signal corrupted
def count_constants(signal_arr, num_sig, min_const_num=MIN_CONST_NUM):
    """Count the samples that lie in long runs of identical consecutive values.

    Each of the first ``num_sig`` columns of ``signal_arr`` is scanned from top
    to bottom. A run of equal successive values contributes its full length to
    the result only when it is at least ``min_const_num`` samples long.

    Args:
        signal_arr: 2-D array indexed as ``signal_arr[row, col]``.
        num_sig: number of leading columns (signals) to inspect.
        min_const_num: minimum run length for a run to be counted.

    Returns:
        The summed length of all qualifying runs over the inspected columns.
    """
    def _counted(run_length):
        # Short runs are treated as genuine signal and contribute nothing.
        return run_length if run_length >= min_const_num else 0

    total = 0
    for sig_idx in range(num_sig):
        previous = signal_arr[0][sig_idx]
        column = signal_arr[:, sig_idx]
        # Starts at 0 because the first sample is compared with itself below.
        run_length = 0
        for sample in column:
            if sample == previous:
                run_length += 1
                continue
            # The value changed: close the current run and open a new one.
            total += _counted(run_length)
            previous = sample
            run_length = 1
        # Close the run that reaches the end of the column.
        total += _counted(run_length)
    return total

In [None]:
# df = DataFrame from which to build sequences
# window = number of elements to give as input to the network (intended as number of different instants)
# target_labels = columns of df that the network is expected to predict
# telescope = number of elements the network is expected to output (intended as number of different instants)
# stride = distance between the beginnings of two consecutive windows
# const_threshold = maximum acceptable fraction of corrupted signal in the values to be predicted
# adjust =  if True: makes windows fit exactly by discarding the first samples of the dataset
#           else: expects parameters to fit with no check
def build_sequences(df,
                    window=WINDOW_SIZE,
                    target_labels=TARGET_LABELS,
                    telescope=TELESCOPE,
                    stride=STRIDE,
                    const_threshold=CONST_THRESHOLD,
                    adjust=False):
    """Slice ``df`` into (input window, target telescope) pairs.

    Windows start at rows ``0, stride, 2 * stride, ...``. Each pair consists of
    ``window`` consecutive rows of ``df`` (all columns) followed by the next
    ``telescope`` rows restricted to ``target_labels``. A pair is kept when the
    fraction of target samples that ``count_constants`` reports as constant is
    at most ``const_threshold``; otherwise it goes to the discarded outputs.

    Args:
        df: DataFrame from which to build the sequences.
        window: number of rows given as input to the network.
        target_labels: columns of ``df`` to be predicted.
        telescope: number of rows the network is expected to output.
        stride: distance between the starts of two consecutive windows.
        const_threshold: maximum accepted fraction of constant (corrupted)
            samples in the values to be predicted.
        adjust: if True, the first rows of ``df`` are dropped so that the last
            window ends exactly on the last row. If False, no fit check is
            made and trailing rows that do not fill a whole stride are unused.

    Returns:
        A tuple ``(dataset, labels, discarded_win, discarded_lab)`` of numpy
        arrays built with ``np.array`` from the kept input windows, the kept
        targets, the rejected input windows and the rejected targets. An
        output with no entries is an empty array.

    Raises:
        AssertionError: if ``df`` has fewer than ``window + telescope`` rows.
    """
    # Check that at least one window can be created
    tot_win_len = window + telescope
    assert tot_win_len <= len(df), "df is shorter than window + telescope"

    if adjust:
        # Leading rows that do not fit with the specified parameters.
        waste = (len(df) - tot_win_len) % stride
        df = df.iloc[waste:]
        # Sanity checks on the trimmed frame: it fits exactly and still holds
        # at least one full window (the original compared in the wrong
        # direction and failed as soon as more than one window fit).
        assert (len(df) - tot_win_len) % stride == 0
        assert tot_win_len <= len(df)

    # No defensive copy needed: the slices taken below are only read, and
    # np.array() copies them into fresh arrays at the end.
    temp_df = df.values
    temp_label = df[target_labels].values

    num_targets = len(target_labels)
    samples_per_target = telescope * num_targets

    dataset, labels = [], []
    discarded_win, discarded_lab = [], []

    # "+ 1" keeps the last window that fits (the one starting at
    # len - tot_win_len); np.arange excludes its stop value.
    last_start = len(temp_df) - tot_win_len
    for idx in tqdm(np.arange(0, last_start + 1, stride)):
        split = idx + window
        input_win = temp_df[idx:split]
        target_win = temp_label[split:split + telescope]

        const_count = count_constants(target_win, num_targets)

        if const_count / samples_per_target <= const_threshold:
            dataset.append(input_win)
            labels.append(target_win)
        else:
            discarded_win.append(input_win)
            discarded_lab.append(target_win)

    dataset = np.array(dataset)
    labels = np.array(labels)
    discarded_win = np.array(discarded_win)
    discarded_lab = np.array(discarded_lab)
    return dataset, labels, discarded_win, discarded_lab