In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
import os
import glob
from collections import defaultdict

In [9]:
sample_rate = 100 # samples/s
chunk_seconds = 1
chunk_size = chunk_seconds * sample_rate

In [10]:
#data_dir = os.path.expanduser("~/Downloads/Clinic/tarsalis_data_clean/")
data_dir = os.path.expanduser("../../Data Cleaning/aegypti_data_clean")
os.chdir(data_dir)
filenames = glob.glob('*.csv')

In [11]:
dataframes = [pd.read_csv(f) for f in filenames]

In [12]:
%matplotlib qt

QSocketNotifier: Can only be used with threads started with QThread


In [14]:
dataframes[0]

Unnamed: 0.1,Unnamed: 0,time,pre_rect,post_rect,labels
0,0,0.00,-0.062256,0.466309,NP
1,1,0.01,-0.075684,0.466309,NP
2,2,0.02,-0.061035,0.463867,NP
3,3,0.03,-0.061035,0.457764,NP
4,4,0.04,-0.063477,0.452881,NP
...,...,...,...,...,...
227528,227528,2275.28,-0.001221,-0.001221,NP
227529,227529,2275.29,-0.001221,-0.001221,NP
227530,227530,2275.30,-0.001221,-0.001221,NP
227531,227531,2275.31,-0.001221,-0.001221,NP


In [16]:
transformed_dataframes = []

#df = dataframes[0]
for ii, df in enumerate(dataframes):

    print(ii, end=' ')
    
    num_chunks = len(df) // chunk_size
    chunks = np.array_split(df[:num_chunks*chunk_size], num_chunks)

    columns = defaultdict(list)

    #chunk = chunks[0]
    for chunk in chunks:

        # start at 1 to ignore the DC component
        chunk_fft = np.abs(fft(chunk["pre_rect"].values))[1:chunk_size//2]
        chunk_freqs = fftfreq(chunk_size, 1 / sample_rate)[1:chunk_size//2]
        
        # https://stackoverflow.com/a/20826735
        # TODO: maybe do peak extraction on FFT instead of just indices of max vals
        # Extract indices of frequencies with greatest amplitude
        num_largest = 6
        indices = (-chunk_fft).argpartition(num_largest, axis=None)[:num_largest]

        # Sort indices based on amplitudes of corresponding frequencies
        indices = sorted(indices, key=lambda x: chunk_fft[x], reverse=True)
        
        # Get the frequencies
        peak_freqs = chunk_freqs[indices]
        
        #print(peak_freqs)
        #plt.plot(chunk_freqs, chunk_fft, '.-')
        #plt.grid()
        #for i in indices:
        #    plt.axvline(chunk_freqs[i])

        labels, label_counts = np.unique(chunk["labels"], return_counts=True)
        label = labels[np.argmax(label_counts)]

        for i in range(num_largest):
            columns[f"F{i}"].append(peak_freqs[i])
        columns["mean"].append(np.mean(chunk["pre_rect"]))
        columns["std"].append(np.std(chunk["pre_rect"]))
        #columns["resistance"].append(chunk["resistance"].values[0])
        # AeA data has no voltage column
        if "voltage" in chunk.columns:
            columns["volts"].append(chunk["voltage"].values[0])
        columns["time"].append(chunk["time"].values[0])
        # AeA data has no current column
        if "current" in chunk.columns:
            columns["current"].append(0 if chunk["current"].values[0] == "AC" else 1)
        columns["label"].append(label)

    # lookbehind stuff
    '''
    n_lookbehind = 5
    lookbehinds = range(1, n_lookbehind + 1)
    #lookbehinds = [1, 10, 20]

    look_columns = \
        [f"F{i}" for i in range(num_largest)] + \
        ["mean", "std", "volts", "current"]
        #["time"]
        #["label"]
    #for i in range(num_largest):
    for c in look_columns:
        for j in lookbehinds:
            #col = columns[f"F{i}"]
            col = columns[c]
            arr = np.append(np.zeros(j), col[:-j])
            assert len(col) == len(arr)
            columns[f"{c}-{j}"] = arr
    '''

    df_out = pd.DataFrame(columns)

    transformed_dataframes.append(df_out)

#df_out

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [23]:
test_data = transformed_dataframes[:8]
train_data = transformed_dataframes[8:]
test = pd.concat(test_data)
train = pd.concat(train_data)

X_test = test.drop(["label"], axis=1)
Y_test = test["label"]
X_train = train.drop(["label"], axis=1)
Y_train = train["label"]

#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=190)
model = RandomForestClassifier(n_estimators=100, random_state=41, class_weight="balanced")
model.fit(X_train, Y_train)
predicted = model.score(X_test, Y_test)
#scores

In [24]:
#scores.mean(), scores.std()
predicted

0.857998244980127

In [29]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(model, X_test, Y_test)
plt.title("Random Forest Classifier Confusion Matrix")

Text(0.5, 1.0, 'Random Forest Classifier Confusion Matrix')

Make a nice plot so we can see how the classifier was classifying things

In [26]:
Y_test.unique()

array(['NP', 'J', 'K', 'L', 'M', 'N'], dtype=object)

In [12]:
assert False

AssertionError: 

In [30]:
label_to_color = {
    "NP": "red",
    "J": "blue",
    "K": "green",
    "L": "purple",
    "M": "pink",
    "N": "cyan",
    "Z": "orange"
}

def plot_labels(time, voltage, true_labels, pred_labels, probs = None):
    """
    plot_labels produced a matplotlib figure containing three subplots
        that visualize a waveform along with the true and predicted labels
    Input:
        time: a series of time values
        voltage: a time series of voltage values from the waveform
        true_labels: a time series of the true label for each time point
        pred_labels: a time series of the predicted labels for each time point
    Output:
        (fig, axs): a tuple
    """
    fig, axs = plt.subplots(4 if len(probs) > 0 else 3, 1, sharex = True)
    recording = 1
    fill_min, fill_max = voltage.min(), voltage.max()
    
    # First plot will be the true labels
    axs[0].plot(time, voltage, color = "black")
    for label, color in label_to_color.items():
        fill = axs[0].fill_between(time, fill_min, fill_max, 
                where = (true_labels == label), color=color, alpha = 0.5)
        fill.set_label(label)
    axs[0].legend(bbox_to_anchor=(0.5, 1), 
                  bbox_transform=fig.transFigure, loc="upper center", ncol=9)
    axs[0].set_title("True Labels")
    # Second plot will be the predicted labels
    axs[1].plot(time, voltage, color = "black")
    for label, color in label_to_color.items():
        axs[1].fill_between(time[:len(pred_labels)], fill_min, fill_max, 
                where = (pred_labels == label), color=color, alpha = 0.5)
    axs[1].set_title("Predicted Labels")
    # Third plot will be marked where there is a difference between the two
    axs[2].plot(time, voltage, color = "black")
    axs[2].fill_between(time[:len(pred_labels)], fill_min, fill_max, 
            where = (pred_labels != true_labels[:len(pred_labels)]), color = "gray", alpha = 0.5)
    axs[2].set_title("Incorrect Labels")
    if len(probs) > 0:
        # Fourth plot with confidences
        axs[3].plot(time[:len(probs)], probs)
        axs[3].set_title("Confidence")
        axs[3].set_ylim([0, 1])
        # Axes titles and such
    fig.supxlabel("Time (s)")
    fig.supylabel("Volts")
    fig.tight_layout()
    
for i in range(8):
    recording = i
    df = dataframes[recording]
    transformed_df = test_data[recording]
    pred_labels = model.predict(transformed_df.drop(["label"], axis=1)).repeat(sample_rate)
    probs = model.predict_proba(transformed_df.drop(["label"], axis=1)).max(axis=1).repeat(sample_rate)
    plot_labels(df["time"], df["pre_rect"], df["labels"], pred_labels, probs)
    plt.show(block=True)

qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()
qt.qpa.wayland: Wayland does not support QWindow::requestActivate()


KeyboardInterrupt: 

QSocketNotifier: Invalid socket 75 and type 'Read', disabling...
