## Extracting different feature examples

This section outlines some code snippets showing how you can extract some slightly different features.
The first shows how you can "resample" the signals so that they all fit into the same number of timesteps
by stretching or compressing them, much like you can resize an image.

In [None]:
from keras.layers import Dense, Conv1D, MaxPool1D, Flatten
from keras.preprocessing import sequence
from keras.models import Sequential
from scipy.signal import resample  # New line
from helpers import train_to_id5
from helpers import load_dataset
import matplotlib.pyplot as plt
import numpy as np


# Number of time steps you want every time-series to have; tweak this to experiment
input_length = 10_000


# A function to extract the values we need as input and output for the model training
# Note: You can make changes here to look at different features
def extract_features(signals, train_types):
    """Turn raw signals and train types into model inputs and targets.

    Every signal is resampled to ``input_length`` time steps, so long signals
    are compressed and short ones are stretched rather than cut or padded.

    Args:
        signals: Iterable of signals, one per recording (presumably 1-D).
        train_types: Iterable of train types, aligned with ``signals``.

    Returns:
        A ``(model_input, model_target)`` tuple of arrays: the resampled
        signals as float32 column vectors, and the targets produced by
        ``train_to_id5``.
    """
    model_input = []
    model_target = []

    # Iterate over all signals and corresponding train types
    for signal, train_type in zip(signals, train_types):

        # Stretch or compress the signal to exactly input_length time steps
        signal = resample(signal, input_length)
        input_vector = np.reshape(signal, (-1, 1))  # special case if you have only 1 time series

        # Convert train type to number
        target = train_to_id5(train_type)

        # Add to dataset to be fed to a machine learning algorithm
        model_input.append(input_vector)
        model_target.append(target)

    # Convert to a more digestible format and return the data.
    # resample() returns floating point values, while pad_sequences defaults to
    # dtype='int32' and would silently truncate them, so ask for floats explicitly.
    model_input = sequence.pad_sequences(model_input, input_length, dtype='float32')
    model_target = np.array(model_target)
    return model_input, model_target


# Load the data
training_x, training_y = load_dataset(dataset='training')
validate_x, validate_y = load_dataset(dataset='validate')

# Transform the data / extract features
training_x, training_y = extract_features(training_x, training_y)
validate_x, validate_y = extract_features(validate_x, validate_y)


# Build a Convolutional Neural Network: two convolution + pooling stages followed by a
# softmax layer with 5 outputs
model = Sequential()
model.add(Conv1D(filters=8, kernel_size=5, padding='valid', input_shape=training_x.shape[1:]))
model.add(MaxPool1D(2))
model.add(Conv1D(filters=8, kernel_size=5, padding='valid'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(units=5, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])


# Fit a model to the data. Note fewer epochs are needed here.
# validation_data is passed as a tuple (x, y), which is the form documented by Keras;
# a list may be mistaken for multiple model inputs by some versions.
logger = model.fit(training_x, training_y, epochs=50, batch_size=32, validation_data=(validate_x, validate_y))

# Visualize the fitting process to learn about how good the model likely is
plt.title('Accuracy over epochs')
plt.plot(logger.history['accuracy'])
plt.plot(logger.history['val_accuracy'])
plt.legend(['training accuracy', 'validation accuracy'])
plt.xlabel('Epochs')
plt.tight_layout()
plt.show()


The following snippet shows how you can extract a "periodogram" from the signal. This is a type of transformation which tells you which frequencies are present in the vibration.

In [None]:
from keras.layers import Dense, Conv1D, MaxPool1D, Flatten
from keras.preprocessing import sequence
from keras.models import Sequential
from scipy.signal import periodogram  # New line
from helpers import train_to_id5
from helpers import load_dataset
import matplotlib.pyplot as plt
import numpy as np


# Number of time steps you want every time-series to have; tweak this to experiment
input_length = 10_000


# A function to extract the values we need as input and output for the model training
# Note: You can make changes here to look at different features
def extract_features(signals, train_types):
    """Turn raw signals and train types into periodogram inputs and targets.

    All signals are first brought to ``input_length * 2`` samples and are then
    replaced by their periodogram (the second value returned by
    ``scipy.signal.periodogram``).

    Args:
        signals: Iterable of signals, one per recording (presumably 1-D).
        train_types: Iterable of train types, aligned with ``signals``.

    Returns:
        A ``(model_input, model_target)`` tuple of arrays: one periodogram
        column vector per signal, and the targets produced by
        ``train_to_id5``.
    """
    model_input = []
    model_target = []

    # Needs to be done before iterating over the signals in this case, so that all
    # periodograms end up with the same number of values.
    # pad_sequences defaults to dtype='int32', which would silently truncate floating
    # point samples, so keep the samples as floats.
    signals = sequence.pad_sequences(signals, input_length * 2, dtype='float64')

    # Iterate over all signals and corresponding train types
    for signal, train_type in zip(signals, train_types):

        # Replace the signal by its periodogram (index 0 holds the frequencies, index 1 the values)
        spectrum = periodogram(signal)[1]
        input_vector = np.reshape(spectrum, (-1, 1))  # special case if you have only 1 time series

        # Convert train type to number
        target = train_to_id5(train_type)

        # Add to dataset to be fed to a machine learning algorithm
        model_input.append(input_vector)
        model_target.append(target)

    # Convert to a more digestible format and return the data
    model_input = np.array(model_input)
    model_target = np.array(model_target)
    return model_input, model_target


# Load the data
training_x, training_y = load_dataset(dataset='training')
validate_x, validate_y = load_dataset(dataset='validate')

# Transform the data / extract features
training_x, training_y = extract_features(training_x, training_y)
validate_x, validate_y = extract_features(validate_x, validate_y)


# Build a Convolutional Neural Network: two convolution + pooling stages followed by a
# softmax layer with 5 outputs
model = Sequential()
model.add(Conv1D(filters=8, kernel_size=5, padding='valid', input_shape=training_x.shape[1:]))
model.add(MaxPool1D(2))
model.add(Conv1D(filters=8, kernel_size=5, padding='valid'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(units=5, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])


# Fit a model to the data. Note fewer epochs are needed here.
# validation_data is passed as a tuple (x, y), which is the form documented by Keras;
# a list may be mistaken for multiple model inputs by some versions.
logger = model.fit(training_x, training_y, epochs=50, batch_size=32, validation_data=(validate_x, validate_y))

# Visualize the fitting process to learn about how good the model likely is
plt.title('Accuracy over epochs')
plt.plot(logger.history['accuracy'])
plt.plot(logger.history['val_accuracy'])
plt.legend(['training accuracy', 'validation accuracy'])
plt.xlabel('Epochs')
plt.tight_layout()
plt.show()


In [None]:
from keras.layers import Dense
from keras.models import Sequential
from helpers import train_to_id5
from helpers import load_dataset


# A function to extract the values we need as input and output for the model training
# Note: You can make changes here to look at different features
def extract_features(signals, train_types):
    """Summarise every signal as a short vector of statistics plus its target.

    Several candidate statistics are computed for each signal; only the ones
    listed in ``input_vector`` are actually handed to the model, the others
    are suggestions you can swap in.

    Args:
        signals: Iterable of signals, one per recording.
        train_types: Iterable of train types, aligned with ``signals``.

    Returns:
        A ``(model_input, model_target)`` tuple of arrays: one feature row
        per signal, and the targets produced by ``train_to_id5``.
    """
    feature_rows = []
    target_rows = []

    for samples, label in zip(signals, train_types):
        # Candidate features; pick and mix any of these below.
        # NOTE: the rms window is empty for signals shorter than 2000 samples,
        # in which case np.mean yields nan.
        rms_sub = np.sqrt(np.mean(np.square(samples[2000:6000])))  # rms over a subset of the signal only
        signal_mean = np.mean(samples)
        signal_abs_mean = np.mean(np.abs(samples))
        signal_median = np.median(samples)
        signal_abs_median = np.median(np.abs(samples))
        quantile_25 = np.percentile(samples, 25)
        quantile_75 = np.percentile(samples, 75)
        length = len(samples)

        # The data point for this signal: any combination of the values above works
        input_vector = [rms_sub, signal_mean, signal_abs_mean, length]

        # Store the data point together with the train type converted to a number
        feature_rows.append(input_vector)
        target_rows.append(train_to_id5(label))

    # Hand the data back as arrays, which is what the model expects
    return np.array(feature_rows), np.array(target_rows)


# Load the data
training_x, training_y = load_dataset(dataset='training')
validate_x, validate_y = load_dataset(dataset='validate')

# Transform the data / extract features
training_x, training_y = extract_features(training_x, training_y)
validate_x, validate_y = extract_features(validate_x, validate_y)

# Build a simple Neural Network
# NOTE: the first two Dense layers are given no activation, so they stay linear
model = Sequential()
model.add(Dense(units=5, input_dim=training_x.shape[1]))
model.add(Dense(units=10))
model.add(Dense(units=5, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

# Apply the data and the train types and have the algorithm fit a model from x to y.
# validation_data is passed as a tuple (x, y), which is the form documented by Keras;
# a list may be mistaken for multiple model inputs by some versions.
logger = model.fit(training_x, training_y, epochs=250, batch_size=32, validation_data=(validate_x, validate_y))

# You can add a filter after getting the model predictions to augment what it does and then validate the results
predicted_y = model.predict(validate_x)
correct = 0
for features, prediction, target in zip(validate_x, predicted_y, validate_y):
    # features[3] is the signal length, assuming no changes were made to the feature vector above.
    # Keep the index number in mind for easy access and additional checks if you find good filters
    if features[3] < 50000:
        # Shorter than 50000 timesteps: compare model output with model target
        if np.argmax(prediction) == np.argmax(target):
            correct += 1
    elif np.sum(target) == 0:
        # Longer signals are treated as unknown: verify that the model target is also an unknown train type
        # NOTE(review): targets from train_to_id5 presumably always contain a 1 (unknown is its own
        # class), in which case this sum is never 0 - confirm against the helper
        correct += 1

# Validation accuracy after filtering
filtered_accuracy = 100.0 * correct / len(predicted_y)
print('True validation accuracy: %.2f%%' % filtered_accuracy)


# Visualize the fitting process to learn about how good the model likely is
# Note, validation accuracy is the metric of how good the model probably is, while training accuracy shows
# how quickly the algorithm found a way to map the input to the desired output
plt.title('Accuracy over epochs')
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(logger.history[metric])
plt.legend(['training accuracy', 'validation accuracy'])
plt.xlabel('Epochs')
plt.tight_layout()
plt.show()


## Changing the model target

Until now, we have been treating "unknown" trains as their own train type which we try to classify. We can, however, also try to classify only the 4 relevant types of trains and treat the absence of all of them as an "unknown" type.
This can be achieved by using the helper function "train_to_id4" and changing a couple of things in the model. In particular, we'll use "binary_crossentropy" instead of "categorical_crossentropy", which means we try to determine, for each train type separately, how likely it is that the signal is that particular train type, so it is theoretically possible for the model to think that a signal is 2 types of trains at the same time. This also changes the validation accuracy reported during training a bit, since it now checks the accuracy over 4 outputs instead of one per sample, i.e. if 3 of the 4 output values say it is not this train type and the last one says it is this one, then even if it got the train type wrong it would still be 50% correct (2 correct "not this type" and 1 incorrect "not this type" as well as 1 incorrect "it is this type"). Feel free to ask me for more details about this.


In [None]:
from keras.layers import Dense, Conv1D, MaxPool1D, Flatten
from keras.models import Sequential
from helpers import train_to_id4
from helpers import load_dataset
import matplotlib.pyplot as plt
import numpy as np


# Number of time steps you want every time-series to have; tweak this to experiment
input_length = 10_000


# A function to extract the values we need as input and output for the model training
# Note: You can make changes here to look at different features
def extract_features(signals, train_types):
    """Turn raw signals and train types into equally long inputs and targets.

    Every signal is brought to exactly ``input_length`` time steps: longer
    signals keep only their last ``input_length`` samples and shorter ones
    are zero-padded at the front. Signals of different lengths could not be
    stacked into a single array otherwise.

    Args:
        signals: Iterable of signals, one per recording (presumably 1-D).
        train_types: Iterable of train types, aligned with ``signals``.

    Returns:
        A ``(model_input, model_target)`` tuple of arrays: one float32 column
        vector of ``input_length`` values per signal, and the targets
        produced by ``train_to_id4``.
    """
    model_input = []
    model_target = []

    # Iterate over all signals and corresponding train types
    for signal, train_type in zip(signals, train_types):

        # Make the signal exactly input_length steps long (assumes input_length > 0)
        samples = np.asarray(signal, dtype='float32').reshape(-1)
        fixed = np.zeros(input_length, dtype='float32')
        kept = samples[-input_length:]
        if kept.size:  # fixed[-0:] would address the whole array, so skip empty signals
            fixed[-kept.size:] = kept
        input_vector = np.reshape(fixed, (-1, 1))  # special case if you have only 1 time series

        # Convert train type to number
        target = train_to_id4(train_type)

        # Add to dataset to be fed to a machine learning algorithm
        model_input.append(input_vector)
        model_target.append(target)

    # Convert to a more digestible format and return the data
    model_input = np.array(model_input)
    model_target = np.array(model_target)
    return model_input, model_target


# Load the data
training_x, training_y = load_dataset(dataset='training')
validate_x, validate_y = load_dataset(dataset='validate')

# Transform the data / extract features
training_x, training_y = extract_features(training_x, training_y)
validate_x, validate_y = extract_features(validate_x, validate_y)


# Build a Convolutional Neural Network: two convolution + pooling stages followed by
# 4 independent sigmoid outputs
model = Sequential()
model.add(Conv1D(filters=8, kernel_size=5, padding='valid', input_shape=training_x.shape[1:]))
model.add(MaxPool1D(2))
model.add(Conv1D(filters=8, kernel_size=5, padding='valid'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(units=4, activation='sigmoid'))  # softmax does not work well with binary_crossentropy
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])


# Fit a model to the data. Note fewer epochs are needed here.
# validation_data is passed as a tuple (x, y), which is the form documented by Keras;
# a list may be mistaken for multiple model inputs by some versions.
logger = model.fit(training_x, training_y, epochs=50, batch_size=32, validation_data=(validate_x, validate_y))

# Some more code needs to be used to determine the true validation accuracy when using this type of model output
correct = 0
predicted_y = model.predict(validate_x)
for prediction, target in zip(predicted_y, validate_y):
    if np.max(prediction) >= 0.50:  # This is a threshold to determine if any train types were identified
        # A train type was identified, so the target must be a known train type AND the same one.
        # The explicit "known" check matters: an unknown train has an all-zero target whose argmax
        # is 0, so a confident prediction of type 0 would otherwise be counted as correct.
        if np.sum(target) > 0 and np.argmax(prediction) == np.argmax(target):
            correct += 1
    elif np.sum(target) == 0:  # Verifies that the model target is also an unknown train type
        correct += 1
print('True validation accuracy: %.2f%%' % (100.0 * float(correct) / float(len(predicted_y))))

# Visualize the fitting process to learn about how good the model likely is
plt.title('Accuracy over epochs')
plt.plot(logger.history['accuracy'])
plt.plot(logger.history['val_accuracy'])
plt.legend(['training accuracy', 'validation accuracy'])
plt.xlabel('Epochs')
plt.tight_layout()
plt.show()
