# Preprocess Data for CANClassify

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline 
# important! this will break plots on some windows running jupyter notebook - jupyter lab must be used instead
from main import *

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Last expression of the cell: Jupyter displays the returned list of local devices.
device_lib.list_local_devices()

In [None]:
import keras
from keras import layers

# Prepare the data

Download the data from cyverse and place the data into the data/ folder.

The toyota vehicle (vehicle ids 2T3Y1RFV8KC014025, 2T3MWRFVXLW056972) has labeled radar data.

The honda vehicle (vehicle id 5FNYF6H05HB089022) has some labeled data, but no labeled radar data.

In [None]:
# Recorded CAN message logs, grouped by vehicle. Paths are relative to the notebook's
# working directory. Every entry ends with a trailing comma so that adding or
# uncommenting a line can never trigger implicit string-literal concatenation.
csv_paths_toyota = [
    "data/2020-08-13-13-26-45_2T3Y1RFV8KC014025_CAN_Messages.csv",
    "data/2020-09-04-10-07-55_2T3Y1RFV8KC014025_CAN_Messages.csv",
    "data/2020-10-17-10-34-29_2T3MWRFVXLW056972_CAN_Messages.csv",
    "data/2020-10-17-13-40-39_2T3MWRFVXLW056972_CAN_Messages.csv",
    "data/2020-11-05-09-10-00_2T3Y1RFV8KC014025_CAN_Messages.csv",
]

csv_paths_honda = [
    "data/2020-09-18-11-36-54_5FNYF6H05HB089022_CAN_Messages.csv",
    "data/2020-11-05-08-39-21_5FNYF6H05HB089022_CAN_Messages.csv",
]

# TODO: we don't have a full csv with Nissan Leaf 2018 data yet; only a small test
# file is listed here. The commented-out entry is the recording we still need to get.
csv_paths_nissan = [
    "data/nissan_3_test.csv",
    #"data/2021-11-11-00-20-27_JN1BJ1CW3LW375199_CAN_Messages.csv",
]

# Load the dbc file

The dbc file stores correct labeling of CAN signals.

In [None]:
import cantools
from pprint import pprint
# Load one cantools database per vehicle from its dbc file in data/.
db_toyota = cantools.database.load_file("data/toyota_rav4_2020.dbc")
db_honda = cantools.database.load_file("data/honda_pilot_2017.dbc")
db_nissan = cantools.database.load_file("data/nissan_leaf_2018.dbc")

Make a dictionary to associate messages to signals

In [None]:
def get_mess2sig_dict(db):
    """Map every message name in ``db`` to the list of its signal names.

    ``db`` must expose ``messages`` (objects with a ``name``) and
    ``get_message_by_name(name)`` returning an object with ``signals``
    (objects with a ``name``). Messages without any signal get no entry.
    """
    mapping = {}
    for msg in db.messages:
        # Signals are looked up by message name, and appended in database order.
        for sig in db.get_message_by_name(msg.name).signals:
            mapping.setdefault(msg.name, []).append(sig.name)
    return mapping

# Build the message-name -> signal-names table for each vehicle database.
mess2sig_toyota = get_mess2sig_dict(db_toyota)
mess2sig_honda = get_mess2sig_dict(db_honda)
mess2sig_nissan = get_mess2sig_dict(db_nissan)

In [None]:
# Print each vehicle's message -> signals table under its make, in a fixed order.
for make, mess2sig in (
    ("Toyota", mess2sig_toyota),
    ("Honda", mess2sig_honda),
    ("Nissan", mess2sig_nissan),
):
    print(make)
    print(mess2sig)

## Identify signals to train on

Additional labeled signals can be added for training. The signals chosen for CANClassify, and their relevant Message/Signal name in the cantools database are listed below:

- wheel_speed_fr, wheel_speed_fl, wheel_speed_rr, wheel_speed_rl

Toyota: WHEEL_SPEEDS: ['WHEEL_SPEED_FR', 'WHEEL_SPEED_FL', 'WHEEL_SPEED_RR', 'WHEEL_SPEED_RL']

Honda: 'WHEEL_SPEEDS': ['WHEEL_SPEED_FL', 'WHEEL_SPEED_FR', 'WHEEL_SPEED_RL', 'WHEEL_SPEED_RR', 'CHECKSUM']

Nissan: 'WHEEL_SPEEDS_FRONT': ['WHEEL_SPEED_FR', 'WHEEL_SPEED_FL'], 'WHEEL_SPEEDS_REAR': ['WHEEL_SPEED_RR', 'WHEEL_SPEED_RL']

- steer_angle, steer_angle_rate

Toyota: 'STEER_ANGLE_SENSOR': ['STEER_ANGLE', 'STEER_FRACTION', 'STEER_RATE']

Honda: 'STEERING_SENSORS': ['STEER_ANGLE', 'STEER_ANGLE_RATE', 'COUNTER', 'CHECKSUM']

Nissan: 'STEER_ANGLE_SENSOR': ['STEER_ANGLE', 'STEER_ANGLE_RATE', 'SET_ME_X07', 'COUNTER']

- brake_pedal

Toyota: 'BRAKE': ['BRAKE_AMOUNT', 'BRAKE_PEDAL']

Honda: 'POWERTRAIN_DATA': ['PEDAL_GAS', 'ENGINE_RPM', 'GAS_PRESSED', 'ACC_STATUS', 'BOH_17C', 'BRAKE_SWITCH', 'BOH2_17C', 'BRAKE_PRESSED', 'BOH3_17C', 'COUNTER', 'CHECKSUM']

Nissan: 'BRAKE_PEDAL': ['BRAKE_PEDAL']

We will directly save the binary values for each of these as {car}\_{signal name}, which will store a list of trajectories for the specific car and signal name


## <font color='red'> In order to change what to train on, it is necessary to change main.py's labels and label_to_messig dictionaries </font>

## Make a list of names to refer to as vehicle identifiers

In [None]:
# One VEHICLE per car: (identifier, dbc database, label -> message/signal mapping, csv paths).
# VEHICLE and the label_to_messig_* mappings are not defined in this notebook;
# presumably they come from `from main import *` -- TODO confirm.
TOYOTA = VEHICLE('toyota_rav4_2020', db_toyota, label_to_messig_toyota, csv_paths_toyota)
HONDA = VEHICLE('honda_pilot_2017', db_honda, label_to_messig_honda, csv_paths_honda)
# NOTE(review): NISSAN is given an empty path list even though csv_paths_nissan is
# defined earlier in the notebook -- confirm this is intentional.
NISSAN = VEHICLE('nissan_leaf_2018', db_nissan, label_to_messig_nissan, [])
VEHICLES = [TOYOTA, HONDA, NISSAN]

## <font style='color: red'>The following cells preprocess the data used to train the model. They may take a while and may require more intensive computing resources.</font>

## For each csv file, collect the raw data for these signals. This takes a while. 

## should only be run once! Use the pickled files below to load the data again.

### Definition: raw data

Raw data refers to a mapping from a label to a list of pairs. These pairs are x and y data.
This x and y data are directly from the can bus, so x will be an ordered collection of timestamps, and y will be an ordered collection of np boolean arrays.

In [None]:
# Collect the raw trajectories for every label from each vehicle's csv files.
# The former `{k: [] for k in labels}` initialisations were dead code: both names
# were overwritten immediately by the calls below, so they have been removed.
# get_trajectory_dict / labels / label_to_messig_* are not defined in this notebook;
# presumably they come from `from main import *` -- TODO confirm.
# We don't have nissan data yet, so no trajectories_nissan is built.
trajectories_toyota = get_trajectory_dict(
    csv_paths_toyota, db_toyota, labels,
    label_to_messig_toyota, more_info_to_print='toyota',
)
trajectories_honda = get_trajectory_dict(
    csv_paths_honda, db_honda, labels,
    label_to_messig_honda, more_info_to_print='honda',
)

## Save and load trajectories using pickle

In [None]:
# Persist the raw trajectory dictionaries so the slow csv extraction need not be re-run.
with open("data/raw_trajectories_toyota", "wb") as f:
    pickle.dump(trajectories_toyota, f)

with open("data/raw_trajectories_honda", "wb") as f:
    pickle.dump(trajectories_honda, f)

In [None]:
# Reload the raw trajectory dictionaries written by the previous cell.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open("data/raw_trajectories_toyota", "rb") as f:
    trajectories_toyota = pickle.load(f)

with open("data/raw_trajectories_honda", "rb") as f:
    trajectories_honda = pickle.load(f)

### Note: We can take a look at what is inside trajectories_toyota, which contains raw data

In [None]:
# `labels` is not defined in this notebook; presumably it comes from `from main import *` -- TODO confirm.
print(f"The possible labels we can have inside the trajectories are: {labels}")

In [None]:
# Each trajectory is indexed as a pair: [0] is passed to len() as the timepoints, [1] holds the per-timepoint values.
print(f"There are {len(trajectories_toyota[labels[0]])} '{labels[0]}' trajectories for the toyota vehicle.")
print(f"The first of these has {len(trajectories_toyota[labels[0]][0][0])} timepoints.")
# .iloc is used here, so the values of the first trajectory are assumed to be a pandas object.
print(f"This signal has a binary length of {len(trajectories_toyota[labels[0]][0][1].iloc[0])}.")
# NOTE(review): 'radar_lat' is hard-coded; this raises KeyError if it is not a key of
# trajectories_honda -- confirm it is one of `labels`.
print(f"There are {len(trajectories_honda['radar_lat'])} '{'radar_lat'}' trajectories for the honda vehicle.")

## Algorithm 1: For each binary-valued trajectory, place it randomly into a 64-bit-long message, where the padding is random values and zeroed values

The code below samples 10 random start positions per trajectory (the signal must fit inside the 64-bit message); for each position the signal is padded once with random values and once with zeroed values.

In [None]:
# Algorithm 1: embed every raw (variable-width) binary trajectory at several random
# offsets inside a 64-bit message, once with random padding and once with zero padding.

# Seed Python's RNG so the sampled offsets and the random padding are reproducible.
random.seed(0)

MESSAGE_BITS = 64            # width of the message each signal is embedded in
PLACEMENTS_PER_SIGNAL = 10   # maximum number of distinct start positions per trajectory

full_trajectories = {k: [] for k in labels}

partial_trajectories = {
    "toyota": trajectories_toyota,
    "honda": trajectories_honda,
}

for vehicle_name, car_trajectories in partial_trajectories.items():
    for k, value_list in car_trajectories.items():
        for xs, ys in value_list:
            # Work on plain arrays so rows are indexed positionally.
            if isinstance(xs, pd.Series):
                xs = xs.values
            if isinstance(ys, pd.Series):
                ys = ys.values

            print(f"Got trajectory w/ {len(ys)} timepoints, {vehicle_name}")

            # An empty trajectory has no first row to measure; nothing to embed.
            if len(ys) == 0:
                continue

            # The width of the first row is taken as the width of the whole signal.
            signal_length = len(ys[0])
            if signal_length > MESSAGE_BITS:
                print(f"Signal of {signal_length} bits does not fit in {MESSAGE_BITS} bits. Skipping.")
                continue

            # Valid start positions run from 0 to MESSAGE_BITS - signal_length inclusive
            # (the previous range dropped the last one). Signals too wide to offer
            # PLACEMENTS_PER_SIGNAL distinct positions get as many as exist instead of
            # making random.sample raise ValueError.
            candidate_starts = range(MESSAGE_BITS - signal_length + 1)
            start_positions = random.sample(
                candidate_starts, min(PLACEMENTS_PER_SIGNAL, len(candidate_starts))
            )

            # randomized start positions for each signal
            for start_position in start_positions:
                trailing_bits = MESSAGE_BITS - start_position - signal_length

                # Fresh random padding is drawn for every row (timepoint).
                randomized_signal = np.array([
                    [random.choice([False, True]) for _ in range(start_position)]
                    + list(yi)
                    + [random.choice([False, True]) for _ in range(trailing_bits)]
                    for yi in ys
                ])
                # Same placement, but padded with zeros on both sides.
                zeroed_randomized_signal = np.array([
                    [False] * start_position + list(yi) + [False] * trailing_bits
                    for yi in ys
                ])

                full_trajectories[k].append((xs, randomized_signal))
                full_trajectories[k].append((xs, zeroed_randomized_signal))

In [None]:
# Sanity check: how many padded 64-bit trajectories were generated for each label.
for label in labels:
    print(f"The number of {label} trajectories is: {len(full_trajectories[label])}")

In [None]:
# Given a 30MB/s write speed, pickle dumping takes about 3 minutes for a 26GB file for ~6 hours of driving

In [None]:
# Persist the padded trajectories (see the size/time note in the cell above).
with open("data/full_trajectories", "wb") as f:
    pickle.dump(full_trajectories, f)

In [None]:
# Reload the padded trajectories. pickle.load can execute arbitrary code: only load trusted files.
with open("data/full_trajectories", "rb") as f:
    full_trajectories = pickle.load(f)

## Algorithm 2: Interpretive Convolutions (Preprocess Data with Masked Interpretations)

We have a 64-bit signal. `full_trajectories` maps each label to a list of trajectories. We wish to convert these trajectories to a convolved interpretation of the signals, and make a dictionary which maps from each label to its convolved interpretations.

We have masks of size 4 (big/little unsigned), 8 (big/little signed/unsigned), 12 (big/little unsigned), 16 (big/little signed/unsigned), running across the signal and generating a single value each time.

This results in:

- (64 - 4 + 1) * 2 + 
- (64 - 8 + 1) * 4 +
- (64 - 12 + 1) * 2 + 
- (64 - 16 + 1) * 4 + 
- 64 (for just the raw signal) 

values, that is, 716 inputs.



In [None]:
# Total number of inputs per timestep: sliding windows of width 4, 8, 12 and 16 over
# 64 bits (times the number of interpretations per width) plus the 64 raw bits = 716.
(64 - 4 + 1) * 2 + (64 - 8 + 1) * 4 + (64 - 12 + 1) * 2 + (64 - 16 + 1) * 4 + 64

In [None]:
# Algorithm 2: for every label, turn each 64-bit trajectory in full_trajectories into
# an array of convolved interpretations, using every 100th timepoint only.
labels_to_convolved_interpretation = {}

for k, v in full_trajectories.items():
    new_v = []
    # Each entry is a (timestamps, bit rows) pair; only the bit rows are needed.
    for count, (_, y) in enumerate(v, start=1):
        # %m and %d are month and day. The previous "%Y/%M/%D" printed the minute in
        # place of the month and relied on the non-portable %D directive.
        timestamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        print(f"{timestamp}: Interpreting key: {k}, timeseries {count}/{len(v)}")

        # Convert once, before indexing, if this is a pd.Series; else assume numpy array.
        if isinstance(y, pd.Series):
            y = y.values

        new_values = []

        # Interpretation is done on a rougher timescale: 1 out of every 100 timepoints.
        for i in range(0, len(y), 100):
            # Every row should be 64 bits long. A row of any other width abandons the
            # rest of this trajectory; rows converted so far are kept.
            if len(y[i]) != 64:
                print(f"This had length {len(y[i])}, not 64. Skipping.")
                break

            # The converter is not defined in this notebook; presumably it comes from
            # `from main import *` -- TODO confirm.
            new_values.append(convert_ith_original_signal_to_convolved_signal(y, i))

        new_v.append(np.array(new_values))
    labels_to_convolved_interpretation[k] = new_v

### Center and Scale Data

In [None]:
# Half-open (start, stop) column ranges over the 716 inputs, one per group that is
# centred and scaled independently. They are laid out back to back, so each range is
# derived from the running total of the group widths:
#   61 = 64 - 4 + 1, 57 = 64 - 8 + 1, 53 = 64 - 12 + 1, 49 = 64 - 16 + 1, 64 = raw bits.
ranges = [
    (sum(widths[:idx]), sum(widths[:idx + 1]))
    for widths in [(61, 61, 57, 57, 57, 57, 53, 53, 49, 49, 49, 49, 64)]
    for idx in range(len(widths))
]

In [None]:
# Per-range statistics: for every column range, gather that slice of every row of
# every series (all labels pooled) and take the column-wise mean and variance.
means = []
variances = []

for r in ranges:
    print(f"Processing range {r}")
    lo, hi = r
    # One stacked array per range, reused for both statistics.
    stacked = np.array([
        row[lo:hi]
        for series_list in labels_to_convolved_interpretation.values()
        for series in series_list
        for row in series
    ])
    means.append(np.mean(stacked, axis=0))
    variances.append(np.var(stacked, axis=0))

In [None]:
# Persist the per-range means and variances so the same normalisation can be reapplied later.
with open("data/data_means", "wb") as f:
    pickle.dump(means, f)

with open("data/data_vars", "wb") as f:
    pickle.dump(variances, f)

In [None]:
# Reload the per-range means and variances. pickle.load can execute arbitrary code: only load trusted files.
with open("data/data_means", "rb") as f:
    means = pickle.load(f)

with open("data/data_vars", "rb") as f:
    variances = pickle.load(f)

In [None]:
# Center and scale the data IN PLACE, one column range at a time.
# This cell is not idempotent: running it twice normalises the data twice.
# NOTE(review): values are divided by the variance, not the standard deviation, and a
# zero variance would produce inf/nan -- confirm this matches whatever consumes
# data/data_vars before changing it.
for i, (lo, hi) in enumerate(ranges):
    print(f"Centering and scaling range {ranges[i]}")
    for series_list in labels_to_convolved_interpretation.values():
        for series in series_list:
            for row in series:
                row[lo:hi] -= means[i]
                row[lo:hi] /= variances[i]

In [None]:
# Persist the label -> list-of-arrays dictionary in its current (in-memory) state.
with open("data/labels_to_convolved_interpretation", "wb") as f:
    pickle.dump(labels_to_convolved_interpretation, f)

In [None]:
# Reload the dictionary written by the previous cell. pickle.load can execute arbitrary code: only load trusted files.
with open("data/labels_to_convolved_interpretation", "rb") as f:
    labels_to_convolved_interpretation = pickle.load(f)

## Finalizing the data: making X/Y arrays

Convert data into X and Y arrays before train/test split

In [None]:
# X collects fixed-length windows of timesteps; Y collects the matching one-hot label.
X = []
Y = []

# Position of every label inside the one-hot vector.
labels_to_indices = {label: i for i, label in enumerate(labels)}

def generate_label_vector_from_label(input_label):
    """Return the one-hot vector of length ``len(labels)`` for ``input_label``.

    Raises KeyError if ``input_label`` is not one of ``labels``.
    """
    vector = np.zeros(len(labels))
    vector[labels_to_indices[input_label]] = 1
    return vector

for label, list_of_new_vals in labels_to_convolved_interpretation.items():
    y_vector = generate_label_vector_from_label(label)
    for y in list_of_new_vals:
        # grab data in large, non-overlapping chunks, 100 each.
        # 100 timesteps is a good amount of time for the detection of patterns in a signal. Longer also works, but
        # shorter means there are not enough patterns for the LSTM to pick up on.
        # The "+ 1" keeps the last complete window: without it a series whose length
        # is an exact multiple of 100 lost its final 100 timesteps.
        for i in range(0, len(y) - 100 + 1, 100):
            X.append(np.array(y[i:i+100]))
            Y.append(y_vector)

In [None]:
# Persist the final example windows (X) and their one-hot labels (Y).
with open("data/data_x", "wb") as f:
    pickle.dump(X, f)

with open("data/data_y", "wb") as f:
    pickle.dump(Y, f)