In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
from os import listdir, path
import numpy as np
import random
from collections import defaultdict
import datetime

# Seed every RNG the notebook draws from: `random` drives file shuffling and
# augmentation, while `np.random` drives the noise generated in `padding`.
random.seed(42)
np.random.seed(42)
# Number of readings (rows) in every padded sample.
seq_length = 40
# Number of values (columns) per reading.
dim = 6


In [2]:
def padding(data):
    """Build two fixed-length, noise-padded versions of one recording.

    Each result is a (seq_length, dim) array. The filler rows are uniform
    noise centred on a neighbouring reading, with a per-column width taken
    from `noise_level`.

    Args:
        data: sequence of readings, each with `dim` values; must be non-empty.

    Returns:
        A two-element list:
          [0] the first min(len(data), seq_length) readings placed at the end,
              preceded by noise centred on the first reading;
          [1] the same readings placed at the start, followed by noise
              centred on the last reading.
    """
    noise_level = [20, 20, 20, 0.2, 0.2, 0.2]
    kept = min(len(data), seq_length)

    # "Before" variant: noise around the first reading, real data at the tail.
    front_padded = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[0]
    front_padded[seq_length - kept:] = data[:kept]

    # "After" variant: real data at the head, noise around the last reading.
    back_padded = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[-1]
    back_padded[:kept] = data[:kept]

    return [front_padded, back_padded]

In [3]:
def build_dataset(data, label):
    """Pad every recording and pack the results into a tf.data.Dataset.

    Each recording contributes the two padded variants returned by
    `padding`, stored in consecutive rows and sharing that recording's label.

    Args:
        data: sequence of recordings (each a sequence of readings).
        label: sequence of labels, parallel to `data`.

    Returns:
        (length, dataset): the total number of padded samples and a dataset
        of (features, int32 label) pairs.
    """
    copies_per_sample = 2
    length = copies_per_sample * len(data)
    features = np.zeros((length, seq_length, dim))
    labels = np.zeros(length)

    for sample_idx, (readings, sample_label) in enumerate(zip(data, label)):
        variants = padding(readings)
        first_row = copies_per_sample * sample_idx
        for offset in range(copies_per_sample):
            row = first_row + offset
            features[row] = variants[offset]
            labels[row] = sample_label

    int_labels = labels.astype("int32")
    dataset = tf.data.Dataset.from_tensor_slices((features, int_labels))
    return length, dataset

In [4]:
def time_warping(molecule, denominator, data):
  """Resample `data` to (molecule/denominator)x speed.

  The input is consumed in strides of `molecule` rows, and each stride emits
  `denominator` rows. Output row `denominator * s + k` is the weighted mean of
  source rows `molecule * s + k` and `molecule * s + k + 1`, with weights
  `(denominator - k) / denominator` and `k / denominator`.

  Args:
    molecule: numerator of the speed ratio (source rows per stride).
    denominator: denominator of the speed ratio (output rows per stride).
    data: list of equally long rows of numbers.

  Returns:
    A new list of rows; empty when `data` is too short for a single stride.
  """
  strides = int(len(data) / molecule) - 1
  warped = [[0] * len(data[0]) for _ in range(strides * denominator)]
  for stride in range(strides):
    src_base = molecule * stride
    dst_base = denominator * stride
    for col in range(len(data[stride])):
      for step in range(denominator):
        current = data[src_base + step][col]
        following = data[src_base + step + 1][col]
        warped[dst_base + step][col] = (
            current * (denominator - step) + following * step) / denominator
  return warped


In [5]:
def augment_data(original_data, original_label):
  """Perform data augmentation.

  For every (recording, label) pair the output contains, in this order:
    * the original recording (the same object, not a copy);
    * 5 shifted copies, each with one random offset in [-100, 100) added to
      every value;
    * 5 noisy copies, each value increased by an independent random amount
      in [0, 5);
    * 7 time-warped copies, one per speed ratio in `fractions`;
    * 7 amplitude-scaled copies, one per ratio in `fractions`.

  Args:
    original_data: list of recordings, each a rectangular list of rows.
    original_label: list of labels, parallel to `original_data`.

  Returns:
    (new_data, new_label): the augmented recordings and their labels.
  """
  new_data = []
  new_label = []
  # Ratios used both as time-warping speeds and as amplitude scale factors.
  fractions = [(3, 2), (5, 3), (2, 3), (3, 4), (9, 5), (6, 5), (4, 5)]
  for data, label in zip(original_data, original_label):
    # Original data
    new_data.append(data)
    new_label.append(label)
    # Sequence shift
    for _ in range(5):
      new_data.append((np.array(data, dtype=np.float32) +
                       (random.random() - 0.5) * 200).tolist())
      new_label.append(label)
    # Random noise: build a fresh list for every copy. Reusing one buffer
    # would make all five appended samples alias the same (last) values.
    for _ in range(5):
      noisy = [[value + 5 * random.random() for value in row] for row in data]
      new_data.append(noisy)
      new_label.append(label)
    # Time warping
    for molecule, denominator in fractions:
      new_data.append(time_warping(molecule, denominator, data))
      new_label.append(label)
    # Movement amplification
    for molecule, denominator in fractions:
      new_data.append(
          (np.array(data, dtype=np.float32) * molecule / denominator).tolist())
      new_label.append(label)
  return new_data, new_label

In [6]:
def load_data(data_type, files):
    """Read gesture CSV files and turn them into a padded dataset.

    The files are processed in a random order. The label of each file is the
    last character of its name without the extension. Only the first six
    comma-separated values of every line are used, and blank lines are skipped.

    Args:
        data_type: name of the split ('train', 'validation' or 'test').
            Currently unused because the augmentation step is commented out.
        files: list of CSV file paths. The list itself is not modified.

    Returns:
        Whatever `build_dataset` returns for the parsed recordings and labels.
    """
    data = []
    labels = []
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled_files = list(files)
    random.shuffle(shuffled_files)

    for file_path in shuffled_files:
        labels.append(path.splitext(file_path)[0][-1])
        readings = []
        with open(file_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would make float('') raise.
                    continue
                readings.append([float(value) for value in line.split(',')[0:6]])
        data.append(readings)

    # Augmentation of the training split is currently disabled:
    #if data_type == 'train':
        #data, labels = augment_data(data, labels)

    return build_dataset(data, labels)

In [7]:
import math

# Group the CSV files by label: the last character of the file name (without
# extension) is used as the digit key.
files_path = defaultdict(list)
data_dir = './data'
# listdir returns entries in arbitrary order; sorting makes the seeded
# shuffles below produce the same split on every machine.
for filename in sorted(listdir(data_dir)):
    if filename.endswith('.csv'):
        digit = path.splitext(filename)[0][-1]
        files_path[digit].append(path.join(data_dir, filename))

train_fraction = 0.6
validation_fraction = 0.2  # the remainder becomes the test split

train_files      = []
validation_files = []
test_files       = []

# Split each digit separately so every split contains every digit.
for digit in sorted(files_path):
    digit_files = files_path[digit]
    random.shuffle(digit_files)

    train_split = math.floor(len(digit_files) * train_fraction)
    validation_split = train_split + math.floor(len(digit_files) * validation_fraction)

    train_files += digit_files[:train_split]
    validation_files += digit_files[train_split:validation_split]
    test_files += digit_files[validation_split:]

train_length, train_data = load_data('train', train_files)
validation_length, validation_data = load_data('validation', validation_files)
test_length, test_data = load_data('test', test_files)

print(train_length, validation_length, test_length)

# Show the features of one padded test sample as a sanity check.
for ds, _ in test_data.take(1):
    print(ds)

120 40 40
tf.Tensor(
[[ 8.58776867e+02 -9.64611596e+01  5.29959143e+02 -8.01293340e-01
  -2.24079661e+01 -3.74029432e+00]
 [ 8.66848130e+02 -1.06877687e+02  5.30007849e+02 -7.48403183e-01
  -2.24246152e+01 -3.81076774e+00]
 [ 8.54553214e+02 -1.06163804e+02  5.25145795e+02 -7.96543459e-01
  -2.24170970e+01 -3.85714578e+00]
 [ 8.48982102e+02 -9.96251427e+01  5.23474362e+02 -6.37492171e-01
  -2.24975155e+01 -3.87392787e+00]
 [ 8.52574715e+02 -1.04975429e+02  5.35612770e+02 -6.42654750e-01
  -2.23914031e+01 -3.89449329e+00]
 [ 8.58890000e+02 -1.04000000e+02  5.28810000e+02 -7.10000000e-01
  -2.24400000e+01 -3.80000000e+00]
 [ 9.03810000e+02 -1.50390000e+02  3.87210000e+02 -1.70300000e+01
  -2.97300000e+01 -1.68000000e+00]
 [ 8.43750000e+02 -8.49600000e+01  3.59380000e+02  1.59000000e+01
  -3.39800000e+01 -4.53000000e+00]
 [ 9.25780000e+02 -1.54300000e+02  2.64650000e+02  1.70800000e+01
  -4.19800000e+01 -1.41400000e+01]
 [ 1.10449000e+03 -1.69430000e+02  4.70210000e+02  2.99900000e+01
  -2

In [8]:
# Small CNN over a (seq_length, dim, 1) input; the final layer is a
# 10-way softmax.
model = tf.keras.Sequential()
# First convolution block: 8 filters with a (4, 6) kernel, then 3x3 pooling.
model.add(tf.keras.layers.Conv2D(8, (4, 6), padding="same", activation="relu",
                                 input_shape=(seq_length, dim, 1)))
model.add(tf.keras.layers.MaxPool2D((3, 3)))
model.add(tf.keras.layers.Dropout(0.1))
# Second convolution block: 16 filters with a (4, 1) kernel, then (3, 1) pooling.
model.add(tf.keras.layers.Conv2D(16, (4, 1), padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D((3, 1), padding="same"))
model.add(tf.keras.layers.Dropout(0.1))
# Classifier head.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(16, activation="relu"))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [9]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 40, 6, 8)          200       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 2, 8)          0         
_________________________________________________________________
dropout (Dropout)            (None, 13, 2, 8)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 13, 2, 16)         528       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 2, 16)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 5, 2, 16)          0         
_________________________________________________________________
flatten (Flatten)            (None, 160)               0

In [10]:
epochs = 50
batch_size = 16
# NOTE(review): the training set is repeated indefinitely, so this value alone
# defines how many batches count as one "epoch" — confirm 1000 is intended for
# the current dataset size.
steps_per_epoch = 1000

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

def reshape_function(data, label):
  """Add a trailing channel axis so a sample matches the Conv2D input shape.

  Args:
    data: feature tensor of one sample, reshaped to (-1, dim, 1).
    label: label of the sample, passed through unchanged.

  Returns:
    (reshaped_data, label)
  """
  reshaped_data = tf.reshape(data, [-1, dim, 1])
  return reshaped_data, label

# Derive new names instead of overwriting train_data / validation_data, so
# re-running this cell does not map and batch the datasets a second time.
train_batches = train_data.map(reshape_function).batch(batch_size).repeat()
validation_batches = validation_data.map(reshape_function).batch(batch_size)

logdir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

model.fit(
  train_batches,
  epochs=epochs,
  validation_data=validation_batches,
  steps_per_epoch=steps_per_epoch,
  # Number of batches needed to cover the validation set once (rounded up).
  validation_steps=int((validation_length - 1) / batch_size + 1),
  callbacks=[tensorboard_callback])

Train for 1000 steps, validate for 3 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x13b33a050>

In [11]:
# Derive new names instead of overwriting test_data, so this cell can be
# re-run without reshaping and batching an already batched dataset.
test_samples = test_data.map(reshape_function)

# Ground-truth labels in dataset order, which is the order predict() follows.
test_labels = np.array([label.numpy() for _, label in test_samples])

test_batches = test_samples.batch(batch_size)

loss, acc = model.evaluate(test_batches)
pred = np.argmax(model.predict(test_batches), axis=1)
# Rows are true labels and columns are predicted labels.
confusion = tf.math.confusion_matrix(labels=tf.constant(test_labels), predictions=tf.constant(pred), num_classes=10)
print(confusion)


tf.Tensor(
[[0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0]], shape=(10, 10), dtype=int32)
