### conv2d-train
ref: http://int-info.com/PyLearn/PyLearnKeras04.html

Mar05, 2023, ms

This copy is for uploading to GitHub.

##### import

In [1]:
import datetime
import glob
#import matplotlib
import numpy as np
import os
import pickle
import shutil
from scipy.interpolate import interp1d

import matplotlib.pyplot as plt

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers import Dense, Flatten, Conv2D
from keras import initializers, callbacks

import tensorflow as tf

#### functions

In [2]:
# interpolate data
def interporateAxisData(ax, x_points):
    """
    Resample the values of one axis to a fixed number of samples.

    The original samples are placed at evenly spaced x positions that end at
    x_points, then linearly interpolated (extrapolating where needed) at the
    integer positions 1..x_points.

    ax: ndarray, values from single axis
    x_points: number of samples in the returned array
    returns: ndarray holding the x_points resampled values
    """
    sample_count = len(ax)
    # 1-based sample k of the original sits at k * (x_points / sample_count)
    source_x = np.array([(x_points / sample_count) * k
                         for k in range(1, sample_count + 1)])
    # positions to read back: 1, 2, ..., x_points
    target_x = np.arange(1, x_points + 1)
    resample = interp1d(source_x, ax, fill_value="extrapolate")
    return resample(target_x)


def interporateData(reads, x_points):
    """
    Resample every read to x_points rows, one axis (column) at a time.

    reads: list of 2D ndarray (samples, axes); the axis count is taken from
           the first read
    x_points: number of rows of each returned read
    returns: list of (x_points, axes) ndarray, in the same order as reads
    """
    # the axis count of the first read is used for every output buffer
    n_axes = reads[0].shape[1]

    resampled_reads = []
    for read in reads:
        resampled = np.zeros((x_points, n_axes))
        for col in range(n_axes):
            resampled[:, col] = interporateAxisData(read[:, col], x_points)
        resampled_reads.append(resampled)

    return resampled_reads


# label handling (the csv collector, parseAllCSVs, follows below)
def prepareLabelDicts(labels):
    """
    Build lookup tables between label strings and integer class indices.

    labels: list of strings, duplicates allowed
    returns: (label2idx, idx2label); indices follow the sorted order of the
             unique labels
    """
    unique_labels = sorted(set(labels))
    idx2label = dict(enumerate(unique_labels))
    # invert the mapping; safe because every label appears exactly once
    label2idx = {label: idx for idx, label in idx2label.items()}

    return label2idx, idx2label


def parseAllCSVs(csv_dir):
    """
    Load every csv file in csv_dir and derive its label from the file name.

    raw csv cols
    0: time stamp
    1-3: acc x, y, z <--- take these and,
    4-6: gyr x, y, z <--- these
    7: A1, touch to read
    8: A3, touch to stop

    The label is the part of the file name before the first '-'.
    Files are visited in sorted path order, so the result does not depend on
    the listing order of the file system.

    csv_dir: directory that holds the *.csv files
    returns: (reads, labels); reads is a list of 2D ndarray holding
             columns 1-6 of each file, labels is the matching list of strings
    """
    pattern = os.path.normpath(os.path.join(csv_dir, '*.csv'))
    reads = []
    labels = []
    for csv_path in sorted(glob.glob(pattern)):
        # ndmin=2 keeps a one-row file 2D, like every other read
        reads.append(
            np.loadtxt(csv_path, delimiter=',', usecols=range(1, 7), ndmin=2)
        )
        labels.append(os.path.basename(csv_path).split('-')[0])

    return reads, labels  # list of ndarray and a list


def normalizeMinMax(array_x):
    """
    min max normalization
    values will be packed into 0 to 1

    A constant array (max == min) has no range to scale by; it is mapped to
    all zeros instead of dividing by zero and returning NaN.

    array_x: ndarray
    returns: ndarray of floats with the same shape as array_x
    """
    lowest = array_x.min()
    span = array_x.max() - lowest
    shifted = array_x - lowest
    if span == 0:
        # every value equals the minimum, so shifted is already all zeros;
        # dividing by one only gives it the float dtype of the normal path
        return shifted / 1.0
    return shifted / span


def normalizeData(reads):
    """
    Min-max normalize every axis (column) of every read independently.

    Meant for interpolated reads, where all reads share the shape of the
    first one.

    reads: list of 2D ndarray (x_points, axes)
    returns: list of ndarray of that same shape, each column passed through
             normalizeMinMax
    """
    # the shape of the first read is used for every output buffer
    n_rows, n_axes = reads[0].shape

    normalized_reads = []
    for read in reads:
        scaled = np.zeros((n_rows, n_axes))
        for col in range(n_axes):
            scaled[:, col] = normalizeMinMax(read[:, col])
        normalized_reads.append(scaled)

    return normalized_reads


# save file related
def createOutFileName(head, ext, ts, test_size, batch_size, epochs, test_acc):
    """
    Compose an output file name that records the training settings.

    Layout:
    <head>-ts<test_size>-bs<batch_size>-es<epochs>-val_acc<test_acc>-<ts><ext>
    Note that the "-ts" tag carries test_size, while the ts argument (a
    string) is appended near the end. test_acc is written with four decimals.
    """
    pieces = [
        head,
        "-ts", str(test_size),
        "-bs", str(batch_size),
        "-es", str(epochs),
        "-val_acc", "{:.4f}".format(test_acc),
        "-", ts, ext,
    ]
    return "".join(pieces)

#### main story

In [3]:
#
# load all csv reads and build their one-hot labels
#

#data_dir = './data-pen-alphabet2/'  # raw reads
#data_dir = 'PATH TO THE CSV FILE DIR'
data_dir = './data-a-n-Feb27-2023-chopped-Mar05-2023/'
reads, labels = parseAllCSVs(data_dir)

# label <-> class index lookup tables
label2idx, idx2label = prepareLabelDicts(labels)

# class index of every read, then its one-hot encoding
label_indices = np.array([label2idx[label] for label in labels])
one_hot_labels = to_categorical(label_indices, dtype=np.uint8)

In [4]:
# quick look: number of reads and the label of each read
print(len(reads))
print(labels)

920
['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'ha', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'he', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'ho', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'hu', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka', 'ka',

In [5]:
#
# interpolate data: resample every read to x_points rows
#
reads_itp = interporateData(reads, x_points=30)

#
# normalization: applied to the interpolated reads
#
reads_itp_norm = normalizeData(reads_itp)

In [None]:
# ---------------------------------------------------
# draw 3 stacked graphs for one axis of one read:
# raw, interpolated, interpolated + normalized
# ---------------------------------------------------

# choose what you want to see
# if you want to see 'ho' data,
# labels.index('ho') gives the first read_idx for 'ho'
read_idx = 105  # ho
ax_idx = 2  # acc_z

ax_names = [
    'acc_x', 'acc_y', 'acc_z',
    'gyr_x', 'gyr_y', 'gyr_z'
]
char = labels[read_idx]
ax_name = ax_names[ax_idx]

# one column, three rows: raw on top, normalized at the bottom
fig = plt.figure(figsize=(10, 3.5*3))
spec = fig.add_gridspec(nrows=3, ncols=1)
ax_raw = fig.add_subplot(spec[0, 0])
ax_itp = fig.add_subplot(spec[1, 0])
ax_itp_norm = fig.add_subplot(spec[2, 0])

suptitle = f'raw->interporated->normalized graph check: {char}, {ax_name}'
fig.suptitle(suptitle)

ax_raw.plot(reads[read_idx][:, ax_idx], label='raw')
leg_raw = ax_raw.legend()

ax_itp.plot(reads_itp[read_idx][:, ax_idx], label='itp')
leg_itp = ax_itp.legend()

ax_itp_norm.plot(reads_itp_norm[read_idx][:, ax_idx], label='itp-norm')
leg_itp_norm = ax_itp_norm.legend()

# save before show so the figure is written to disk first
out = f'read_idx-{read_idx}-{char}-{ax_name}.png'
plt.savefig(out)

plt.show()

In [6]:
#
# re-shaping for cnn
#
# now reads_itp_norm is a list of (30=datapoints, 6=num_of_sensor_axis)
# ndarray. to pour this data into cnn, it is packed into one array shaped
# (num_of_reads, datapoints, num_of_sensor_axis, channel=1)
datapoints, sensor_axes = reads_itp_norm[0].shape
# stack the reads along a new first axis, then append the channel axis
stacked_reads = np.stack(reads_itp_norm).astype(np.float64, copy=False)
reads_cnn = stacked_reads[..., np.newaxis]

print(reads_cnn.shape)

(920, 30, 6, 1)


In [None]:
# ------------------------------
# show the 6 axes of one read as a single gray-scale image
# ------------------------------
read_idx = 105  # ho
char = labels[read_idx]

fig = plt.figure(figsize=(10, 2.5))
suptitle = f'6 axis in an image: {char}'
fig.suptitle(suptitle)

# transpose so that the image has one row per sensor axis
plt.imshow(reads_cnn[read_idx, :, :, 0].T, cmap='gray')

out = f'read_idx-{read_idx}-{char}-6x30.png'
plt.savefig(out)

plt.show()

In [None]:
# -------------------------------------------------------
# batch plotting, batch saving
# one figure per label, showing every read of that label as an image
# (2 rows x 10 images when a label has 20 reads)
# -------------------------------------------------------
# ref: https://matplotlib.org/stable/gallery/subplots_axes_and_figures/subplots_demo.html

n_cols = 10  # images per row

for idx, char in idx2label.items():
    # look the reads up by label instead of assuming 20 contiguous reads per
    # label, so unsorted or uneven data still plots the right reads
    char_read_idxs = [k for k, label in enumerate(labels) if label == char]
    n_rows = max(1, -(-len(char_read_idxs) // n_cols))  # ceiling division

    # squeeze=False keeps axs 2D even when there is only one row
    fig, axs = plt.subplots(n_rows, n_cols, sharey=True, squeeze=False)

    fig.suptitle('image-ed data: ' + char)
    out = './image-ed-data-' + char + '.png'

    # axs.flat walks the grid row by row
    for cell, panel in enumerate(axs.flat):
        if cell < len(char_read_idxs):
            panel.imshow(reads_cnn[char_read_idxs[cell], :, :, 0], cmap='gray')
        else:
            panel.axis('off')  # no read left for this grid cell

    fig.savefig(out)
    plt.close(fig)  # close this figure explicitly so figures do not pile up

In [7]:
#
# split train and test
#

# -------------------
test_size = 0.2
# -------------------

x, y = reads_cnn, one_hot_labels

print(f'x.shape {x.shape}')
print(f'y.shape {y.shape}')

# split, stratified on the one-hot labels.
# no random_state is passed, so the split differs from run to run
# (add random_state=11 for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, stratify=y)

print(f'x_train.shape {x_train.shape}')
print(f'x_test.shape {x_test.shape}')
print(f'y_train.shape {y_train.shape}')
print(f'y_test.shape {y_test.shape}')

x.shape (920, 30, 6, 1)
y.shape (920, 46)
x_train.shape (736, 30, 6, 1)
x_test.shape (184, 30, 6, 1)
y_train.shape (736, 46)
y_test.shape (184, 46)


In [9]:
#
# creating CNN
#

input_size1 = 30  # data points used in the interpolation
input_size2 = 6  # axis number from LSM6D
num_class = len(label2idx)  # one output unit per label

# change here to play around
kernel_size = (6, 1)
strides = (1, 1)

# one conv layer, flattened, then a softmax over the labels
model = Sequential([
    Conv2D(
        filters=4,
        kernel_size=kernel_size,
        strides=strides,
        activation='relu',
        input_shape=(input_size1, input_size2, 1),
        kernel_initializer=initializers.TruncatedNormal(),
        name='conv_filter',
    ),
    Flatten(),
    Dense(num_class, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv_filter (Conv2D)        (None, 25, 6, 4)          28        
                                                                 
 flatten (Flatten)           (None, 600)               0         
                                                                 
 dense (Dense)               (None, 46)                27646     
                                                                 
Total params: 27,674
Trainable params: 27,674
Non-trainable params: 0
_________________________________________________________________


In [10]:
# ---------------
batch_size = 46  # set this to the char count for now
epochs = 150
# ---------------

# tensorboard log dir, named by time stamp + model settings
postfix = '-kernel61-strides11-filter4'
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_folder = f"logs/fit/{ts}{postfix}"
# remove any existing dir of that name; a missing dir is not an error
shutil.rmtree(log_folder, ignore_errors=True)

tsb_callbacks = callbacks.TensorBoard(log_dir=log_folder, histogram_freq=1)

#
# TRAINING!
#
# the test split doubles as validation data during fitting
learning_process = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    callbacks=[tsb_callbacks],
    verbose=1,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

#### post training

In [11]:
#
# saving files
#

# model ---------------------------------------------------------------------
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=1)

# tag the time stamp with the model settings; guarded so that re-running
# this cell does not append the postfix a second time
if not ts.endswith(postfix):
    ts += postfix

model_name = createOutFileName(
    "hiragana-model-CNN", ".h5", ts, test_size, batch_size, epochs, test_acc)
model.save(model_name)
print(model_name, "saved.")

# idx2labels dict -----------------------------------------------------------
# pickled under a matching name so predictions can be mapped back to labels
i2l_pkl = createOutFileName(
    "idx2label", ".pkl", ts, test_size, batch_size, epochs, test_acc)
with open(i2l_pkl, 'wb') as pkl_file:
    pickle.dump(idx2label, pkl_file)
print(i2l_pkl, "saved.")

hiragana-model-CNN-ts0.2-bs46-es150-val_acc0.9946-20230306-210944-kernel61-strides11-filter4.h5 saved.
idx2label-ts0.2-bs46-es150-val_acc0.9946-20230306-210944-kernel61-strides11-filter4.pkl saved.


#### NOT IN USE

In [None]:

# confusion matrix ----------------------------------------------------------
y_pred = model.predict(x_test)

# argmax along axis 1 turns each row into a class index
true_classes = y_test.argmax(axis=1)  # y_test is in one-hot expression
pred_classes = y_pred.argmax(axis=1)  # y_pred is array of probabilities

cm = tf.math.confusion_matrix(
    true_classes, pred_classes, dtype=tf.dtypes.int32)
