In [1]:
import glob
import os.path as path
import ndjson
import random
import pickle
import numpy as np
import torch
import torch.nn as nn
import time
import math

In [2]:
def preprocess(dataset_path,class_name):
    i = 0
    train_strokes = None
    train_labels = []
    valid_labels = []
    test_labels = []
    for file_path in glob.iglob(path.join(dataset_path,'*.npz')):
        label=file_path.split('/')[-1].split('.')[0]
        if class_name!=-1 and label not in class_name:
            continue
        print('readding data from {}'.format(file_path))
        i = i+1
        data = np.load(file_path, encoding ='latin1')
        if train_strokes is None:
            train_strokes = data['train']
            valid_strokes = data['valid']
            test_strokes = data['test']
            print('this class has',str(len(data['train'])+len(data['valid'])+len(data['test'])))
            train_labels.append([i-1]*data['train'].shape[0])
            valid_labels.append([i-1]*data['valid'].shape[0])
            test_labels.append([i-1]*data['test'].shape[0])
        else: 
            train_strokes = np.concatenate((train_strokes, data['train']))
            valid_strokes = np.concatenate((valid_strokes, data['valid']))
            test_strokes = np.concatenate((test_strokes, data['test']))
            print('this class has',str(len(data['train'])+len(data['valid'])+len(data['test'])))
            train_labels.append([i-1]*data['train'].shape[0])
            valid_labels.append([i-1]*data['valid'].shape[0])
            test_labels.append([i-1]*data['test'].shape[0])
    train_labels = [item for sublist in train_labels for item in sublist]
    test_labels = [item for sublist in test_labels for item in sublist]
    valid_labels = [item for sublist in valid_labels for item in sublist]
            
    return train_strokes,valid_strokes,test_strokes,train_labels,valid_labels,test_labels
        
        
    

In [3]:
dataset_path='/home/heylamourding/quickdraw/data/sy_data/quick_draw'
class_name = ['calendar', 'snowman', 'penguin', 'blackberry', 'teddy-bear']  # class_name=-1 means all class
country_code = ['JP', 'CN', 'DE']  # country_code=-1 means all countries
output_path='/home/heylamourding/quickdraw/data/sy_data/quick_draw_output'

In [4]:
train_strokes,valid_strokes,test_strokes,train_labels,valid_labels,test_labels = preprocess(dataset_path,class_name)

readding data from /home/heylamourding/quickdraw/data/sy_data/quick_draw/teddy-bear.npz
this class has 75000
readding data from /home/heylamourding/quickdraw/data/sy_data/quick_draw/blackberry.npz
this class has 75000
readding data from /home/heylamourding/quickdraw/data/sy_data/quick_draw/penguin.npz
this class has 75000
readding data from /home/heylamourding/quickdraw/data/sy_data/quick_draw/snowman.npz
this class has 75000
readding data from /home/heylamourding/quickdraw/data/sy_data/quick_draw/calendar.npz
this class has 75000


In [5]:
def find_max(train_strokes, valid_stroke, test_strokes):
    max_stroke = 0
    for i in range(train_strokes.shape[0]):
        temp = train_strokes[i].shape[0]
        if temp >= max_stroke:
            max_stroke = temp

    for i in range(valid_strokes.shape[0]):
        temp = valid_strokes[i].shape[0]
        if temp >= max_stroke:
            max_stroke = temp
    
    for i in range(test_strokes.shape[0]):
        temp = test_strokes[i].shape[0]
        if temp >= max_stroke:
            max_stroke = temp
    return max_stroke

def find_min(train_strokes, valid_strokes, test_strokes):
    min_stroke = 100
    for i in range(train_strokes.shape[0]):
        temp = train_strokes[i].shape[0]
        if temp <= min_stroke:
            min_stroke = temp

    for i in range(valid_strokes.shape[0]):
        temp = valid_strokes[i].shape[0]
        if temp <= min_stroke:
            min_stroke = temp
    
    for i in range(test_strokes.shape[0]):
        temp = test_strokes[i].shape[0]
        if temp <= min_stroke:
            min_stroke = temp
    return min_stroke

def filter_stroke(data_stroke, data_label, target_stroke):
    target=[]
    target_label = []
    for i in range(data_stroke.shape[0]):
        orig = data_stroke[i].shape[0]
        if orig <= target_stroke:
            target.append(data_stroke[i])
            target_label.append(data_label[i])
        else:
            continue
    return target, target_label

def pad_stroke(data_stroke, max_stroke):
    for i in range(len(data_stroke)):
        orig = data_stroke[i].shape[0]
        data_stroke[i] = np.vstack((data_stroke[i], 
                                    np.hstack((np.zeros((max_stroke-orig,2)),np.ones((max_stroke-orig,1))))))
    return data_stroke

def display_num_param(net):
    nb_param = 0
    for param in net.parameters():
        nb_param += param.numel()
    print('There are {} ({:.2f} million) parameters in this neural network'.format(
        nb_param, nb_param/1e6)
         )

In [6]:
target_stroke = 60
valid_strokes_ls, valid_label_ls = filter_stroke(valid_strokes,valid_labels,target_stroke)
train_strokes_ls, train_label_ls = filter_stroke(train_strokes,train_labels,target_stroke)
test_strokes_ls, test_label_ls = filter_stroke(test_strokes,test_labels,target_stroke)

In [7]:
valid_strokes_pad = pad_stroke(valid_strokes_ls,target_stroke)
train_strokes_pad = pad_stroke(train_strokes_ls,target_stroke)
test_strokes_pad = pad_stroke(test_strokes_ls,target_stroke)

In [8]:
valid_strokes_ar = np.stack(valid_strokes_pad)
train_strokes_ar = np.stack(train_strokes_pad)
test_strokes_ar = np.stack(test_strokes_pad)

In [10]:
print(valid_strokes_ar.shape)

(4390, 60, 3)


In [17]:
valid_inputs_ts = torch.Tensor(valid_strokes_ar)
train_inputs_ts = torch.Tensor(train_strokes_ar)
test_inputs_ts = torch.Tensor(test_strokes_ar)

In [18]:
overall_inputs = torch.cat((valid_inputs_ts, train_inputs_ts, test_inputs_ts))

In [20]:
valid_label_ts = torch.Tensor(np.array(valid_label_ls))
train_label_ts = torch.Tensor(np.array(train_label_ls))
test_label_ts = torch.Tensor(np.array(test_label_ls))

In [21]:
overall_labels = torch.cat((valid_label_ts, train_label_ts, test_label_ts))

In [23]:
print('Label 0 has', sum(overall_labels.numpy()==0))
print('Label 1 has', sum(overall_labels.numpy()==1))
print('Label 2 has', sum(overall_labels.numpy()==2))
print('Label 3 has', sum(overall_labels.numpy()==3))
print('Label 4 has', sum(overall_labels.numpy()==4))

Label 0 has 4388
Label 1 has 10349
Label 2 has 34566
Label 3 has 52950
Label 4 has 31292


In [40]:
# Filter Balanced Data for Train 
from random import shuffle
def balance_filter(train_label_ts, SL, EL,train_inputs_ts, name):
    label1_indices = np.array(np.where(train_label_ts.numpy() == 0)).reshape(-1)[SL:EL].tolist()
    label2_indices = np.array(np.where(train_label_ts.numpy() == 1)).reshape(-1)[SL:EL].tolist()
    label3_indices = np.array(np.where(train_label_ts.numpy() == 2)).reshape(-1)[SL:EL].tolist()
    label4_indices = np.array(np.where(train_label_ts.numpy() == 3)).reshape(-1)[SL:EL].tolist()
    label5_indices = np.array(np.where(train_label_ts.numpy() == 4)).reshape(-1)[SL:EL].tolist()
    train_indices = []
    temp = [label1_indices, label2_indices, label3_indices, label4_indices, label5_indices]
    for i in range(5):
        target = temp[i]
        for item in range(len(target)):
            train_indices.append(target[item])
    print('test indices')
    shuffle(train_indices)
    print(train_indices[0])
    train_sub_inputs_ts = train_inputs_ts[train_indices,:,:]
    train_sub_label_ts = torch.Tensor(train_label_ts.numpy()[train_indices])

    print(name +'label 0 has', sum(train_sub_label_ts.numpy()==0))
    print(name +'label 1 has', sum(train_sub_label_ts.numpy()==1))
    print(name +'label 2 has', sum(train_sub_label_ts.numpy()==2))
    print(name +'label 3 has', sum(train_sub_label_ts.numpy()==3))
    print(name + 'label 4 has', sum(train_sub_label_ts.numpy()==4))
    return train_sub_inputs_ts, train_sub_label_ts

train_sub_inputs_ts, train_sub_label_ts = balance_filter(overall_labels,0,4000, overall_inputs, name='train')
test_sub_inputs_ts, test_sub_label_ts = balance_filter(overall_labels, 4000,4300, overall_inputs,'test')
#valid_sub_inputs_ts, valid_sub_label_ts = balance_filter(overall_labels, 4000,4300, overall_inputs, 'valid')

test indices
10316
trainlabel 0 has 4000
trainlabel 1 has 4000
trainlabel 2 has 4000
trainlabel 3 has 4000
trainlabel 4 has 4000
test indices
21292
testlabel 0 has 300
testlabel 1 has 300
testlabel 2 has 300
testlabel 3 has 300
testlabel 4 has 300


In [41]:
trainX, trainY = train_sub_inputs_ts.numpy(), train_sub_label_ts.numpy()
testX, testY = test_sub_inputs_ts.numpy(), test_sub_label_ts.numpy()
# valX, valY = valid_sub_inputs_ts.numpy(), valid_sub_label_ts.numpy() 

In [95]:
# One Hot Encoder
trainY_Onehot = np.zeros((trainY.shape[0],5))
trainY_Onehot[np.arange(trainY.shape[0]),trainY.astype(int)] = 1 
testY_Onehot = np.zeros((testY.shape[0],5))
testY_Onehot[np.arange(testY.shape[0]),testY.astype(int)] = 1 

In [103]:
IND = 10316
print(trainY_Onehot[IND])
print(trainY[IND])

[0. 0. 0. 0. 1.]
4.0


In [104]:
# KERAS model (explained above)
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GRU, LSTM
from keras.layers import Conv2D, MaxPooling2D, Input, Reshape
from keras.layers.convolutional import ZeroPadding2D
from keras.models import load_model

In [109]:
args = {}
args['LSTMUnits'] = 80
args['batch'] = 64
args['epochs'] = 10
args['dropout'] = 0.2
args['len_category'] = 5
args['enc_dense'] = 256
args['num_filters'] = 1
args['kernelS'] = 3
args['stride'] = 2
args['poolS'] = 2 
args['dec_dense'] = 32

In [110]:
model = Sequential()
model.add(LSTM(args['LSTMUnits'],return_sequences=False,input_shape=(60,3)))
model.add(Dense(args['enc_dense'], activation='relu'))
model.add(Reshape((16,16,1)))
model.add(Conv2D(args['num_filters'],args['kernelS'],strides=(args['stride'],args['stride']), activation ='relu'))
#model.add(MaxPooling2D(pool_size=(args['poolS'],args['poolS'])))
model.add(Flatten())
model.add(Dense(args['dec_dense'], activation='relu'))
model.add(Dropout(args['dropout']))
model.add(Dense(args['len_category'], activation='softmax'))

model.compile(loss='mean_squared_error',
          optimizer='adam',
          metrics=['accuracy'])

In [111]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (None, 80)                26880     
_________________________________________________________________
dense_23 (Dense)             (None, 256)               20736     
_________________________________________________________________
reshape_9 (Reshape)          (None, 16, 16, 1)         0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 7, 7, 1)           10        
_________________________________________________________________
flatten_9 (Flatten)          (None, 49)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 32)                1600      
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
__________

In [112]:
model.fit(trainX, trainY_Onehot,
          batch_size = args['batch'], nb_epoch= args['epochs'], 
          verbose=1,validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f359c708588>

In [113]:
def evaluate_acc(model, X_test, y_test):
    y_predict = model.predict(X_test)
    y_predict_label = np.argmax(y_predict,axis = 1)
    y_test_label = np.argmax(y_test,axis=1)
    print(accuracy_score(y_test_label, y_predict_label))


In [114]:
from sklearn.metrics import accuracy_score
evaluate_acc(model,testX,testY_Onehot)

0.9093333333333333
