In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import re
import tensorflow as tf
from pathlib import Path
from keras.utils.np_utils import to_categorical

In [3]:
# Read the raw IMDB review CSV and print it to inspect its columns.
# NOTE(review): `data` is not referenced again in the visible cells; load_dataset()
# re-reads the same file, so this cell looks like inspection only — confirm.
data = pd.read_csv( '/Volumes/Cisco/Fall2021/onnx-exchange/Training/IMDB Dataset.csv')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [4]:
# English stop words as a set; load_dataset() drops review tokens found in it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first — confirm.
english_stops = set(stopwords.words('english'))

In [5]:
def load_dataset(csv_path='/Volumes/Cisco/Fall2021/onnx-exchange/Training/IMDB Dataset.csv',
                 stop_words=None):
    """Load the IMDB review CSV and return tokenised reviews with 0/1 labels.

    Parameters
    ----------
    csv_path : str or path-like, optional
        CSV file with a 'review' and a 'sentiment' column. Defaults to the
        location this notebook has always read from.
    stop_words : collection of str, optional
        Words removed from every review. Defaults to the module-level
        ``english_stops`` set.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``x_data``: one list of lower-cased tokens per review.
        ``y_data``: 1 for 'positive', 0 for 'negative'; any other label is
        passed through unchanged.
    """
    if stop_words is None:
        stop_words = english_stops

    df = pd.read_csv(csv_path)
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.str.replace(r'<.*?>', '', regex=True)         # remove html tag
    x_data = x_data.str.replace(r'[^A-Za-z]', ' ', regex=True)    # remove non alphabet
    # NOTE(review): stop words are filtered BEFORE lower-casing, so capitalised
    # stop words ('The', 'I', 'A', ...) survive and end up in the output as
    # 'the', 'i', 'a'. The order is kept on purpose: changing it alters the
    # token stream, and the saved model was presumably trained on this exact
    # preprocessing — confirm before "fixing".
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in stop_words])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    # An explicit mapping avoids Series.replace's deprecated implicit downcasting
    # while leaving unknown labels untouched, exactly as replace() did.
    label_codes = {'positive': 1, 'negative': 0}
    y_data = y_data.map(lambda label: label_codes.get(label, label))

    return x_data, y_data

# Build the tokenised reviews and their 0/1 sentiment labels, then print both
# Series as a quick sanity check.
x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [6]:
# Hold out 20% of the reviews for evaluation.
# A fixed random_state makes the split, and therefore the tokenizer vocabulary
# fitted on x_train and the batches that get evaluated, reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 42)

# The old headings ('Train Set' / 'Test Set') sat above the features and the
# labels respectively, so they are renamed to describe what is printed.
print('Reviews (train / test)')
print(x_train, '\n')
print(x_test, '\n')
print('Sentiment (train / test)')
print(y_train, '\n')
print(y_test)

Train Set
17866    [there, vogue, past, years, often, ironic, zom...
31472    [three, giant, sabretooth, tigers, created, la...
9636     [the, film, opens, director, talking, camera, ...
28275    [being, independent, filmmaker, huge, fan, edw...
42919    [i, went, movie, theater, afternoon, expecting...
                               ...                        
49328    [michael, ritchie, the, couch, trip, wonderful...
833      [all, i, ever, heard, raised, equality, sexes,...
5937     [i, never, expected, much, film, trashy, b, mo...
19673    [there, never, dull, moment, movie, wonderful,...
18382    [like, another, reviewer, wife, bought, movie,...
Name: review, Length: 40000, dtype: object 

28050    [will, smith, delivers, yet, film, man, weight...
23513    [on, distant, planet, psychopath, saved, execu...
17323    [i, give, less, star, i, tried, at, moment, im...
32257    [there, seems, money, behind, film, would, imp...
25660    [i, remember, seeing, movie, shown, several, y...
 

In [7]:
def get_max_length(reviews=None):
    """Return the padding length: the mean review length, rounded up.

    Despite the name, this is the ceiling of the *mean* number of tokens per
    review, not the maximum.

    Parameters
    ----------
    reviews : iterable of sized items, optional
        Reviews to measure. Defaults to the module-level ``x_train``, which
        keeps the original zero-argument call working.

    Returns
    -------
    int
        ``ceil(mean(len(review) for review in reviews))``.

    Raises
    ------
    ValueError
        If ``reviews`` is empty. Previously this surfaced as an opaque
        "cannot convert float NaN to integer" ValueError.
    """
    if reviews is None:
        reviews = x_train

    review_lengths = [len(review) for review in reviews]
    if not review_lengths:
        raise ValueError('cannot compute a padding length from an empty collection of reviews')

    return int(np.ceil(np.mean(review_lengths)))

In [8]:
# ENCODE REVIEW
# Fit the vocabulary on the training reviews only, then map both splits to
# integer sequences with that vocabulary.
token = Tokenizer(lower=False)    # tokens were already lower-cased in load_dataset()
token.fit_on_texts(x_train)
x_train, x_test = (token.texts_to_sequences(reviews) for reviews in (x_train, x_test))

# Sequence length used for padding/truncation (ceil of the mean length in x_train).
max_length = get_max_length()
total_words = len(token.word_index) + 1   # add 1 because of 0 padding

# Pad and truncate at the end of each sequence so every row has max_length entries.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   50 12879   422 ...     4 21832 39825]
 [  182  1216 12882 ...  1680  2957 11137]
 [    2     4  1982 ...     0     0     0]
 ...
 [    1    40   742 ...   365   102     3]
 [   50    40   667 ...     0     0     0]
 [    6    73  2103 ...     0     0     0]] 

Encoded X Test
 [[ 1435  1244  1462 ...  1560     1  1299]
 [  433  3625  1140 ...   409   754   745]
 [    1   105   242 ...     0     0     0]
 ...
 [10938  4080  4281 ... 24973  9015  3908]
 [  715   739   136 ...     0     0     0]
 [   90  9514     2 ...     0     0     0]] 

Maximum review length:  130


In [9]:
# Run configuration: these three values are interpolated into the model,
# ONNX, CoreML and results-CSV paths in later cells and written into every
# CSV row by to_onnx().
training_id = 10
model_short_name = 'gru'
framework = 'keras'

In [10]:
import time
import os
import copy
import csv
import pandas as pd
from datetime import datetime

# Today's date formatted as YYYY-MM-DD.
# NOTE(review): not referenced in the visible cells; model_name hard-codes its own date.
date = datetime.today().strftime('%Y-%m-%d')

In [11]:
# Load the saved Keras model for this run, record how long loading takes
# (t_elapsed_0 is later written to every CSV row) and display the size of the
# .h5 file in bytes.
path = '/Volumes/Cisco/Fall2021/onnx-exchange/Training/{}/{}/'.format(framework, model_short_name)
since_0 = time.time()
model_name = 'tf_gru-imdb_2021-10-31_{}'.format(training_id)
h5_file = path + model_name + '.h5'
model = tf.keras.models.load_model(h5_file)
t_elapsed_0 = time.time() - since_0
size0 = os.path.getsize(h5_file)
size0



240310344

In [12]:
from torch.utils.data import TensorDataset, DataLoader
import torch

In [13]:
import onnx
import onnxruntime
import coremltools
import time
import tf2onnx



In [14]:
# Root directories used by later cells: to_onnx() reads converted models from
# onnx_path and coreml_path, and the results CSV is written below error_path.
# Each is extended with '<framework>/<model_short_name>/...' at the point of use.
onnx_path = '/Volumes/Cisco/Fall2021/onnx-exchange/conversion/onnx/'
coreml_path = '/Volumes/Cisco/Fall2021/onnx-exchange/conversion/coremltools/'
error_path = '/Volumes/Cisco/Fall2021/onnx-exchange/miss-classification/errors/'

In [15]:
def convert_category(y):
    """Threshold scores at 0.5 into 0/1 class labels.

    Every element obtained by iterating over ``y`` becomes 0 when it compares
    below 0.5 and 1 otherwise. The labels come back as a NumPy array with one
    entry per element of ``y``.
    """
    labels = [0 if score < 0.5 else 1 for score in y]
    return np.array(labels)
#convert_category(k_predict)

In [16]:
def model_scores(y_test, test_predict):
    """Return classification accuracy as a percentage (0-100).

    Both inputs are flattened before comparison, so a column vector of
    predictions with shape (N, 1) is compared element-wise against N labels
    instead of being broadcast into an N x N matrix.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    test_predict : array-like
        Predicted labels; must hold the same number of elements as ``y_test``.

    Returns
    -------
    float
        ``100 * (number of matching elements) / (number of elements)``;
        NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    truth = np.asarray(y_test).ravel()
    predicted = np.asarray(test_predict).ravel()
    if truth.size != predicted.size:
        raise ValueError(
            'y_test and test_predict must have the same number of elements '
            '({} != {})'.format(truth.size, predicted.size))
    if truth.size == 0:
        return float('nan')

    # The element count is taken from .size: the previous
    # np.sum(y_test == y_test) evaluated to 1 for plain lists and skipped NaNs.
    correct_ = np.sum(truth == predicted)
    return correct_ * 100. / truth.size

In [17]:
def _mismatch_percentage(expected, actual):
    """Return the mismatch share NumPy reports for two arrays.

    Runs ``np.testing.assert_array_equal``; when it fails, returns the text
    between the parentheses of the 'Mismatched elements: ...' line of the
    failure message (e.g. '3.12%'). Returns 0 when the arrays are equal.

    NOTE(review): 0 is also returned when the failure message has no
    'Mismatched elements' line, so such a failure is indistinguishable from
    "no mismatch" in the CSV. Kept as-is to preserve the existing output.
    """
    try:
        np.testing.assert_array_equal(expected, actual)
    except AssertionError as e:
        for line_ in str(e).split('\n'):
            if 'Mismatched elements' in line_:
                value = line_.replace('Mismatched elements: ', '').strip()
                return value[value.find("(")+1:value.find(")")]
    return 0


def to_onnx(i, x, y, data_writer_run, batch_size):
    """Compare the Keras model with its ONNX and CoreML conversions on one batch.

    Runs the batch through the module-level Keras ``model``, then through the
    ONNX and CoreML files located via ``onnx_path`` / ``coreml_path``,
    ``framework``, ``model_short_name`` and ``model_name``. Writes two rows to
    ``data_writer_run``, one tagged 'onnx' and one tagged 'coremltools'.

    Each row holds load and inference timings, three mismatch percentages
    (raw Keras output vs runtime output, thresholded Keras labels vs
    thresholded runtime labels, true labels vs thresholded runtime labels)
    and the accuracy of the original and the converted model.

    Parameters
    ----------
    i : int
        Batch index; printed and stored in the 'round' column.
    x : array-like
        Model inputs for this batch.
    y : array-like
        Ground-truth labels for this batch.
    data_writer_run : object with ``writerow``
        Destination for the result rows (a ``csv.writer`` in this notebook).
    batch_size : int
        Only recorded in the output rows.

    Returns
    -------
    None
        If Keras inference raises, the error is printed and nothing is written.
    """
    print("converting for batch: ", i)

    ### Original (Keras) model
    since_1 = time.time()
    try:
        with tf.device('/cpu:0'):
            k_predict = model.predict(x)
    except Exception as e:
        # Best effort: skip this batch but keep the overall run going.
        print('Error keras: ', e)
        return
    inference_time_original = time.time() - since_1
    y0 = convert_category(k_predict)
    accuracy_original = model_scores(y, y0)

    ### ONNX model
    # The model is re-loaded for every batch on purpose: the load time is one
    # of the values recorded per round.
    onnx_file = onnx_path+framework+"/{}/{}.onnx".format(model_short_name, model_name)
    since_1 = time.time()
    onnx_model = onnx.load(onnx_file)
    load_time_onnx = time.time() - since_1
    onnx.checker.check_model(onnx_model)
    ort_session = onnxruntime.InferenceSession(onnx_file)
    since_1 = time.time()
    ort_inputs = {ort_session.get_inputs()[0].name: x}
    ort_outs = ort_session.run(None, ort_inputs)
    inference_time_onnx = time.time() - since_1
    print("\n*********\n\n")

    ####### Mis-classification ONNX ######################################
    y2 = convert_category(ort_outs[0])
    accuracy_onnx = model_scores(y, y2)
    # Raw outputs (exact float equality), thresholded labels, and true labels.
    miss_perc_val_original_runtime = _mismatch_percentage(k_predict, ort_outs[0])
    encoded_miss_perc_val_original_onnx = _mismatch_percentage(y0, y2)
    miss_perc_val_test_runtime = _mismatch_percentage(y, y2)
    ####### End of mis-classification ONNX ######################################

    ### CoreML model
    since_1 = time.time()
    coreml_model = coremltools.models.MLModel(coreml_path+framework+"/{}/{}.mlmodel".format(model_short_name, model_name))
    load_time_coreml = time.time() - since_1

    # Read the input feature name from the spec field directly instead of
    # parsing it out of the protobuf text dump.
    name_1 = coreml_model.get_spec().description.input[0].name

    since_1 = time.time()
    output_dict_test = coreml_model.predict({name_1:x})
    inference_time_coreml = time.time() - since_1

    ####### Mis-classification coreML ######################################
    # NOTE(review): the output key 'Identity' is hard-coded; it presumably
    # matches the converted model's output name — confirm for other models.
    coreml_out = output_dict_test['Identity']
    y3 = convert_category(coreml_out)
    accuracy_coreml = model_scores(y, y3)
    miss_perc_val_original_runtime2 = _mismatch_percentage(k_predict, coreml_out)
    miss_perc_val_original_coreml = _mismatch_percentage(y0, y3)
    miss_perc_val_test_runtime2 = _mismatch_percentage(y, y3)
    ####### End of mis-classification coreML ######################################

    data_writer_run.writerow([model_short_name,framework, training_id, model_name, batch_size, i,'onnx',t_elapsed_0, inference_time_original, load_time_onnx,
                          inference_time_onnx,  miss_perc_val_original_runtime,'',  encoded_miss_perc_val_original_onnx, miss_perc_val_test_runtime, '', accuracy_original, accuracy_onnx])

    data_writer_run.writerow([model_short_name,framework, training_id, model_name, batch_size, i,'coremltools',t_elapsed_0, inference_time_original, load_time_coreml,
                          inference_time_coreml,  miss_perc_val_original_runtime2,'',  miss_perc_val_original_coreml, miss_perc_val_test_runtime2, '', accuracy_original,accuracy_coreml])

In [18]:
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    A tensor that requires grad is detached first; the result is then moved to
    the CPU and converted with ``.numpy()``.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

def _lets_convert(data, data_writer_run, batch_size):
    since = time.time()
    for i, (inputs, labels) in enumerate(data):
        inputs = to_numpy(inputs)
        labels = to_numpy(labels)
        to_onnx(i, inputs,labels, data_writer_run, batch_size)
        if i == 50:
            break
    time_elapsed = time.time() - since
    print('Conversion complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60) )

In [19]:
import pandas as pd 
# Make sure the results directory for this framework/model exists
# (mkdir with exist_ok=True needs no separate existence check).
Path(error_path+framework+"/{}".format(model_short_name)).mkdir(parents=True, exist_ok=True)

# The `with` block guarantees the CSV is flushed and closed even when a batch
# raises part-way through the run.
with open(error_path+framework+"/{}/runtime_miss-classification_{}.csv".format(model_short_name,model_name), mode='w', newline='',
          encoding='utf-8') as data_file_run:
    data_writer_run = csv.writer(data_file_run, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Column names are kept byte-for-byte (including the 'infererence' spelling)
    # because downstream consumers of the CSV may depend on them.
    data_writer_run.writerow(['model','framework', 'training_id', 'model_full', "batch_size", 'round','runtime','original_load_time', 'original_infererence_time', 'runtime_load_time',
                              'runtime_inference_time',  'miss_classified_original_runtime_percentage','',  'encoded_miss_classified_original_runtime_percentage','encoded_miss_classified_original_test_runtime_percentage', '', 'accuracy_original', 'accuracy_runtime'])

    for batch_size in [128]:
        print("################ Batch size: ", batch_size)
        # create Tensor datasets
        # NOTE(review): train_data / train_loader are not consumed in the visible
        # cells; they are kept in case later cells rely on them.
        train_data = TensorDataset(torch.as_tensor(np.array(x_train).astype('int32')), torch.as_tensor(np.array(y_train).astype('int32')))
        valid_data = TensorDataset(torch.as_tensor(np.array(x_test).astype('int32')), torch.as_tensor(np.array(y_test).astype('int32')))

        # dataloaders (shuffled)
        train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

        # Only the held-out split is pushed through the runtime comparison.
        _lets_convert(valid_loader, data_writer_run, batch_size)

################ Batch size:  128
converting for batch:  0

*********


converting for batch:  1

*********


converting for batch:  2

*********


converting for batch:  3

*********


converting for batch:  4

*********


converting for batch:  5

*********


converting for batch:  6

*********


converting for batch:  7

*********


converting for batch:  8

*********


converting for batch:  9

*********


converting for batch:  10

*********


converting for batch:  11

*********


converting for batch:  12

*********


converting for batch:  13

*********


converting for batch:  14

*********


converting for batch:  15

*********


converting for batch:  16

*********


converting for batch:  17

*********


converting for batch:  18

*********


converting for batch:  19

*********


converting for batch:  20

*********


converting for batch:  21

*********


converting for batch:  22

*********


converting for batch:  23

*********


converting for batch:  24

*********


c

In [19]:
# Rebuild the held-out dataset with batch_size=1 so a single review can be
# pulled out and inspected by hand in the following cells.
valid_data = TensorDataset(torch.as_tensor(np.array(x_test).astype('int32')), torch.as_tensor(np.array(y_test).astype('int32')))
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=1)

In [21]:
# Take only the first batch from the loader and convert it to NumPy; the
# unconditional `break` ends the loop after one iteration. Because the loader
# shuffles, which review is picked differs between runs.
for i, (inputs, labels) in enumerate(valid_loader):
    inputs = to_numpy(inputs)
    labels = to_numpy(labels)
    break

In [23]:
# Run the Keras model on the sampled review (forced onto the CPU) and threshold
# the score into a 0/1 label.
try:
    with tf.device('/cpu:0'):
        k_predict = model.predict(inputs)
except Exception as e:
    print('Error keras: ', e)
    # Re-raise: carrying on would either hit a NameError on k_predict or,
    # worse, silently reuse a stale k_predict left over from an earlier run.
    raise
y0 = convert_category(k_predict)

In [24]:
# Display the raw model output for the sampled review.
k_predict

array([[0.00763112]], dtype=float32)

In [25]:
# Display the thresholded (0/1) label derived from k_predict.
y0

array([0])

In [26]:
# k_predict holds a single score for the one sampled review, so np.argmax over
# it is always 0 whatever the model predicted. Threshold the score instead,
# using the same 0.5 cut-off as convert_category().
y00 = int(k_predict.ravel()[0] >= 0.5)
y00

0

In [27]:
# `labels` holds the single 0/1 label of the sampled review (batch_size=1), so
# np.argmax over it is always 0. Show the label itself instead.
int(labels.ravel()[0])

0