# About
Convert MobileBERT_SQuAD to Core ML

Convert the SavedModel to a ConcreteFunction, then convert the ConcreteFunction to Core ML.

**Quantization**:
16-bit quantization worked, but the 8-bit quantized model later failed to load on iOS with this error:

```
/Library/Caches/com.apple.xbs/Sources/MetalImage/MetalImage-124.2.4/MPSCore/Types/MPSMatrix.mm, line 222: error '[MPSMatrix initWithBuffer:descriptor:] not enough rowBytes for all the columns.'
/Library/Caches/com.apple.xbs/Sources/MetalImage/MetalImage-124.2.4/MPSCore/Types/MPSMatrix.mm:222: failed assertion `[MPSMatrix initWithBuffer:descriptor:] not enough rowBytes for all the columns.'
```


# Download the model and set up the environment

In [1]:
# -nc (no-clobber): skip the download when the archive is already present, so
# re-running this cell does not pile up `.tar.gz.1`, `.tar.gz.2`, ... copies
# while tar keeps extracting the first archive.
!wget -nc https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/mobilebert_squad_savedmodels.tar.gz
!tar -zxvf mobilebert_squad_savedmodels.tar.gz

--2021-08-18 12:38:45--  https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/mobilebert_squad_savedmodels.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.188.128, 64.233.189.128, 108.177.97.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.188.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 184730530 (176M) [application/octet-stream]
Saving to: ‘mobilebert_squad_savedmodels.tar.gz.1’


2021-08-18 12:38:53 (31.5 MB/s) - ‘mobilebert_squad_savedmodels.tar.gz.1’ saved [184730530/184730530]

mobilebert_squad_savedmodels/
mobilebert_squad_savedmodels/quant_saved_model/
mobilebert_squad_savedmodels/quant_saved_model/saved_model.pb
mobilebert_squad_savedmodels/quant_saved_model/variables/
mobilebert_squad_savedmodels/quant_saved_model/variables/variables.index
mobilebert_squad_savedmodels/quant_saved_model/variables/variables.data-00000-of-00001
mobilebert_squad_savedmodels/float/
mobilebert_squad_

In [2]:
!pip install coremltools==4.1



In [3]:
# NOTE(review): presumably the Colab-only magic that selects TensorFlow 2.x —
# confirm it is still needed/supported in the target runtime.
%tensorflow_version 2.x

In [4]:
# Import the libraries used below and print the interpreter / library
# versions so the recorded outputs can be tied to a specific environment.
import sys
print(sys.version)

import tensorflow as tf
print(tf.__version__)

import coremltools as ct
print(ct.__version__)

3.7.11 (default, Jul  3 2021, 18:01:19) 
[GCC 7.5.0]
2.6.0




4.1


# Inspect the TF model

In [5]:
def load_saved_model(path):
  """Load the SavedModel stored at `path` into a fresh graph and return it.

  The model is loaded with the SERVING tag through the TF1-compatible loader
  inside a temporary session bound to the new graph.

  Args:
    path: directory containing the SavedModel.

  Returns:
    The `tf.Graph` the SavedModel was loaded into.
  """
  print(f"Loading saved_model.pb from {path}")
  graph = tf.Graph()
  serving_tags = [tf.compat.v1.saved_model.tag_constants.SERVING]
  with tf.compat.v1.Session(graph=graph) as session:
    tf.compat.v1.saved_model.loader.load(session, serving_tags, path)
  return graph


def export_ops_name(the_graph, filename):
  """Write a plain-text listing of every operation in `the_graph` to `filename`.

  For each op, in `get_operations()` order, the file receives the op index
  and type followed by the name and shape of each input and output tensor.

  Args:
    the_graph: object exposing `get_operations()`; every op must provide
      `.type`, `.inputs` and `.outputs`, and every tensor `.name` and
      `.get_shape()`.
    filename: path of the text file to write (overwritten if it exists).
  """
  # Explicit encoding so the dump does not depend on the platform default.
  with open(filename, "w", encoding="utf-8") as text_file:
    for op_id, op in enumerate(the_graph.get_operations()):
      text_file.write('\n\nop id {} , op type: "{}"'.format(op_id, op.type))

      text_file.write('\ninput(s):')
      for tensor in op.inputs:
        text_file.write("name = {}, shape: {}, ".format(tensor.name, tensor.get_shape()))

      text_file.write('\noutput(s):')
      for tensor in op.outputs:
        text_file.write("name = {}, shape: {},".format(tensor.name, tensor.get_shape()))
  print('Exported to:', filename)


# Dump the op listing of the SavedModel in the `float` directory for inspection.
saved_model_dir = 'mobilebert_squad_savedmodels/float'

export_ops_name(
    the_graph=load_saved_model(path=saved_model_dir),
    filename='mobilebert_squad_savedmodels_ops.txt',
)

Loading saved_model.pb from mobilebert_squad_savedmodels/float
Exported to: mobilebert_squad_savedmodels_ops.txt


# Convert from TF to Core ML

In [6]:
# Load the SavedModel as a TF2 object; its signature map holds the
# ConcreteFunction(s) that are handed to the Core ML converter below.
tfmodel = tf.saved_model.load(saved_model_dir, tags='serve')
print(tfmodel)

sv = tfmodel.signatures.values()
print(sv)

# `.values()` returns a view, never a list, so the former isinstance check
# was dead code; materialize the view unconditionally.
cfs = list(sv)
print(cfs)

<tensorflow.python.training.tracking.tracking.AutoTrackable object at 0x7fae78e560d0>
ValuesView(_SignatureMap({'serving_default': <ConcreteFunction pruned(input_ids, input_mask, segment_ids) at 0x7FADBB1BD7D0>}))
[<ConcreteFunction pruned(input_ids, input_mask, segment_ids) at 0x7FADBB1BD7D0>]


In [9]:
# Convert the list of ConcreteFunctions to a Core ML model, then print the
# resulting model description (inputs, outputs, metadata).
mlmodel = ct.convert(model=cfs, source='TensorFlow')

print(mlmodel)

Running TensorFlow Graph Passes: 100%|██████████| 5/5 [00:03<00:00,  1.49 passes/s]
Converting Frontend ==> MIL Ops: 100%|██████████| 4430/4430 [00:18<00:00, 242.95 ops/s]
Running MIL optimization passes: 100%|██████████| 18/18 [00:15<00:00,  1.14 passes/s]
Translating MIL ==> MLModel Ops: 100%|██████████| 4271/4271 [00:04<00:00, 988.88 ops/s]


input {
  name: "input_ids"
  type {
    multiArrayType {
      shape: 1
      shape: 384
      dataType: FLOAT32
    }
  }
}
input {
  name: "input_mask"
  type {
    multiArrayType {
      shape: 1
      shape: 384
      dataType: FLOAT32
    }
  }
}
input {
  name: "segment_ids"
  type {
    multiArrayType {
      shape: 1
      shape: 384
      dataType: FLOAT32
    }
  }
}
output {
  name: "end_logits"
  type {
    multiArrayType {
      dataType: FLOAT32
    }
  }
}
output {
  name: "start_logits"
  type {
    multiArrayType {
      dataType: FLOAT32
    }
  }
}
metadata {
  userDefined {
    key: "com.github.apple.coremltools.source"
    value: "tensorflow==2.6.0"
  }
  userDefined {
    key: "com.github.apple.coremltools.version"
    value: "4.1"
  }
}



In [10]:
from coremltools.models.neural_network import quantization_utils

# Quantize the weights to 16 bits (the 8-bit variant failed to load on iOS,
# see the note at the top of the notebook).
quantized = quantization_utils.quantize_weights(mlmodel, nbits=16)

# quantize_weights returns an MLModel on macOS 10.14+ and a bare model spec
# on other platforms; wrapping an MLModel in MLModel(...) would fail, so
# normalize both cases to an MLModel here.
if isinstance(quantized, ct.models.MLModel):
  mlmodel = quantized
else:
  mlmodel = ct.models.MLModel(quantized)

Quantizing using linear quantization
Quantizing layer bert/embeddings/embedding_lookup_1
Quantizing layer bert/embeddings/embedding_lookup
Quantizing layer bert/embeddings/embedding_transformation/add
Quantizing layer bert/encoder/layer_0/bottleneck/attention/dense/Tensordot/MatMul
Quantizing layer bert/encoder/layer_0/attention/self/value/add
Quantizing layer bert/encoder/layer_0/bottleneck/input/dense/add
Quantizing layer bert/encoder/layer_0/attention/self/query/add
Quantizing layer bert/encoder/layer_0/attention/self/key/add
Quantizing layer bert/encoder/layer_0/attention/self/MatMul
Quantizing layer bert/encoder/layer_0/attention/self/MatMul_1
Quantizing layer bert/encoder/layer_0/attention/output/dense/add
Quantizing layer bert/encoder/layer_0/ffn_layer_0/intermediate/dense/add
Quantizing layer bert/encoder/layer_0/ffn_layer_0/output/dense/add
Quantizing layer bert/encoder/layer_0/ffn_layer_1/intermediate/dense/add
Quantizing layer bert/encoder/layer_0/ffn_layer_1/output/dense/ad

In [11]:
# Describe the inputs and outputs (still under their original TF names).
input_descriptions = {
  'input_ids': 'an int32 Tensor of shape [seq_length] with the token ids of the packed input sequence (that is, including a start-of-sequence token, end-of-segment tokens, and padding).',
  'input_mask': 'an int32 Tensor of shape [seq_length] with value 1 at the position of all input tokens present before padding and value 0 for the padding tokens.',
  'segment_ids': 'an int32 Tensor of shape [seq_length] with the index of the input segment that gave rise to the input token at the respective position. The first input segment (index 0) includes the start-of-sequence token and its end-of-segment token. The second and later segments (if present) include their respetive end-of-segment token. Padding tokens get index 0 again.',
}
for feature_name, description in input_descriptions.items():
  mlmodel.input_description[feature_name] = description

output_descriptions = {
  'start_logits': "Start token scores of size 384. The argmax is the start index of the predicted answer in the input sequence",
  'end_logits': "End token scores of size 384. The argmax is the end index of the predicted answer in the input sequence",
}
for feature_name, description in output_descriptions.items():
  mlmodel.output_description[feature_name] = description

spec = mlmodel.get_spec()

# Rename inputs and outputs from snake_case to camelCase.
feature_renames = (
  ('input_ids', 'inputIds'),
  ('input_mask', 'inputMask'),
  ('segment_ids', 'segmentIds'),
  ('start_logits', 'startLogits'),
  ('end_logits', 'endLogits'),
)
for current_name, new_name in feature_renames:
  ct.utils.rename_feature(spec, current_name, new_name)

array_feature_type = ct.proto.FeatureTypes_pb2.ArrayFeatureType

# Inputs: drop the leading size-1 dimension (shape [1, 384] -> [384]) and
# declare the element type as INT32.
for input_index in range(3):
  input_array = spec.description.input[input_index].type.multiArrayType
  del input_array.shape[0]
  input_array.dataType = array_feature_type.INT32

# Outputs: declare the shape [seq_length] and the element type DOUBLE.
seq_length = 384
for output_index in range(2):
  output_array = spec.description.output[output_index].type.multiArrayType
  output_array.shape.append(seq_length)
  output_array.dataType = array_feature_type.DOUBLE

# Model-level metadata.
metadata = spec.description.metadata
metadata.versionString = "2021-08-18"
metadata.shortDescription = "Converted from TF model at https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/mobilebert_squad_savedmodels.tar.gz"
metadata.author = "Converted to Core ML by Anh"

# Rebuild the model from the edited spec, save it, and print its description.
mlmodel_mod = ct.models.MLModel(spec)
mlmodel_mod.save('MobileBERT_SQuAD.mlmodel')
print(mlmodel_mod)

input {
  name: "inputIds"
  shortDescription: "an int32 Tensor of shape [seq_length] with the token ids of the packed input sequence (that is, including a start-of-sequence token, end-of-segment tokens, and padding)."
  type {
    multiArrayType {
      shape: 384
      dataType: INT32
    }
  }
}
input {
  name: "inputMask"
  shortDescription: "an int32 Tensor of shape [seq_length] with value 1 at the position of all input tokens present before padding and value 0 for the padding tokens."
  type {
    multiArrayType {
      shape: 384
      dataType: INT32
    }
  }
}
input {
  name: "segmentIds"
  shortDescription: "an int32 Tensor of shape [seq_length] with the index of the input segment that gave rise to the input token at the respective position. The first input segment (index 0) includes the start-of-sequence token and its end-of-segment token. The second and later segments (if present) include their respetive end-of-segment token. Padding tokens get index 0 again."
  type {
   