# BERT-language2label training

In [1]:
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt/code")' Batcher ModelSupport Datasets TextModels

import Datasets
import Foundation
import ModelSupport
import TensorFlow
import TextModels
import PythonKit

Installing packages:
	.package(path: "/notebooks/language2motion.gt/code")
		Batcher
		ModelSupport
		Datasets
		TextModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmprgyxdz3f/swift-install
[1/5] Compiling STBImage stb_image_write.c
[2/5] Compiling Batcher Backend.swift
[3/5] Compiling STBImage stb_image.c
[4/5] Compiling SwiftProtobuf AnyMessageStorage.swift
  var readBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: 1)
  ~~~ ^
  let
[5/6] Compiling ModelSupport BijectiveDictionary.swift
[6/7] Compiling Datasets CIFAR10.swift
[7/8] Compiling TextModels Attention.swift
[8/9] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[9/9] Linking libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [2]:
// Start from the uncased, non-multilingual BERT Base checkpoint.
let bertPretrained = BERT.PreTrainedModel.bertBase(cased: false, multilingual: false)

// Directory handed to the loader: "bert_models" inside the system temporary directory.
let workspaceURL = URL(
    fileURLWithPath: "bert_models",
    isDirectory: true,
    relativeTo: URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true))

// Load the pre-trained encoder, then put a 5-class classifier on top of it.
let bert = try BERT.PreTrainedModel.load(bertPretrained)(from: workspaceURL)
var bertClassifier = BERTClassifier(bert: bert, classCount: 5)

Loading BERT pre-trained model 'BERT Base Uncased'.
Loading resource: uncased_L-12_H-768_A-12


## load dataset

In [3]:
%include "/notebooks/language2motion.gt/code/Sources/BERT-language2label/Language2Label.swift"

In [4]:
// Limits shared by the dataset below and by `predict` later in the notebook.
let maxSequenceLength = 30
let batchSize = 2048

let dsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels_ds_v2_manual_labels.csv")

// Build the dataset. The trailing closure converts one raw example into a
// (data, label) pair.
var dataset = try Language2Label(
  datasetURL: dsURL,
  maxSequenceLength: maxSequenceLength,
  batchSize: batchSize,
  entropy: SystemRandomNumberGenerator()
) { (example: Language2LabelExample) -> LabeledTextBatch in
  // Encode the text with the classifier's own BERT preprocessing.
  let encodedText = bertClassifier.bert.preprocess(
    sequences: [example.text],
    maxSequenceLength: maxSequenceLength)
  // The label is optional on the example; it is force-unwrapped here, so an
  // example without a label traps.
  let encodedLabel: Tensor<Int32> = example.label.map {
    (label: Language2LabelExample.LabelTuple) in Tensor(Int32(label.idx))
  }!
  return (data: encodedText, label: encodedLabel)
}

print("Dataset acquired.")

Dataset acquired.


In [12]:
// Number of examples in the training split.
dataset.trainingExamples.count

2409


In [13]:
// Inspect the first training example (encoded text plus its label).
dataset.trainingExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[ 101, 1037, 2711, 2003, 5645, 1012,  102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1, 1]]
  - label : 0


In [14]:
// Inspect the first validation example (encoded text plus its label).
dataset.validationExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[  101,  1037,  2711,  7365,  1999,  1037,  4418,  4675, 20464,  7432, 14244,   102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
  - label : 1


## train

In [15]:
// Optimizer used to fine-tune `bertClassifier`.
// The learning rate is a fixed base value of 2e-5 wrapped in a warm-up
// (10 steps, offset 0), which is in turn wrapped in a linear decay that
// starts at step 10.
var optimizer = WeightDecayedAdam(
    for: bertClassifier,
    learningRate: LinearlyDecayedParameter(
        baseParameter: LinearlyWarmedUpParameter(
            baseParameter: FixedParameter<Float>(2e-5),
            warmUpStepCount: 10,
            warmUpOffset: 0),
        slope: -5e-7,  // NOTE(review): 2e-5 / 5e-7 = 40, so if the slope is applied per step the LR reaches zero about 40 steps after `startStep`, not 100 — confirm against LinearlyDecayedParameter.
        startStep: 10),
    weightDecayRate: 0.01,
    maxGradientGlobalNorm: 1)

In [16]:
print("Training BERT for the Language2Label task!")

// Run three epochs. Each epoch is one pass over the training batches followed
// by one evaluation pass over the validation batches.
for (epochIndex, trainingBatches) in dataset.trainingEpochs.prefix(3).enumerated() {
    print("[Epoch \(epochIndex + 1)]")

    // ---- Training pass ----
    Context.local.learningPhase = .training
    var lossAccumulator: Float = 0
    var stepCount = 0
    print("epochBatches.count: \(trainingBatches.count)")

    for trainingBatch in trainingBatches {
        let inputs = trainingBatch.data
        let targets = Tensor<Int32>(trainingBatch.label)
        let (stepLoss, stepGradients) = valueWithGradient(at: bertClassifier) {
            classifier -> Tensor<Float> in
            return softmaxCrossEntropy(logits: classifier(inputs), labels: targets)
        }

        lossAccumulator += stepLoss.scalarized()
        stepCount += 1
        optimizer.update(&bertClassifier, along: stepGradients)

        // Running mean of the loss over the steps taken so far in this epoch.
        print("  Training loss: \(lossAccumulator / Float(stepCount))")
    }

    // ---- Validation pass ----
    print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
    Context.local.learningPhase = .inference
    var evalLossAccumulator: Float = 0
    var evalStepCount = 0
    var hits = 0
    var seen = 0

    for validationBatch in dataset.validationBatches {
        let inputs = validationBatch.data
        let targets = Tensor<Int32>(validationBatch.label)
        let logits = bertClassifier(inputs)

        evalLossAccumulator += softmaxCrossEntropy(logits: logits, labels: targets).scalarized()
        evalStepCount += 1

        // Element-wise comparison of the arg-max class with the expected label.
        let isCorrect = logits.argmax(squeezingAxis: 1) .== targets
        hits += Int(Tensor<Int32>(isCorrect).sum().scalarized())
        // The leading dimension of the token-id tensor is used as the example count.
        seen += inputs.tokenIds.shape[0]
    }

    let accuracy = Float(hits) / Float(seen)
    let meanEvalLoss = evalLossAccumulator / Float(evalStepCount)
    print("Accuracy: \(hits)/\(seen) (\(accuracy)) Eval loss: \(meanEvalLoss)")
}

Training BERT for the Language2Label task!
[Epoch 1]
epochBatches.count: 35
  Training loss: 0.75371706
  Training loss: 0.7014136
  Training loss: 0.66437346
  Training loss: 0.6026812
  Training loss: 0.6085679
  Training loss: 0.59096843
  Training loss: 0.59411484
  Training loss: 0.58522356
  Training loss: 0.579469
  Training loss: 0.56056
  Training loss: 0.55604786
  Training loss: 0.5380627
  Training loss: 0.54076207
  Training loss: 0.5436632
  Training loss: 0.53137785
  Training loss: 0.5426758
  Training loss: 0.547717
  Training loss: 0.5518289
  Training loss: 0.5500601
  Training loss: 0.5412386
  Training loss: 0.5422206
  Training loss: 0.54708993
  Training loss: 0.5429764
  Training loss: 0.5422126
  Training loss: 0.53946966
  Training loss: 0.53733534
  Training loss: 0.5341239
  Training loss: 0.53327227
  Training loss: 0.5280953
  Training loss: 0.5234589
  Training loss: 0.5168105
  Training loss: 0.51504934
  Training loss: 0.5105922
  Training loss: 0.51094

In [17]:
/// A single classification result, as built by `predict(_:bertClassifier:)`.
struct Prediction {
    /// Index of the predicted class.
    public let classIdx: Int
    /// Name of the predicted class.
    public let className: String
    /// Probability assigned to the predicted class.
    public let probability: Float
}

In [18]:
// TODO: get num_best preds
/// Classifies each text and returns the single most likely class for it.
///
/// Reads the notebook-level globals `maxSequenceLength`, `batchSize` and
/// `dataset` (the latter only for `dataset.labels`, to turn a class index
/// into a name). Progress is printed to standard output.
///
/// The learning phase is switched to `.inference` for the duration of the
/// call and restored afterwards, so the result does not depend on whatever
/// phase the caller left behind (the training loop sets `.training` and only
/// resets it when it reaches its validation pass).
///
/// - Parameters:
///   - texts: Raw sentences to classify.
///   - bertClassifier: Classifier used for preprocessing (`bertClassifier.bert`)
///     and for inference.
/// - Returns: One `Prediction` per input text, appended batch by batch.
func predict(_ texts: [String], bertClassifier: BERTClassifier) -> [Prediction] {
    print("predict()")
    print("texts: \(texts.count)")

    // Encode every text on its own; batching and padding happen below.
    let validationExamples = texts.map { text -> TextBatch in
        bertClassifier.bert.preprocess(
            sequences: [text],
            maxSequenceLength: maxSequenceLength
        )
    }

    print("validationExamples.count: \(validationExamples.count)")

    print("batchSize: \(batchSize)")
    print("maxSequenceLength: \(maxSequenceLength)")
    print("batchSize / maxSequenceLength: \(batchSize / maxSequenceLength)")

    // Never ask for batches of zero examples when batchSize < maxSequenceLength.
    let examplesPerBatch = max(1, batchSize / maxSequenceLength)
    let validationBatches = validationExamples.inBatches(of: examplesPerBatch).map {
        $0.paddedAndCollated(to: maxSequenceLength)
    }
    print("validationBatches: \(validationBatches.count)")

    // Force inference mode for the forward passes, then restore the caller's phase.
    let previousLearningPhase = Context.local.learningPhase
    Context.local.learningPhase = .inference
    defer { Context.local.learningPhase = previousLearningPhase }

    var preds: [Prediction] = []
    preds.reserveCapacity(texts.count)
    for batch in validationBatches {
        print("batch")
        let logits = bertClassifier(batch)
        let probs = softmax(logits, alongAxis: 1)
        // Copy the arg-max indices to the host once instead of one tensor read per row.
        let predictedClasses = logits.argmax(squeezingAxis: 1).scalars
        for (row, predictedClass) in predictedClasses.enumerated() {
            let classIdx = Int(predictedClass)
            preds.append(
                Prediction(
                    classIdx: classIdx,
                    className: dataset.labels[classIdx],
                    probability: probs[row, classIdx].scalarized()))
        }
    }
    return preds
}

// A few hand-written sentences for quick manual checks.
let texts: [String] = [
    "A person is walking forwards.",
    "A person walks 4 steps forward.",
    "A person walks in a circle counter clockwise.",
    "A person getting done on their knees",
]

## Run inference on the whole dataset

In [5]:
// Bind pandas explicitly: `pd` is not defined in any visible cell, so this
// cell would otherwise depend on an earlier, unseen definition.
let pd = Python.import("pandas")

// Load the labelled CSV into a pandas DataFrame.
let dsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels_ds_v2_manual_labels.csv")
let df = pd.read_csv(dsURL.path)

In [6]:
// Distinct values of the `label` column, sorted, converted to Swift strings.
// The conversion is force-unwrapped, so a value that cannot become a String traps.
let labels = df.label.unique().sorted().map { labelObject -> String in
    return String(labelObject)!
}
labels

▿ 5 elements
  - 0 : "Doing something"
  - 1 : "Walking and turning"
  - 2 : "Walking backwards"
  - 3 : "Walking few steps"
  - 4 : "Walking or running"


In [7]:
// Every value of the `text` column as Swift strings (force-unwrapped conversion).
// .iloc[0..<2000]
let texts2 = [String](df.text.to_list())!
texts2.count

3012


In [8]:
// Number of rows per label, to show how the classes are distributed.
df.label.value_counts()

Doing something        1216
Walking or running      649
Walking and turning     644
Walking few steps       400
Walking backwards       103
Name: label, dtype: int64


In [23]:
// Classify every text loaded from the CSV with `bertClassifier`.
let preds2 = predict(texts2, bertClassifier: bertClassifier)

predict()
texts: 3012
validationExamples.count: 3012
batchSize: 2048
maxSequenceLength: 30
batchSize / maxSequenceLength: 68
validationBatches: 45
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
