In [9]:
from tokenizers import Tokenizer
from bastionlab.polars.policy import Policy, Aggregation, Log
from bastionlab.polars import train_test_split
import polars as pl
from bastionlab import Connection
from bastionlab.tokenizers import RemoteTokenizer
from bastionlab.torch.remote_torch import RemoteDataset
from bastionlab.torch.learner import RemoteLearner


file_path = "./SMSSpamCollection"

# Read CSV file using Polars and rename columns with `text`, `label`
df = pl.read_csv(file_path, has_header=False, sep="\t", new_columns=["label", "text"])

# Transform `spam` labels to `1` and `0` for any other column label
df = df.with_column(
    pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
)

# View the first few elements of the DataFrame
df.head()

label,text
i64,str
0,"""Go until juron..."
0,"""Ok lar... Joki..."
1,"""Free entry in ..."
0,"""U dun say so e..."
0,"""Nah I don't th..."


In [10]:
tokenizer = RemoteTokenizer.from_hugging_face_pretrained("distilbert-base-uncased")
tokenizer.enable_padding(length=32)
tokenizer.enable_truncation(max_length=32)

connection = Connection("localhost")

policy = Policy(safe_zone=Aggregation(min_agg_size=10), unsafe_handling=Log())


rdf = connection.client.polars.send_df(df.limit(64), policy=policy)

# Split dataframe into train and test sets.
train_rdf, test_rdf = train_test_split(rdf, test_size=0.2)

# Create Inputs(train, test) RemoteSeries objects
train_inputs = train_rdf.column("text")
test_inputs = test_rdf.column("text")


# Create Label(train, test) RemoteSeries objects
train_label = train_rdf.column("label").to_tensor()
test_label = test_rdf.column("label").to_tensor()

# Tokenize `text` fields
train_ids, train_mask = tokenizer.encode(train_inputs)
test_ids, test_mask = tokenizer.encode(test_inputs)

# Create train RemoteDataset
train_rds = RemoteDataset(
    inputs=[train_ids, train_mask], label=train_label, privacy_limit=1_000_000
)

# Create test RemoteDataset
test_rds = RemoteDataset(inputs=[test_ids, test_mask], label=test_label)

In [11]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from bastionlab.torch.utils import MultipleOutputWrapper

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    torchscript=True,
)
model = MultipleOutputWrapper(model, 0)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [12]:
from bastionlab.torch.optimizer_config import Adam

remote_learner = connection.client.torch.RemoteLearner(
    model,
    train_rds,
    max_batch_size=2,
    loss="cross_entropy",
    optimizer=Adam(lr=5e-5),
    expand=False,
    model_name="DistilBERT",
)

remote_learner.fit(nb_epochs=1, eps=6.0)  # , poll_delay=1.0)

Sending DistilBERT: 100%|████████████████████| 268M/268M [00:05<00:00, 50.9MB/s] 


identifier: "93bd90460bfe3249370bcf07f4a23c730c37490c2fa935b3d5020db8bf78044b"
name: "DistilBERT"

{"inputs": [{"identifier": "efb8fb86-876f-402b-b8b5-56885d592483"},{"identifier": "c46ddbad-8f6d-40da-bd50-ece29a82ee4e"}], "label": {"identifier": "510cc36c-af08-416a-a589-1a61061818e4"}, "nb_samples": 51, "privacy_limit": 1000000}


GRPCException: Internal server error: code=StatusCode.INTERNAL message=Torch error: invalid shape: Failed to stack per-sample gradients, are you using a model with expanded weights? Initial error: Internal torch error: stack expects each tensor to be equal size, but got [3072] at entry 0 and [] at entry 1
Exception raised from get_stack_inputs at ../aten/src/ATen/native/TensorShape.cpp:2345 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6b (0x7f7014502d4b in /home/kbamponsem/libtorch-13/libtorch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xce (0x7f70144fe6fe in /home/kbamponsem/libtorch-13/libtorch/lib/libc10.so)
frame #2: <unknown function> + 0x19f5e29 (0x7f7015f48e29 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #3: at::native::stack(c10::ArrayRef<at::Tensor>, long) + 0xaa (0x7f7015f4e09a in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #4: <unknown function> + 0x25fcb66 (0x7f7016b4fb66 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #5: at::_ops::stack::redispatch(c10::DispatchKeySet, c10::ArrayRef<at::Tensor>, long) + 0x98 (0x7f70164f5fd8 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x386a6a8 (0x7f7017dbd6a8 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0x386abb9 (0x7f7017dbdbb9 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #8: at::_ops::stack::call(c10::ArrayRef<at::Tensor>, long) + 0x220 (0x7f701652f410 in /home/kbamponsem/libtorch-13/libtorch/lib/libtorch_cpu.so)
frame #9: <unknown function> + 0x70c7e2 (0x55ce1a3a37e2 in ./bastionlab)
frame #10: <unknown function> + 0x3ad3d2 (0x55ce1a0443d2 in ./bastionlab)
frame #11: <unknown function> + 0x3af4c8 (0x55ce1a0464c8 in ./bastionlab)
frame #12: <unknown function> + 0x3abbd7 (0x55ce1a042bd7 in ./bastionlab)
frame #13: <unknown function> + 0x3b8ee4 (0x55ce1a04fee4 in ./bastionlab)
frame #14: <unknown function> + 0x3b9699 (0x55ce1a050699 in ./bastionlab)
frame #15: <unknown function> + 0x38abb6 (0x55ce1a021bb6 in ./bastionlab)
frame #16: <unknown function> + 0x39ecd5 (0x55ce1a035cd5 in ./bastionlab)
frame #17: <unknown function> + 0x39a0a6 (0x55ce1a0310a6 in ./bastionlab)
frame #18: <unknown function> + 0x18b9d0b (0x55ce1b550d0b in ./bastionlab)
frame #19: <unknown function> + 0x18b9372 (0x55ce1b550372 in ./bastionlab)
frame #20: <unknown function> + 0x18c9d79 (0x55ce1b560d79 in ./bastionlab)
frame #21: <unknown function> + 0x18b8e70 (0x55ce1b54fe70 in ./bastionlab)
frame #22: <unknown function> + 0x18cd892 (0x55ce1b564892 in ./bastionlab)
frame #23: <unknown function> + 0x18b045c (0x55ce1b54745c in ./bastionlab)
frame #24: <unknown function> + 0x18b281f (0x55ce1b54981f in ./bastionlab)
frame #25: <unknown function> + 0x18b5779 (0x55ce1b54c779 in ./bastionlab)
frame #26: <unknown function> + 0x18cc177 (0x55ce1b563177 in ./bastionlab)
frame #27: <unknown function> + 0x18bb92e (0x55ce1b55292e in ./bastionlab)
frame #28: <unknown function> + 0x19583d3 (0x55ce1b5ef3d3 in ./bastionlab)
frame #29: <unknown function> + 0x8609 (0x7f7014433609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #30: clone + 0x43 (0x7f7014203133 in /lib/x86_64-linux-gnu/libc.so.6)
