In [None]:
from tokenizers import Tokenizer
from bastionlab.polars.policy import Policy, Aggregation, Log
from bastionlab.polars import train_test_split
import polars as pl
from bastionlab import Connection
from bastionlab.tokenizers import RemoteTokenizer
from bastionlab.torch.remote_torch import RemoteDataset
from bastionlab.torch.learner import RemoteLearner


file_path = "./SMSSpamCollection"

# Load the headerless, tab-separated file with Polars, naming its two columns
# `label` and `text` (in that order), then overwrite `label` with an integer
# flag: 1 where the original value is "spam", 0 for every other value.
df = pl.read_csv(
    file_path,
    has_header=False,
    sep="\t",
    new_columns=["label", "text"],
).with_column(
    pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
)

# Display the first rows of the DataFrame as the cell output.
df.head()

In [None]:
# Token length shared by padding and truncation. Both settings previously
# repeated the literal 32; a single constant keeps them from drifting apart.
MAX_TOKENS = 32
# Number of leading rows of `df` that are uploaded to the server.
UPLOAD_ROWS = 64

# Build the remote tokenizer from the pretrained DistilBERT vocabulary and make
# it pad to, and truncate at, the same length.
tokenizer = RemoteTokenizer.from_hugging_face_pretrained("distilbert-base-uncased")
tokenizer.enable_padding(length=MAX_TOKENS)
tokenizer.enable_truncation(max_length=MAX_TOKENS)

# Open the connection to the BastionLab server; later cells reuse `connection`.
connection = Connection("localhost")

# Upload only the first UPLOAD_ROWS rows and keep the remote handle in `rdf`.
rdf = connection.client.polars.send_df(df.limit(UPLOAD_ROWS))

# Shuffle the remote DataFrame and split it, requesting test_size=0.2.
train_rdf, test_rdf = train_test_split(rdf, test_size=0.2, shuffle=True)

# Extract the `text` column of each split as an array (train first, then test).
train_inputs, test_inputs = (
    split.select("text").to_array() for split in (train_rdf, test_rdf)
)

# Extract the `label` column of each split and convert it to a tensor.
train_label, test_label = (
    split.select("label").to_array().to_tensor()
    for split in (train_rdf, test_rdf)
)

# Tokenize the `text` arrays; each `encode` call returns a pair.
# NOTE(review): presumably (token ids, attention mask) — confirm against RemoteTokenizer.
train_ids, train_mask = tokenizer.encode(train_inputs)
test_ids, test_mask = tokenizer.encode(test_inputs)

# Bundle the tokenizer outputs and labels into the train and test RemoteDatasets.
train_rds = RemoteDataset(labels=train_label, inputs=[train_ids, train_mask])
test_rds = RemoteDataset(labels=test_label, inputs=[test_ids, test_mask])

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from bastionlab.torch.utils import MultipleOutputWrapper

# Load the pretrained DistilBERT sequence classifier with two labels
# (attention and hidden-state outputs disabled, TorchScript mode enabled) and
# wrap it in MultipleOutputWrapper with index 0.
# NOTE(review): index 0 presumably selects the first of the model's outputs — confirm.
model = MultipleOutputWrapper(
    DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
        torchscript=True,
    ),
    0,
)

In [None]:
from bastionlab.torch.optimizer_config import Adam

# Create the learner through the torch client of the open connection. It is
# given the wrapped model and the training dataset, and configured with
# cross-entropy loss, an Adam optimizer (lr=5e-5) and a maximum batch size of 2.
torch_client = connection.client.torch
remote_learner = torch_client.RemoteLearner(
    model,
    train_rds,
    model_name="DistilBERT",
    optimizer=Adam(lr=5e-5),
    loss="cross_entropy",
    max_batch_size=2,
)

In [None]:
# Call `fit` on the learner for a single epoch.
# NOTE(review): the trailing `poll_delay=1.0` is a disabled optional argument
# left over in a comment — confirm whether it should be restored or removed.
remote_learner.fit(nb_epochs=1)  # , poll_delay=1.0)

In [None]:
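# Disabled alternative example: train a ResNet50 on CIFAR10 through RemoteLearner.
# Every line in this cell is commented out, so nothing below is executed.
# from torchvision.datasets import CIFAR10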
# from torchvision.models import resnet50, ResNet50_Weights
# from torch.utils.data import DataLoader
# from torchvision.transforms import ToTensor, Normalize, Compose
# from bastionlab.torch.remote_torch import RemoteDataset
# import torch
# from bastionlab import Connection


# client = Connection("localhost").client.torch


# transform = Compose(
#     [
#         ToTensor(),
#         Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
#         lambda x: [x],
#     ]
# )

# train = CIFAR10("data", train=True, transform=transform, download=True)

# dataset = RemoteDataset.from_dataset(train)

# model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# remote_learner = client.RemoteLearner(
#     model,
#     dataset,
#     max_batch_size=4,
#     loss="cross_entropy",
#     model_name="ResNet50",
# )

# remote_learner.fit(nb_epochs=1)