In [1]:
from tokenizers import Tokenizer
from bastionlab.polars.policy import Policy, Aggregation, Log
from bastionlab.polars import train_test_split
import polars as pl
from bastionlab import Connection
from bastionlab.tokenizers import RemoteTokenizer
from bastionlab.torch.remote_torch import RemoteDataset


file_path = "./SMSSpamCollection"

# Read CSV file using Polars and rename columns with `text`, `label`
df = pl.read_csv(file_path, has_header=False, sep="\t", new_columns=["label", "text"])

# Transform `spam` labels to `1` and `0` for any other column label
df = df.with_column(
    pl.when(pl.col("label") == "spam").then(1).otherwise(0).alias("label")
)

# View the first few elements of the DataFrame
df.head()

  from .autonotebook import tqdm as notebook_tqdm


label,text
i64,str
0,"""Go until juron..."
0,"""Ok lar... Joki..."
1,"""Free entry in ..."
0,"""U dun say so e..."
0,"""Nah I don't th..."


In [2]:
tokenizer = RemoteTokenizer.from_hugging_face_pretrained("distilbert-base-uncased")
tokenizer.enable_padding(length=32)
tokenizer.enable_truncation(max_length=32)

connection = Connection("localhost")

policy = Policy(safe_zone=Aggregation(min_agg_size=10), unsafe_handling=Log())


rdf = connection.client.polars.send_df(df, policy=policy)

rs = rdf.column("text")
label = rdf.column("label").to_tensor()
input_ids, attention_mask = tokenizer.encode(rs)

print(input_ids, attention_mask, label)

rds = RemoteDataset(inputs=[input_ids, attention_mask], label=label)

print(rds)

RemoteTensor(identifier=04798f9e-4a30-437e-95f2-8f04ff5af9e1, dtype=torch.int64, shape=torch.Size([4774, 32])) RemoteTensor(identifier=70f3bbf7-0ffe-47d0-a314-c86b1d6b29d2, dtype=torch.int64, shape=torch.Size([4774, 32])) RemoteTensor(identifier=4c43767d-b17c-4754-8e64-fc7bd6fb526f, dtype=torch.int64, shape=torch.Size([4774]))
RemoteDataset(name=RemoteDataset-e3b0c, privacy_limit=-1.0)
	└──inputs: RemoteTensor(identifier=04798f9e-4a30-437e-95f2-8f04ff5af9e1, dtype=torch.int64, shape=torch.Size([4774, 32]))
		└──RemoteTensor(identifier=70f3bbf7-0ffe-47d0-a314-c86b1d6b29d2, dtype=torch.int64, shape=torch.Size([4774, 32]))
	└──label: RemoteTensor(identifier=4c43767d-b17c-4754-8e64-fc7bd6fb526f, dtype=torch.int64, shape=torch.Size([4774]))
