In [1]:
from datasets import load_dataset
from flwr_datasets.partitioner import IidPartitioner
from typing import Union
import pandas as pd

# Function to load local CSV, partition it and return a pandas DataFrame for the partition

def local_load_data(partition_id: int, num_partitions: int, data_path="ai4i2020.csv") -> pd.DataFrame:
    """Return one partition of a local CSV file as a pandas DataFrame.

    The CSV is read through `datasets.load_dataset`, attached to an
    `IidPartitioner`, and the requested partition is converted to pandas format.

    Args:
        partition_id: Index of the partition to return (forwarded to
            `IidPartitioner.load_partition`).
        num_partitions: Number of partitions the partitioner is created with.
        data_path: Value forwarded as `data_files` to `datasets.load_dataset`
            (a CSV path, or a list of paths).

    Returns:
        A pandas DataFrame containing the rows of the requested partition.
    """
    loaded = load_dataset("csv", data_files=data_path)

    # A container of splits exposes `keys`: take "train" when it exists,
    # otherwise settle for whichever split comes first.
    if hasattr(loaded, "keys"):
        loaded = loaded["train"] if "train" in loaded else next(iter(loaded.values()))

    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    iid_partitioner.dataset = loaded

    # Materialise the selected partition as a DataFrame via a full slice
    selected = iid_partitioner.load_partition(partition_id=partition_id)
    return selected.with_format("pandas")[:]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Example usage: load partition 0 of a 10-way split and display it
df_part = local_load_data(partition_id=0, num_partitions=10)
df_part

Generating train split: 10000 examples [00:00, 233282.57 examples/s]


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,L48175,L,296.3,307.3,1566,35.8,175,0,0,0,0,0,0
996,997,M15856,M,296.3,307.2,1286,51.1,177,0,0,0,0,0,0
997,998,M15857,M,296.3,307.2,1446,45.9,180,0,0,0,0,0,0
998,999,M15858,M,296.4,307.2,2071,19.4,183,0,0,0,0,0,0


In [3]:
import numpy as np
from flwr.common import NDArrays
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner
from sklearn.linear_model import LogisticRegression

from datasets import load_dataset
from typing import Union
import pandas as pd

# This information is needed to create a correct scikit-learn model
UNIQUE_LABELS_AI4I = [0, 1]
FEATURES_AI4I = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]
LABEL_AI4I = "Machine failure"  # Target column
TRAIN_FRACTION_AI4I = 0.8  # Leading share of each partition used for training

fds_ai4i = None  # Cached partitioner (with its dataset already attached)
_fds_ai4i_key = None  # (repr(data_path), num_partitions) that `fds_ai4i` was built for


# The return annotation is quoted so it is never evaluated at definition time.
def load_data_ai4i(partition_id: int, num_partitions: int, data_path="ai4i2020.csv") -> "tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]":
    """Load one partition of a local CSV and split it into train/test arrays.

    The CSV is read with `datasets.load_dataset` and attached to an
    `IidPartitioner`. The partitioner is cached in the module-level
    `fds_ai4i` and reused while `data_path` and `num_partitions` stay the
    same; it is rebuilt when either of them changes, so a cached partitioner
    created for a different configuration is never reused.

    Args:
        partition_id: Index of the partition to return (forwarded to
            `IidPartitioner.load_partition`).
        num_partitions: Number of partitions the partitioner is created with.
        data_path: Value forwarded as `data_files` to `datasets.load_dataset`
            (a CSV path, or a list of paths).

    Returns:
        A tuple `(X_train, y_train, X_test, y_test)` of NumPy arrays. `X_*`
        hold the `FEATURES_AI4I` columns and `y_*` the `LABEL_AI4I` column.
        The first `TRAIN_FRACTION_AI4I` of the partition's rows (in their
        stored order, without shuffling) form the training set and the
        remaining rows form the test set.
    """
    global fds_ai4i, _fds_ai4i_key

    # `repr` snapshots the path argument, so list-valued (unhashable, mutable)
    # inputs can still be compared against the cached configuration.
    cache_key = (repr(data_path), num_partitions)
    if fds_ai4i is None or _fds_ai4i_key != cache_key:
        loaded = load_dataset("csv", data_files=data_path)
        # A container of splits exposes `keys`: take "train" when it exists,
        # otherwise fall back to the first split.
        if hasattr(loaded, "keys"):
            loaded = loaded["train"] if "train" in loaded else next(iter(loaded.values()))

        partitioner = IidPartitioner(num_partitions=num_partitions)
        partitioner.dataset = loaded
        # Publish the cache only once the partitioner is fully set up, so a
        # failure above never leaves a half-initialised object behind.
        fds_ai4i, _fds_ai4i_key = partitioner, cache_key

    partition_df = fds_ai4i.load_partition(partition_id=partition_id).with_format("pandas")[:]

    features = partition_df[FEATURES_AI4I].to_numpy()
    labels = partition_df[LABEL_AI4I].to_numpy()

    # Split the on-edge data positionally: leading rows train, trailing rows test
    split_at = int(TRAIN_FRACTION_AI4I * len(partition_df))
    return features[:split_at], labels[:split_at], features[split_at:], labels[split_at:]

In [4]:
# Load partition 0 of a 2-way split; the two test arrays are discarded here
X_train, y_train, _, _ = load_data_ai4i(partition_id=0, num_partitions=2)
# Print the shapes of the training features and training labels
print(X_train.shape, y_train.shape)

(4000, 5) (4000,)
