Server Trust: The setup assumes the server is trusted to hold the target (Survived) and not misuse it. If the server is untrusted or compromised, it could potentially infer information about client features by analyzing model outputs (e.g., through model inversion attacks). This is a privacy risk, not traditional data leakage in the machine learning sense (where the model learns from test data or future information).

Target Exposure: If the server shares Survived with clients or if clients inadvertently access it, this could be considered leakage in a privacy context. However, in the Flower example, Survived is explicitly removed from client data (partition.remove_columns(['Survived'])), preventing clients from accessing the target directly.

Inference Attacks: In VFL, the server’s knowledge of Survived and client model outputs could theoretically allow it to infer some client feature information (e.g., if a client’s output strongly correlates with Survived). This risk is mitigated by techniques like differential privacy or encryption, but the Flower example doesn’t implement these for simplicity.

# Import Libraries

In [18]:
# Import required libraries
from pathlib import Path
import numpy as np
import pandas as pd
# Show all columns
pd.set_option('display.max_columns', None)
from datasets import Dataset
from flwr_datasets.partitioner import IidPartitioner
import torch
import torch.nn as nn
import torch.nn.functional as F
import flwr as fl
from collections import OrderedDict
from typing import List, Tuple, Dict
from flwr.common import NDArrays, Scalar

# Define constant for number of vertical splits (3 parties)
NUM_VERTICAL_SPLITS = 3

In [19]:
pd.read_csv('train.csv')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data Processing Functions

bin age

In [20]:
# bin age
def _bin_age(age_series):
    bins = [-np.inf, 10, 40, np.inf]
    labels = ['Child', 'Adult', 'Elderly']
    return pd.cut(age_series, bins=bins, labels=labels, right=True).astype(str).replace('nan', 'Unknown')

In [21]:
# testing the function

In [22]:
ages = pd.Series([5, 25, 50, np.nan])
ages

0     5.0
1    25.0
2    50.0
3     NaN
dtype: float64

In [23]:
_bin_age(ages)

0      Child
1      Adult
2    Elderly
3    Unknown
dtype: object

extract titles

In [24]:
# pd.set_option('future.no_silent_downcasting', True)

def _extract_title(name_series):
    # Fix regex pattern (remove leading space)
    titles = name_series.str.extract(r'([A-Za-z]+)\.', expand=False)

    rare_titles = {'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'}
    titles = titles.replace(list(rare_titles), 'Rare').astype(str)  # Ensure type consistency

    titles = titles.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    return titles



In [25]:
# testing the function

In [26]:
names = pd.Series(["Mr. John Doe", "Dr. Jane Smith", "Col. George Brown", "Mlle. Alice"])
names

0         Mr. John Doe
1       Dr. Jane Smith
2    Col. George Brown
3          Mlle. Alice
dtype: object

In [27]:
titles = _extract_title(names)
titles

0      Mr
1    Rare
2    Rare
3    Miss
dtype: object

In [28]:
def _create_features(df):
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    df['Age'] = _bin_age(df['Age'])
    df['Cabin'] = df['Cabin'].str[0].fillna('Unknown')
    df['Title'] = _extract_title(df['Name'])

    # Drop irrelevant columns
    df.drop(columns=['Name', 'Ticket'], inplace=True)

    # Assign new UID before dropping PassengerId
    df["UID"] = [f"U{i:03d}" for i in range(1, len(df) + 1)]

    # Now drop PassengerId
    df.drop(columns=['PassengerId'], inplace=True)

    # One-hot encoding
    df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked', 'Title', 'Cabin', 'Age'])

    all_keywords = set(df.columns)
    return df, all_keywords

In [29]:
def process_dataset():
    
    df = pd.read_csv('train.csv')
    processed_df = df.dropna(subset=['Embarked', 'Fare']).copy()
    processed_df, all_keywords = _create_features(processed_df)

    return processed_df, all_keywords


In [30]:
# Display the full processed dataset
processed_df, keyw = process_dataset()
print('Processed Dataset Shape:', processed_df.shape)
print(keyw)
processed_df.head()

Processed Dataset Shape: (889, 31)
{'Pclass_2', 'Sex_female', 'Embarked_C', 'Fare', 'Survived', 'Cabin_C', 'Age_Adult', 'Pclass_1', 'Cabin_A', 'Pclass_3', 'Title_Mrs', 'Title_Rare', 'Cabin_G', 'Sex_male', 'Age_Unknown', 'Cabin_Unknown', 'UID', 'SibSp', 'Title_Mr', 'Embarked_Q', 'Embarked_S', 'Cabin_F', 'Title_Master', 'Cabin_T', 'Cabin_E', 'Title_Miss', 'Age_Child', 'Cabin_B', 'Age_Elderly', 'Cabin_D', 'Parch'}


Unnamed: 0,Survived,SibSp,Parch,Fare,UID,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_Unknown,Age_Adult,Age_Child,Age_Elderly,Age_Unknown
0,0,1,0,7.25,U001,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False
1,1,1,0,71.2833,U002,True,False,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False
2,1,0,0,7.925,U003,True,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False
3,1,1,0,53.1,U004,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False
4,0,0,0,8.05,U005,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False


# Partitioning Functions

In [14]:
# Split the dataset vertically into three feature sets and apply horizontal partitioning
def _partition_data_vertically(df, all_keywords):
    partitions = []
    keywords_sets = [{'Parch', 'Cabin', 'Pclass'}, {'Sex', 'Title'}]
    keywords_sets.append(all_keywords - keywords_sets[0] - keywords_sets[1])
    
    for keywords in keywords_sets:
        partitions.append(
            df[list({col for col in df.columns for kw in keywords if kw in col or 'Survived' in col})]
        )
    return partitions

In [15]:

def load_data(partition_id: int, num_partitions: int):
    processed_df, features_set = process_dataset()
    v_partitions = _partition_data_vertically(processed_df, features_set)
    v_split_id = np.mod(partition_id, NUM_VERTICAL_SPLITS)
    v_partition = v_partitions[v_split_id]
    
    dataset = Dataset.from_pandas(v_partition)
    num_h_partitions = max(1, int(np.ceil(num_partitions / NUM_VERTICAL_SPLITS)))
    partitioner = IidPartitioner(num_partitions=num_h_partitions)
    partitioner.dataset = dataset
    
    # Load partition and ensure it's a Dataset
    partition = partitioner.load_partition(partition_id % num_h_partitions)
    if not isinstance(partition, Dataset):
        raise ValueError(f"Expected partition to be a Dataset, got {type(partition)}")
    
    # Store labels before removing Survived
    labels = pd.Series(partition['Survived']) if 'Survived' in partition.column_names else None
    if 'Survived' in partition.column_names:
        partition = partition.remove_columns(['Survived'])
    
    # Convert to pandas and drop index column if present
    partition_df = partition.to_pandas()
    if '__index_level_0__' in partition_df.columns:
        partition_df = partition_df.drop(columns=['__index_level_0__'])
    
    # Convert boolean columns to float32 and ensure numeric columns are float32
    for col in partition_df.columns:
        if partition_df[col].dtype == bool:
            partition_df[col] = partition_df[col].astype(np.float32)
        elif partition_df[col].dtype in [np.int64, np.int32]:
            partition_df[col] = partition_df[col].astype(np.float32)
    
    return partition_df, labels, v_split_id

# Display Full Processed Dataset

In [16]:
# Display the full processed dataset
processed_df, _ = process_dataset()
print('Processed Dataset Shape:', processed_df.shape)
print('\nFirst 5 rows of the full processed dataset:')
processed_df.head()


Processed Dataset Shape: (889, 31)

First 5 rows of the full processed dataset:


Unnamed: 0,PassengerId,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_Unknown,Age_Adult,Age_Child,Age_Elderly,Age_Unknown
0,1,0,1,0,7.25,False,True,False,False,True,...,False,False,False,False,False,True,True,False,False,False
1,2,1,1,0,71.2833,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
2,3,1,0,0,7.925,True,False,False,False,True,...,False,False,False,False,False,True,True,False,False,False
3,4,1,1,0,53.1,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,5,0,0,0,8.05,False,True,False,False,True,...,False,False,False,False,False,True,True,False,False,False


In [17]:
print('\nDistribution of Survived (Target):')
processed_df['Survived'].value_counts(normalize=True)


Distribution of Survived (Target):


Survived
0    0.617548
1    0.382452
Name: proportion, dtype: float64

In [18]:
# Display data for Party 0
num_partitions = 3
partition_df, labels, v_split_id = load_data(0, num_partitions)
print(f'\nParty 0 (Vertical Split {v_split_id}, Shape: {partition_df.shape}):')
print(f'Columns: {list(partition_df.columns)}')
partition_df.head()


Party 0 (Vertical Split 0, Shape: (889, 13)):
Columns: ['Cabin_A', 'Parch', 'Cabin_C', 'Pclass_2', 'Cabin_G', 'Cabin_T', 'Pclass_1', 'Cabin_F', 'Cabin_B', 'Cabin_Unknown', 'Cabin_E', 'Pclass_3', 'Cabin_D']


Unnamed: 0,Cabin_A,Parch,Cabin_C,Pclass_2,Cabin_G,Cabin_T,Pclass_1,Cabin_F,Cabin_B,Cabin_Unknown,Cabin_E,Pclass_3,Cabin_D
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
import pandas as pd
import numpy as np
import hashlib
from datasets import Dataset
from flwr_datasets.partitioner import IidPartitioner

# Constants
SALT = "MY_SECRET_SALT"
NUM_VERTICAL_SPLITS = 3

# Preprocessing
def _bin_age(age_series):
    bins = [-np.inf, 10, 40, np.inf]
    labels = ['Child', 'Adult', 'Elderly']
    return pd.cut(age_series, bins=bins, labels=labels, right=True).astype(str).replace('nan', 'Unknown')

def _extract_title(name_series):
    titles = name_series.str.extract(r'([A-Za-z]+)\.', expand=False)
    rare_titles = {'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'}
    titles = titles.replace(list(rare_titles), 'Rare').astype(str)
    titles = titles.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    return titles

def _create_features(df):
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    df['Age'] = _bin_age(df['Age'])
    df['Cabin'] = df['Cabin'].str[0].fillna('Unknown')
    df['Title'] = _extract_title(df['Name'])
    df.drop(columns=['Name', 'Ticket'], inplace=True)
    all_keywords = set(df.columns)
    df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked', 'Title', 'Cabin', 'Age'])
    return df, all_keywords

def process_dataset():
    df = pd.read_csv("train.csv").dropna(subset=['Embarked', 'Fare']).copy()
    df['UID'] = [f"U{i:04d}" for i in range(len(df))]  # Add unique IDs
    processed_df, feature_set = _create_features(df)
    processed_df['UID'] = df['UID']  # Add UID back to processed version
    return processed_df, feature_set

# Hashing for secure entity alignment
def hash_ids(uid_list, salt):
    return {
        hashlib.sha256((uid + salt).encode()).hexdigest(): uid
        for uid in uid_list
    }

# Partition vertically
def _partition_data_vertically(df, all_keywords):
    partitions = []
    keywords_sets = [{'Parch', 'Cabin', 'Pclass'}, {'Sex', 'Title'}]
    keywords_sets.append(all_keywords - keywords_sets[0] - keywords_sets[1])
    for keywords in keywords_sets:
        partitions.append(
            df[list({col for col in df.columns for kw in keywords if kw in col or 'Survived' in col or 'UID' in col})]
        )
    return partitions

# Entity alignment simulation and partitioning
def simulate_entity_alignment():
    df, feature_set = process_dataset()
    all_uids = df['UID'].tolist()
    np.random.seed(42)
    
    # Simulate partial overlap
    shared_uids = np.random.choice(all_uids, size=875, replace=False)
    uids_A = list(shared_uids) + list(np.random.choice(list(set(all_uids) - set(shared_uids)), size=10, replace=False))
    uids_B = list(shared_uids) + list(np.random.choice(list(set(all_uids) - set(shared_uids)), size=5, replace=False))

    # Hash and align
    hashed_A = hash_ids(uids_A, SALT)
    hashed_B = hash_ids(uids_B, SALT)
    common_hashes = set(hashed_A.keys()) & set(hashed_B.keys())
    aligned_ids = [hashed_A[h] for h in common_hashes]

    # Filter original data to aligned records only
    aligned_df = df[df['UID'].isin(aligned_ids)].reset_index(drop=True)
    print(f"✅ Aligned records: {len(aligned_df)}")

    # Partition aligned data
    v_partitions = _partition_data_vertically(aligned_df, feature_set)
    return v_partitions

# Load one party's data
def load_party_data(party_id: int, total_clients: int):
    v_partitions = simulate_entity_alignment()
    v_split_id = party_id % NUM_VERTICAL_SPLITS
    v_partition = v_partitions[v_split_id]

    dataset = Dataset.from_pandas(v_partition)
    num_h_partitions = max(1, int(np.ceil(total_clients / NUM_VERTICAL_SPLITS)))
    partitioner = IidPartitioner(num_partitions=num_h_partitions)
    partitioner.dataset = dataset

    partition = partitioner.load_partition(party_id % num_h_partitions)
    if not isinstance(partition, Dataset):
        raise ValueError(f"Expected Dataset, got {type(partition)}")

    labels = pd.Series(partition['Survived']) if 'Survived' in partition.column_names else None
    if 'Survived' in partition.column_names:
        partition = partition.remove_columns(['Survived'])

    df = partition.to_pandas()
    if '__index_level_0__' in df.columns:
        df.drop(columns=['__index_level_0__'], inplace=True)
    
    for col in df.columns:
        if df[col].dtype == bool:
            df[col] = df[col].astype(np.float32)
        elif df[col].dtype in [np.int32, np.int64]:
            df[col] = df[col].astype(np.float32)

    return df, labels, v_split_id

# Example usage
if __name__ == "__main__":
    party_df, labels, split_id = load_party_data(2, 3)
    print(f"\n🔍 Party 0 data (Vertical Split {split_id}):")
    print(party_df.head())


✅ Aligned records: 879

🔍 Party 0 data (Vertical Split 2):
     UID     Fare  Embarked_S  PassengerId  SibSp  Age_Adult  Age_Child  \
0  U0000   7.2500         1.0          1.0    1.0        1.0        0.0   
1  U0001  71.2833         0.0          2.0    1.0        1.0        0.0   
2  U0002   7.9250         1.0          3.0    0.0        1.0        0.0   
3  U0003  53.1000         1.0          4.0    1.0        1.0        0.0   
4  U0004   8.0500         1.0          5.0    0.0        1.0        0.0   

   Embarked_Q  Age_Elderly  Age_Unknown  Embarked_C  
0         0.0          0.0          0.0         0.0  
1         0.0          0.0          0.0         1.0  
2         0.0          0.0          0.0         0.0  
3         0.0          0.0          0.0         0.0  
4         0.0          0.0          0.0         0.0  
