In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Load the HCC dataset
# NOTE(review): `delimiter` and `decimal` are both ",". That combination can
# only yield float columns if decimal-comma values are quoted in the file;
# if the file uses "." decimals, drop `decimal=","` so those columns are not
# left as strings. Confirm against the actual CSV before changing it.
df = pd.read_csv("hcc-data-complete-balanced.csv", delimiter=",", decimal=",")

# Quick check of the data
print(df.head())
print(df.columns)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

# Separate the feature matrix from the target column
X = df.drop(columns=["Class"])
y = df["Class"]



   Gender  Symptoms  Alcohol  HBsAg  HBeAg  HBcAb  HCVAb  Cirrhosis  Endemic  \
0       1         0        1      0      0      0      0          1        0   
1       0         0        0      0      0      0      1          1        0   
2       1         0        1      1      0      1      0          1        0   
3       1         1        1      0      0      0      0          1        0   
4       1         1        1      1      0      1      0          1        0   

   Smoking  ...  ALP   TP  Creatinine  Nodule  Major_Dim  Dir_Bil   Iron  \
0        1  ...  150  7.1        0.70       1        3.5     0.50   52.5   
1        1  ...  120  7.0        0.58       1        1.8     0.85   32.0   
2        1  ...  109  7.0        2.10       5       13.0     0.10   28.0   
3        1  ...  174  8.1        1.11       2       15.7     0.20  131.0   
4        1  ...  109  6.9        1.80       1        9.0     0.10   59.0   

    Sat  Ferritin  Class  
0  37.0     856.0      1  
1  10.0 

In [25]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class distribution the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Re-attach the label so each partition is one self-contained frame
train_df = X_train.assign(Class=y_train)
test_df = X_test.assign(Class=y_test)

num_hospitals = 20

# Shuffle the training rows before handing them out to the hospitals
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

chunk_size = len(train_df) // num_hospitals

# Cut points: one every `chunk_size` rows, with the final cut at the end of
# the frame so the last hospital also receives the leftover rows.
boundaries = [k * chunk_size for k in range(num_hospitals)] + [len(train_df)]
hospital_data = [
    train_df.iloc[lo:hi].copy()
    for lo, hi in zip(boundaries[:-1], boundaries[1:])
]

# hospital_data now holds `num_hospitals` DataFrames, each with the feature
# columns plus the Class column.
print(f"Total training samples: {len(train_df)}")
print([len(d) for d in hospital_data])


Total training samples: 163
[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 11]


# Part 2: Federated Learning

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def my_forest_train(X, y, n_estimators=100, random_state=42):
    """
    Fit a RandomForestClassifier on (X, y) and return the fitted estimator.

    n_estimators sets the number of trees; random_state is forwarded to the
    classifier as its seed.
    """
    forest_options = {
        "n_estimators": n_estimators,
        "random_state": random_state,
    }
    forest = RandomForestClassifier(**forest_options)
    forest.fit(X, y)
    return forest

def my_forest_predict(model, X):
    """
    Return whatever model.predict produces for the feature rows in X.
    """
    predicted_labels = model.predict(X)
    return predicted_labels

def evaluate_accuracy(y_true, y_pred):
    """
    Score predictions against the reference labels with accuracy_score.
    """
    return accuracy_score(y_true=y_true, y_pred=y_pred)


In [27]:
# One forest per hospital, fitted only on that hospital's rows. The seed is
# offset by the hospital index so each site gets a different random state.
local_forest_models = [
    my_forest_train(
        site_df.drop(columns=["Class"]),
        site_df["Class"],
        n_estimators=50,
        random_state=42 + site_idx,
    )
    for site_idx, site_df in enumerate(hospital_data)
]


In [28]:
# Split the held-out frame back into features and labels
X_test_only = test_df.drop(columns=["Class"])
y_test_only = test_df["Class"]

# Score every hospital's model on the same shared test set
local_accuracies = [
    evaluate_accuracy(y_test_only, my_forest_predict(site_model, X_test_only))
    for site_model in local_forest_models
]

for hospital_no, acc_local in enumerate(local_accuracies, start=1):
    print(f"Hospital {hospital_no} - Local Model Accuracy on Test Set: {acc_local:.4f}")

mean_local_accuracy = sum(local_accuracies) / len(local_accuracies)
print(f"\nAverage local model accuracy: {mean_local_accuracy:.4f}")


Hospital 1 - Local Model Accuracy on Test Set: 0.5854
Hospital 2 - Local Model Accuracy on Test Set: 0.5122
Hospital 3 - Local Model Accuracy on Test Set: 0.6341
Hospital 4 - Local Model Accuracy on Test Set: 0.5122
Hospital 5 - Local Model Accuracy on Test Set: 0.5122
Hospital 6 - Local Model Accuracy on Test Set: 0.4878
Hospital 7 - Local Model Accuracy on Test Set: 0.6098
Hospital 8 - Local Model Accuracy on Test Set: 0.5122
Hospital 9 - Local Model Accuracy on Test Set: 0.7561
Hospital 10 - Local Model Accuracy on Test Set: 0.5122
Hospital 11 - Local Model Accuracy on Test Set: 0.5854
Hospital 12 - Local Model Accuracy on Test Set: 0.6098
Hospital 13 - Local Model Accuracy on Test Set: 0.7073
Hospital 14 - Local Model Accuracy on Test Set: 0.6829
Hospital 15 - Local Model Accuracy on Test Set: 0.5854
Hospital 16 - Local Model Accuracy on Test Set: 0.7317
Hospital 17 - Local Model Accuracy on Test Set: 0.6585
Hospital 18 - Local Model Accuracy on Test Set: 0.5610
Hospital 19 - Local

In [29]:
import numpy as np

def federated_predict(models, X):
    """
    Combine the predictions of several models by per-sample majority vote.

    Parameters
    ----------
    models : iterable
        Fitted models accepted by my_forest_predict. Must not be empty.
    X : features passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        One label per sample: the label predicted by the most models.
        When several labels tie for the most votes, the smallest label wins,
        because np.unique returns labels sorted and np.argmax picks the first
        maximum.

    Raises
    ------
    ValueError
        If `models` is empty (there is nothing to vote with).
    """
    models = list(models)
    if not models:
        raise ValueError("federated_predict requires at least one model")

    # shape: (num_models, num_samples)
    all_preds = np.array([my_forest_predict(m, X) for m in models])

    final_preds = []
    # Transposing yields one row per sample holding every model's vote
    for votes in all_preds.T:
        values, counts = np.unique(votes, return_counts=True)
        final_preds.append(values[np.argmax(counts)])

    return np.array(final_preds)

# Score the majority-vote ensemble on the held-out test split
y_pred_federated = federated_predict(models=local_forest_models, X=X_test_only)
federated_accuracy = evaluate_accuracy(y_true=y_test_only, y_pred=y_pred_federated)
print(f"\nFederated Model Accuracy on Test Set: {federated_accuracy:.4f}")



Federated Model Accuracy on Test Set: 0.8537


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
import copy
import numpy as np

# Seed NumPy and PyTorch's global RNGs up front so that the client partition,
# the DataLoader shuffling and later torch random draws are the same on every
# Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1) Load the MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset  = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Hyperparameters
num_clients = 5     # e.g. 5 "hospitals"/clients
batch_size = 64
lr = 0.01
num_epochs_local = 1  # local epochs each round
num_rounds = 5        # how many federated rounds

from torch.utils.data import DataLoader, Subset

# Shuffle the training dataset indices (reproducible thanks to the seed above)
indices = np.arange(len(train_dataset))
np.random.shuffle(indices)
split_size = len(indices) // num_clients

# Give each client a disjoint, contiguous slice of the shuffled indices; the
# last client also receives any remainder.
client_loaders = []
for i in range(num_clients):
    start = i * split_size
    end = (i+1) * split_size if i < num_clients - 1 else len(indices)
    subset_idx = indices[start:end]
    client_subset = Subset(train_dataset, subset_idx)
    client_loader = DataLoader(client_subset, batch_size=batch_size, shuffle=True)
    client_loaders.append(client_loader)

# Global test loader
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class SimpleCNN(nn.Module):
    """Small CNN: two 3x3 conv + 2x2 max-pool stages, then two linear layers.

    The first linear layer expects 64 * 7 * 7 features, which is what the two
    pooling stages produce for single-channel 28 x 28 inputs. The output is
    one raw score (logit) per class; no softmax is applied.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size; only the
        # pooling layer shrinks it.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        # conv -> ReLU -> pool, twice: (N, 32, 14, 14) then (N, 64, 7, 7)
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # collapse everything except the batch dimension
        features = x.view(x.shape[0], -1)
        hidden = F.relu(self.fc1(features))
        return self.fc2(hidden)
def scale_params(model, fraction):
    """Multiply every parameter of `model` by `fraction`.

    The model is modified directly (each parameter's `.data` is rebound to
    the scaled tensor) and nothing is returned. Only tensors yielded by
    `model.parameters()` are touched.
    """
    for weight in model.parameters():
        weight.data = torch.mul(weight.data, fraction)
def sum_weights(model_list):
    """
    Return a new model whose parameters are the element-wise sum of the
    corresponding parameters of every model in model_list.

    The input models are left untouched; the result starts as a deep copy of
    model_list[0], so model_list must contain at least one model (an empty
    list raises IndexError). Only tensors yielded by `.parameters()` are
    summed; anything else in the copy keeps the first model's values.
    """
    # Deep copy the first model so the inputs are never modified
    sum_model = copy.deepcopy(model_list[0])

    # In-place updates on parameters must not be tracked by autograd
    with torch.no_grad():
        # Start the accumulator from zero
        for sum_param in sum_model.parameters():
            sum_param.zero_()

        # Add each model's parameters onto the accumulator
        for model in model_list:
            for sum_param, model_param in zip(sum_model.parameters(), model.parameters()):
                sum_param.add_(model_param)

    # The accumulator must be handed back to the caller; a bare `return`
    # here would yield None and break the averaging step that follows.
    return sum_model
def local_train(model, dataloader, epochs, lr=0.01):
    """Train `model` with plain SGD and cross-entropy loss.

    Parameters
    ----------
    model : nn.Module to train; it is updated in place.
    dataloader : iterable yielding (data, target) batches.
    epochs : number of full passes over `dataloader`.
    lr : SGD learning rate.

    Returns
    -------
    The same model, left in training mode and moved back to the CPU.
    """
    # Train on the GPU when one is available, otherwise on the CPU
    target_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(target_device)

    loss_fn = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=lr)

    model.train()
    for _epoch in range(epochs):
        for inputs, labels in dataloader:
            inputs = inputs.to(target_device)
            labels = labels.to(target_device)
            sgd.zero_grad()
            loss_fn(model(inputs), labels).backward()
            sgd.step()

    # Hand the weights back on the CPU so callers can combine models directly
    model.cpu()
    return model

def evaluate(model, dataloader):
    """Return the accuracy of `model` over `dataloader` as a percentage.

    The model is switched to eval mode, run without gradient tracking on the
    GPU when available (CPU otherwise), and moved back to the CPU before
    returning. The predicted class of a sample is the index of its largest
    output score. `dataloader` must yield (data, target) batches and contain
    at least one sample.
    """
    target_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(target_device)
    model.eval()

    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(target_device)
            labels = labels.to(target_device)
            predicted = model(inputs).argmax(dim=1)
            n_seen += labels.shape[0]
            n_correct += int((predicted == labels).sum())

    model.cpu()
    return 100.0 * n_correct / n_seen



In [None]:
# Averaging weight alpha: every client contributes equally
fraction = 1.0 / num_clients

global_model = SimpleCNN(num_classes=10)

for round_idx in range(num_rounds):
    # Each client trains its own copy of the current global model on its
    # local data; the global model itself is not modified by local training.
    local_models = [
        local_train(
            copy.deepcopy(global_model),
            client_loaders[client_idx],
            num_epochs_local,
            lr,
        )
        for client_idx in range(num_clients)
    ]

    # Aggregate: sum the clients' weights, then scale the sum by alpha
    aggregated_model = sum_weights(local_models)
    scale_params(aggregated_model, fraction)

    # The scaled sum becomes the global model for the next round
    global_model = aggregated_model

    # Report accuracy on the shared test set after every round
    acc = evaluate(global_model, test_loader)
    print(f"Round {round_idx+1}/{num_rounds}, Global Model Accuracy: {acc:.2f}%")