In [None]:
!pip install scikit-learn numpy pandas matplotlib seaborn

In [None]:
import torch

import torch.nn as nn

import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

import numpy as np

import pandas as pd

import re

import matplotlib.pyplot as plt



# Convolutional text classifier: parallel filter widths + max-over-time pooling
class CNN_Text(nn.Module):
    """Embed token ids, convolve with several kernel heights, and classify.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of every embedding vector.
        num_classes: Number of output logits.
        kernel_sizes: Sequence of kernel heights (tokens covered by a filter).
        num_filters: Output channels of each convolution.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(self, vocab_size, embed_size, num_classes, kernel_sizes, num_filters, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # One convolution per kernel height; each kernel spans the full
        # embedding width, so the last output dimension collapses to 1.
        branches = []
        for height in kernel_sizes:
            branches.append(nn.Conv2d(1, num_filters, (height, embed_size)))
        self.convs = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences ``x``."""
        # [batch_size, seq_len, embed_size] -> [batch_size, 1, seq_len, embed_size]
        embedded = self.embedding(x).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            feature_map = torch.relu(conv(embedded)).squeeze(3)
            # Max over the sequence dimension keeps the strongest response per filter
            pooled.append(feature_map.max(dim=2).values)

        features = torch.cat(pooled, dim=1)  # Concatenate all kernel sizes
        return self.fc(self.dropout(features))





# Load the original emotion dataset from the current working directory.
# Later cells read the 'Text' and 'Emotion' columns of this frame.
emotion_data = pd.read_csv('Emotion_final.csv')



# Clean the text in the emotion data
def clean_text(text):
    """Normalize a raw text cell to lowercase letters separated by single spaces.

    Args:
        text: The raw cell value. Non-string values are converted with ``str``.
            ``None`` and float NaN (what pandas produces for empty CSV cells)
            are treated as empty text instead of becoming the words
            "none" / "nan".

    Returns:
        The cleaned string; it may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Drop everything except letters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs and trim the ends
    return text



emotion_data['Text'] = emotion_data['Text'].apply(clean_text)

# Load the synthetic dataset and clean its text the same way
synthetic_data = pd.read_csv('/kaggle/input/emotional-dataset-v2/synthetic_dataset.csv')
synthetic_data['Text'] = synthetic_data['Text'].apply(clean_text)

# Concatenate the original and synthetic datasets
combined_data = pd.concat([emotion_data, synthetic_data], ignore_index=True)

# Shuffle the combined dataset (fixed seed for reproducibility)
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode the emotion labels as integer class ids
label_encoder = LabelEncoder()
combined_data['Emotion'] = label_encoder.fit_transform(combined_data['Emotion'])

# Tokenization: whitespace tokens -> integer ids, with id 0 reserved for <PAD>.
vocab = set(" ".join(combined_data['Text']).split())  # Unique words in the dataset

# Assign ids in sorted order. Iterating the raw set follows string-hash order,
# which changes between interpreter sessions, so the same word would get a
# different id (and a different embedding row) on every run.
word2idx = {word: idx + 1 for idx, word in enumerate(sorted(vocab))}
word2idx['<PAD>'] = 0  # Padding token



def text_to_tensor(text, word2idx, max_len=128):
    """Convert whitespace-tokenized text into a fixed-length tensor of token ids.

    Args:
        text: String that is split on whitespace.
        word2idx: Mapping from token to integer id. Tokens missing from the
            mapping get id 0, the same id that is used for padding.
        max_len: Output length. Shorter inputs are right-padded with 0 and
            longer inputs are truncated.

    Returns:
        A 1-D ``torch.long`` tensor of token ids.
    """
    token_indices = [word2idx.get(token, 0) for token in text.split()[:max_len]]
    # A non-positive repeat count yields an empty list, so this is a no-op
    # when the text already fills max_len.
    token_indices.extend([0] * (max_len - len(token_indices)))
    # Explicit dtype: embedding lookups need integer ids, and an empty list
    # would otherwise be inferred as float32.
    return torch.tensor(token_indices, dtype=torch.long)



# Dataset wrapper that tokenizes lazily, one sample per __getitem__ call
class TextDataset(Dataset):
    """Pair raw texts with integer labels and yield fixed-length id tensors.

    Args:
        texts: Object exposing ``tolist()`` that yields the text strings.
        labels: Object exposing ``tolist()`` with one label per text.
        word2idx: Token -> id mapping forwarded to ``text_to_tensor``.
        max_len: Sequence length forwarded to ``text_to_tensor``.
    """

    def __init__(self, texts, labels, word2idx, max_len=128):
        # Plain lists so that samples are addressed by position
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'text': id tensor, 'label': scalar long tensor}`` for item ``idx``."""
        label_id = int(self.labels[idx])  # Ensure a plain Python int
        encoded = text_to_tensor(self.texts[idx], self.word2idx, self.max_len)
        target = torch.tensor(label_id, dtype=torch.long)
        return {'text': encoded, 'label': target}





# Hyperparameters
num_clients = 3  # Number of simulated federated clients (one local model each)
epochs = 100  # Local training epochs per client
learning_rate = 1e-3  # Learning rate passed to the Adam optimizer
# epsilon = 1.0  (disabled; presumably a differential-privacy budget — TODO confirm)
subset_size = 500  # Maximum number of rows kept per client
batch_size = 16  # Mini-batch size used by every DataLoader
embed_size = 100  # A fixed embedding size
kernel_sizes = [3, 4, 5]  # Fixed kernel sizes for the CNN
num_filters = 128  # Number of filters for each kernel size



# Federated Learning - Split data for each client
# Each client draws its own 20% sample, capped at subset_size rows. The seed is
# offset by the client index: reusing one seed for every client hands all
# clients exactly the same rows, so the "federation" would repeat a single
# experiment num_clients times. The samples are drawn independently, so two
# clients may still share some rows.
clients_data = []
for client_idx in range(num_clients):
    client_data = (
        combined_data
        .sample(frac=0.2, random_state=42 + client_idx)
        .reset_index(drop=True)
        .head(subset_size)
    )
    clients_data.append(client_data)



# Train each client locally
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = []
train_loss_history = [[] for _ in range(num_clients)]
train_acc_history = [[] for _ in range(num_clients)]

# The loss module holds no state, so build it once instead of once per batch.
criterion = nn.CrossEntropyLoss()

# Every client starts from the same initial weights. Element-wise parameter
# averaging is only meaningful between models that share an initialization;
# independently initialized networks can encode the same function with their
# filters in a different order, and averaging those mixes unrelated weights.
model_kwargs = dict(
    vocab_size=len(word2idx),
    embed_size=embed_size,
    num_classes=len(label_encoder.classes_),
    kernel_sizes=kernel_sizes,
    num_filters=num_filters,
)
initial_state = {
    name: tensor.detach().clone()
    for name, tensor in CNN_Text(**model_kwargs).state_dict().items()
}

for i in range(num_clients):
    client_data = clients_data[i]
    # The held-out 20% of the client's rows is not used during local training.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        client_data['Text'], client_data['Emotion'], test_size=0.2, random_state=42
    )
    assert len(train_texts) > 0 and len(test_texts) > 0, "Train-test split resulted in empty sets"

    train_dataset = TextDataset(train_texts, train_labels, word2idx)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = CNN_Text(**model_kwargs).to(device)
    model.load_state_dict(initial_state)  # Copies the shared weights onto the model's device
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        for batch in train_loader:
            texts = batch['text'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Loss is averaged over batches, accuracy over samples
        avg_loss = total_loss / len(train_loader)
        accuracy = correct / total
        train_loss_history[i].append(avg_loss)
        train_acc_history[i].append(accuracy)
        print(f"Client {i}, Epoch {epoch + 1}, Loss: {avg_loss}, Accuracy: {accuracy}")

    models.append(model)



# Aggregation Step
# The global weights are the unweighted mean of the clients' parameters.
client_states = [client_model.state_dict() for client_model in models]
global_state_dict = {
    name: sum(state[name] for state in client_states) / num_clients
    for name in client_states[0]
}

# Build a fresh model with the same architecture and load the averaged weights
global_model = CNN_Text(
    vocab_size=len(word2idx),
    embed_size=embed_size,
    num_classes=len(label_encoder.classes_),
    kernel_sizes=kernel_sizes,
    num_filters=num_filters,
).to(device)
global_model.load_state_dict(global_state_dict)





# Split the combined data into train and test datasets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_data['Text'], combined_data['Emotion'], test_size=0.2, random_state=42
)

# The clients were sampled from the same combined_data, so a plain split can
# place texts the local models were trained on into the test set. Drop every
# text that appears in any client's data so the metrics reflect unseen samples.
client_texts = set()
for client_frame in clients_data:
    client_texts.update(client_frame['Text'])
unseen_mask = ~test_texts.isin(client_texts)
test_texts, test_labels = test_texts[unseen_mask], test_labels[unseen_mask]
if len(test_texts) == 0:
    raise ValueError("No test samples left after removing texts seen by the clients")

# Create a DataLoader for the test set
test_dataset = TextDataset(test_texts, test_labels, word2idx)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluate the global CNN model on the test set
global_model.eval()  # Disable dropout for evaluation

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        texts = batch['text'].to(device)
        labels = batch['label']  # Only collected, so it can stay on the CPU
        outputs = global_model(texts)

        # Predicted class = index of the largest logit
        _, predicted = torch.max(outputs, dim=1)

        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.numpy())

# Calculate the metrics. zero_division=0 scores a class that was never
# predicted as 0 (the value scikit-learn falls back to) without a warning.
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)
cm = confusion_matrix(true_labels, predictions)

# Print the results
print(f"Forecast Test Accuracy: {accuracy:.4f}")
print(f"Forecast Test Precision: {precision:.4f}")
print(f"Forecast Test Recall: {recall:.4f}")
print(f"Forecast Test F1 Score: {f1:.4f}")
print("Forecast Confusion Matrix:")
print(cm)




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Render the confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)

ax.set_title('Confusion Matrix Heatmap')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Per-client training curves: loss on the top row, accuracy on the bottom row
fig, axes = plt.subplots(2, num_clients, figsize=(12, 6), squeeze=False)
epoch_axis = range(epochs)

for i in range(num_clients):
    loss_ax = axes[0, i]
    loss_ax.plot(epoch_axis, train_loss_history[i], label=f'Client {i} Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title(f'Client {i} Training Loss')
    loss_ax.legend()

    acc_ax = axes[1, i]
    acc_ax.plot(epoch_axis, train_acc_history[i], label=f'Client {i} Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title(f'Client {i} Training Accuracy')
    acc_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

import numpy as np

import torch



def membership_inference_attack(global_model, original_data, synthetic_data, word2idx, max_len=128, device='cuda'):
    """Run a confidence-based membership inference attack and print its AUC.

    Rows of ``original_data`` are labelled as members (1) and rows of
    ``synthetic_data`` as non-members (0). The attack score of a text is the
    model's highest softmax probability for it.

    NOTE(review): the AUC is only a membership signal if ``synthetic_data``
    was not part of the model's training data — confirm for the caller's setup.

    Args:
        global_model: Classifier called on one ``[1, max_len]`` id tensor at a
            time. It is switched to eval mode; this function does not move it,
            so it has to be on ``device`` already.
        original_data: Frame whose 'Text' column holds the member texts.
        synthetic_data: Frame whose 'Text' column holds the non-member texts.
        word2idx: Token -> id mapping passed to ``text_to_tensor``.
        max_len: Sequence length passed to ``text_to_tensor``.
        device: Device the input tensors are moved to.

    Returns:
        None. The AUC is printed.
    """
    global_model.eval()  # Disable dropout so the confidences are deterministic

    def get_max_prob(texts):
        """Return the top softmax probability for every text in ``texts``."""
        max_probs = []
        with torch.no_grad():
            for text in texts:
                # Encode the text and add a batch dimension
                text_tensor = text_to_tensor(text, word2idx, max_len).to(device).unsqueeze(0)
                probs = torch.softmax(global_model(text_tensor), dim=1)
                max_probs.append(probs.max().item())
        return np.array(max_probs)

    # Only the 'Text' column is needed; the emotion labels play no role in the
    # attack, so the frames are not required to carry an 'Emotion' column.
    original_max_probs = get_max_prob(original_data['Text'])
    synthetic_max_probs = get_max_prob(synthetic_data['Text'])

    # Membership labels: 1 for original rows, 0 for synthetic rows
    labels = np.concatenate([np.ones(len(original_max_probs)), np.zeros(len(synthetic_max_probs))])
    scores = np.concatenate([original_max_probs, synthetic_max_probs])

    # AUC of "confidence separates members from non-members"
    auc = roc_auc_score(labels, scores)
    print(f"Membership Inference Attack AUC on the Global CNN model: {auc:.4f}")





# Perform the membership inference attack on the Global CNN model.
# emotion_data rows are scored as members and synthetic_data rows as non-members.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
membership_inference_attack(global_model, emotion_data, synthetic_data, word2idx, max_len=128, device=device)


In [None]:
import torch
import numpy as np
from sklearn.metrics import roc_auc_score

def membership_inference_attack_local_cnn(local_models, clients_data, synthetic_data, word_to_index, max_len=128, device='cuda'):
    """Print a confidence-based membership inference AUC for every local model.

    For each (model, client frame) pair, the client's rows are labelled as
    members (1) and the rows of ``synthetic_data`` as non-members (0). The
    score of a text is the model's highest softmax probability for it.

    Args:
        local_models: Sequence of classifiers, paired by position with ``clients_data``.
        clients_data: Sequence of frames with a 'Text' column.
        synthetic_data: Frame with a 'Text' column used as the non-member set.
        word_to_index: Token -> id mapping; unknown tokens map to 0.
        max_len: Length every id sequence is padded or truncated to.
        device: Device the input tensors are moved to.

    Returns:
        None. Progress and AUC values are printed.
    """

    def get_max_prob(model, data, word_to_index, max_len, device):
        """Return the top softmax probability of ``model`` for each row of data['Text']."""
        model.eval()  # Set the model to evaluation mode
        confidences = []

        for text in data['Text']:
            # Truncate to max_len, then right-pad with 0 up to max_len
            ids = [word_to_index.get(word, 0) for word in text.split()][:max_len]
            ids = ids + [0] * (max_len - len(ids))
            batch = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

            with torch.no_grad():
                top_prob = torch.softmax(model(batch), dim=1).max().item()
            confidences.append(top_prob)

        return np.array(confidences)

    # Attack each client's local model in turn
    for i, (model, client_data) in enumerate(zip(local_models, clients_data)):
        print(f"Evaluating Membership Inference Attack on Client {i}'s Local CNN Model...")

        member_scores = get_max_prob(model, client_data, word_to_index, max_len, device)
        non_member_scores = get_max_prob(model, synthetic_data, word_to_index, max_len, device)

        # Membership labels: 1 for the client's rows, 0 for synthetic rows
        labels = np.concatenate([np.ones(len(member_scores)), np.zeros(len(non_member_scores))])
        scores = np.concatenate([member_scores, non_member_scores])

        auc = roc_auc_score(labels, scores)
        print(f"Client {i} - Membership Inference Attack AUC: {auc:.4f}")

# Perform the Membership Inference Attack for each local CNN model.
# Each model is paired by position with its own client's data frame.
membership_inference_attack_local_cnn(models, clients_data, synthetic_data, word2idx, max_len=128, device=device)


In [None]:
from sklearn.preprocessing import label_binarize  # Import label_binarize
from sklearn.metrics import roc_auc_score

# Adjusted generate_linkage_data function
def generate_linkage_data(models, clients_data, word2idx, device):
    """Collect each local model's softmax outputs on its own client's data.

    Args:
        models: Sequence of classifiers; ``models[i]`` is run on ``clients_data[i]``.
        clients_data: Sequence of frames with 'Text' and 'Emotion' columns.
        word2idx: Token -> id mapping passed to ``TextDataset``.
        device: Device the input batches are moved to.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays: one row of class
        probabilities per sample, and the index of the client each sample
        came from. Batches are sized by the module-level ``batch_size``.
    """
    all_predictions = []
    all_labels = []

    for i, client_data in enumerate(clients_data):
        model = models[i]
        # Inference has to run with dropout disabled; a model that was last put
        # in train() mode would otherwise return randomly perturbed probabilities.
        model.eval()

        dataset = TextDataset(client_data['Text'], client_data['Emotion'], word2idx)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad():
            for batch in data_loader:
                texts = batch['text'].to(device)
                probabilities = torch.softmax(model(texts), dim=1)

                all_predictions.extend(probabilities.cpu().numpy())
                all_labels.extend([i] * len(probabilities))  # Client index is the linkage label

    return np.array(all_predictions), np.array(all_labels)


# Build the linkage-attack data: per-sample class probabilities plus source-client ids
all_predictions, all_labels = generate_linkage_data(models, clients_data, word2idx, device)

# One indicator column per client for the one-vs-rest AUC computation
client_ids = list(range(num_clients))
all_labels_one_hot = label_binarize(all_labels, classes=client_ids)

# NOTE(review): column i of all_predictions is the probability of output class i,
# and it is used below as the score for "sample belongs to client i" — confirm
# that pairing the class index with the client index is intended.
auc_scores = []
for i in client_ids:
    is_client = all_labels_one_hot[:, i]
    client_score = all_predictions[:, i]
    auc = roc_auc_score(is_client, client_score)
    auc_scores.append(auc)
    print(f"AUC for Client {i}: {auc:.4f}")

# Macro-average: plain mean of the per-client AUC values
macro_auc = np.mean(auc_scores)
print(f"Macro-Average AUC: {macro_auc:.4f}")
