In [None]:
# !pip install faker

In [None]:
import random
import pandas as pd

# Sentence templates for each emotion. Every template has exactly one "{}"
# slot and is written to be completed by the keyword at the SAME index in
# `keywords[emotion]` (e.g. "I can't believe {}." + "it's over").
templates = {
    'sadness': [
        "I can't believe {}.",
        "This is the worst {}.",
        "I feel so {}.",
        "Nothing seems to {}.",
        "I'm heartbroken {}."
    ],
    'anger': [
        "I can't stand {}!",
        "This is absolutely {}!",
        "I'm furious about {}.",
        "Why does {} always happen?",
        "I'm so mad I could {}!"
    ],
    'love': [
        "You mean {} to me.",
        "I cherish every {} with you.",
        "My heart is full of {} for you.",
        "I can't imagine {} without you.",
        "You make everything {}."
    ],
    'surprise': [
        "I didn't see {} coming!",
        "Wow, what a {}!",
        "I'm completely {}!",
        "This is such an {} turn of events.",
        "I never would have {}!"
    ],
    'fear': [
        "I'm really scared {}.",
        "What if {} happens?",
        "I can't shake this feeling of {}.",
        "This situation makes me very {}.",
        "I'm terrified of {}."
    ],
    'happy': [
        "This is the best {} ever!",
        "I feel so {} and content.",
        "I'm incredibly {} right now.",
        "Everything is {}!",
        "I can't stop {}!"
    ]
}

# Slot fillers for each emotion, index-aligned with `templates[emotion]`.
keywords = {
    'sadness': ['it\'s over', 'day of my life', 'alone', 'go right', 'beyond words'],
    'anger': ['this', 'unacceptable', 'what happened', 'this', 'scream'],
    'love': ['the world', 'moment', 'love', 'life', 'better'],
    'surprise': ['that', 'surprise', 'stunned', 'unexpected', 'guessed'],
    'fear': ['right now', 'something bad', 'dread', 'uneasy', 'what\'s to come'],
    'happy': ['day', 'joyful', 'happy', 'perfect', 'smiling']
}


def generate_sentence(emotion, rng=None):
    """Return one synthetic sentence expressing ``emotion``.

    A template and the keyword written for it (same list index) are drawn
    together. Drawing them independently pairs templates with fillers they
    were not written for and yields ungrammatical text such as
    "I feel so day of my life.".

    Args:
        emotion: A key of ``templates`` / ``keywords``.
        rng: Optional ``random.Random``-like object providing ``choice``.
            Defaults to the module-level ``random`` generator.

    Returns:
        The formatted sentence.

    Raises:
        KeyError: If ``emotion`` is not a known emotion.
    """
    rng = random if rng is None else rng
    # zip() keeps template/keyword pairs together and tolerates lists of
    # unequal length by ignoring the unmatched tail.
    template, keyword = rng.choice(list(zip(templates[emotion], keywords[emotion])))
    return template.format(keyword)


def generate_rule_based_synthetic_data(num_samples, seed=None):
    """Build a DataFrame of ``num_samples`` synthetic (text, emotion) rows.

    Args:
        num_samples: Number of rows to generate; 0 yields an empty frame.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global generator is
            left untouched. When ``None`` the global ``random`` state is used.

    Returns:
        A ``pandas.DataFrame`` with columns ``'Text'`` and ``'Emotion'``.
    """
    rng = random if seed is None else random.Random(seed)
    emotions = list(templates)  # built once instead of on every iteration

    rows = []
    for _ in range(num_samples):
        emotion = rng.choice(emotions)
        rows.append([generate_sentence(emotion, rng), emotion])

    return pd.DataFrame(rows, columns=['Text', 'Emotion'])

# Build the rule-based synthetic corpus. The row count (21459) is intended to
# match the number of entries in the original dataset.
rule_based_synthetic_dataset = generate_rule_based_synthetic_data(num_samples=21459)

# Persist the corpus without the index column so a later cell can read it back.
rule_based_synthetic_dataset.to_csv(path_or_buf='synthetic_dataset.csv', index=False)

# Preview the first five rows (shown as the notebook cell's output).
rule_based_synthetic_dataset.head(n=5)


In [None]:
# Install necessary libraries
!pip install torch pandas scikit-learn torchtext matplotlib

!pip install syft

In [None]:

import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

# Load the original labelled corpus from Emotion_final.csv. The rest of the
# notebook reads its 'Text' and 'Emotion' columns.
emotion_data = pd.read_csv('Emotion_final.csv')

# Clean the text in the emotion data
def clean_text(text):
    """Normalise one raw text value before tokenisation.

    The value is lower-cased, every character that is not an ASCII letter or
    whitespace is removed (digits, punctuation and apostrophes included, so
    "can't" becomes "cant"), and runs of whitespace are collapsed to a single
    space with the ends trimmed.

    Args:
        text: The raw value; non-string values are converted with ``str()``.

    Returns:
        The cleaned string. Missing values (``None`` or float NaN) yield ``''``
        rather than the literal words "none" / "nan" that ``str()`` would
        otherwise inject into the corpus.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

# Normalise the original texts in place (lower-case, letters and spaces only).
emotion_data['Text'] = emotion_data['Text'].apply(clean_text)

# Load the synthetic dataset from synthetic_dataset.csv
synthetic_data = pd.read_csv('synthetic_dataset.csv')

# Clean the text in the synthetic data
synthetic_data['Text'] = synthetic_data['Text'].apply(clean_text)

# Concatenate the original and synthetic datasets
combined_data = pd.concat([emotion_data, synthetic_data], ignore_index=True)

# Shuffle the combined dataset (fixed seed, so the order is reproducible)
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Hyperparameters
num_clients = 3  # number of simulated federated clients
epochs = 20  # local training epochs per client
learning_rate = 1e-5  # optimizer learning rate for every client
epsilon = 1.0  # only read by the "overall epsilon" printout; no noise mechanism is visible in this file
subset_size = 500  # maximum number of rows kept per client
batch_size = 16  # batch size for every DataLoader in this notebook

# Federated Learning - Split data for each client

# Fit the label encoder ONCE on every label present in the combined data.
# Re-fitting it on each client's subset would make the integer mapping (and
# `label_encoder.classes_`, which sizes the classifier head) depend on which
# emotions happen to appear in the last client's rows.
label_encoder = LabelEncoder()
label_encoder.fit(combined_data['Emotion'])

clients_data = []
for client_idx in range(num_clients):
    # A distinct seed per client: sampling with the same random_state for
    # every client would hand all clients exactly the same rows. Client 0
    # keeps seed 42.
    client_data = (
        combined_data.sample(frac=0.2, random_state=42 + client_idx)
        .reset_index(drop=True)
        .head(subset_size)
        .copy()  # own the data before assigning a column to it
    )
    # Encode the string labels as integers using the shared mapping.
    client_data['Emotion'] = label_encoder.transform(client_data['Emotion'])
    clients_data.append(client_data)

# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint the classifiers in this notebook are initialised from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    ``texts`` and ``labels`` must expose ``.tolist()`` (for example pandas
    Series or NumPy arrays). Each item is a dict with the flattened
    ``input_ids`` and ``attention_mask`` produced by ``tokenizer`` (padded or
    truncated to ``max_length``) plus the label as a ``torch.long`` scalar.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = self.texts[idx]
        # Labels may arrive as NumPy integers; coerce to a plain int.
        target = int(self.labels[idx])
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis of size 1; flatten it away.
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Train each client locally: every client fine-tunes its own copy of
# 'bert-base-uncased' on its own subset, and the trained models are collected
# in `models` for later aggregation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = []
# One gradient scaler is shared by all clients' mixed-precision training.
scaler = GradScaler()
# Per-client lists holding one average loss / accuracy value per epoch.
train_loss_history = [[] for _ in range(num_clients)]
train_acc_history = [[] for _ in range(num_clients)]

for i in range(num_clients):
    client_data = clients_data[i]
    # 80/20 split of this client's rows. Only the training part is used in
    # this loop; test_texts / test_labels are not read again in this cell.
    train_texts, test_texts, train_labels, test_labels = train_test_split(client_data['Text'], client_data['Emotion'], test_size=0.2, random_state=42)
    assert len(train_texts) > 0 and len(test_texts) > 0, "Train-test split resulted in empty sets"

    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Fresh pretrained model per client; the classifier head is sized from the
    # classes currently known to `label_encoder`.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_)).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        # Running totals for this epoch's average loss and training accuracy.
        total_loss = 0
        correct = 0
        total = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # Forward pass under autocast; passing `labels` makes the model
            # return the loss alongside the logits.
            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

            # Scaled backward pass and optimizer step via the gradient scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()

            # Training accuracy is measured on the same batches being trained on.
            _, predicted = torch.max(logits, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Loss is averaged over batches, accuracy over samples.
        avg_loss = total_loss / len(train_loader)
        accuracy = correct / total
        train_loss_history[i].append(avg_loss)
        train_acc_history[i].append(accuracy)
        print(f"Client {i}, Epoch {epoch + 1}, Loss: {avg_loss}, Accuracy: {accuracy}")

    # NOTE(review): the model is stored as left by the last `model.train()`
    # call; nothing here switches it to eval mode before it is reused.
    models.append(model)

# Aggregation step: build the global weights as the unweighted element-wise
# mean of the clients' weights, one state-dict entry at a time.
client_states = [client_model.state_dict() for client_model in models]
global_state_dict = {
    name: sum(state[name] for state in client_states) / num_clients
    for name in client_states[0].keys()
}

# Instantiate a model with the same architecture and overwrite its weights
# with the averaged ones.
global_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
).to(device)
global_model.load_state_dict(global_state_dict)

# Evaluate the global model on the full original and synthetic datasets and
# report one pooled accuracy.
# NOTE(review): these are the complete datasets the client subsets were drawn
# from, not held-out data, so "test accuracy" includes rows seen in training.
original_test_texts = emotion_data['Text']
original_test_labels = label_encoder.transform(emotion_data['Emotion'])
original_test_dataset = TextDataset(original_test_texts, original_test_labels, tokenizer)
original_test_loader = DataLoader(original_test_dataset, batch_size=batch_size)

synthetic_test_texts = synthetic_data['Text']
synthetic_test_labels = label_encoder.transform(synthetic_data['Emotion'])
synthetic_test_dataset = TextDataset(synthetic_test_texts, synthetic_test_labels, tokenizer)
synthetic_test_loader = DataLoader(synthetic_test_dataset, batch_size=batch_size)

# Make sure dropout is disabled while scoring.
global_model.eval()

correct = 0
total = 0
with torch.no_grad():
    # Walk the loaders one after the other. Pairing them with zip() stops at
    # the shorter loader and silently skips the rest of the longer dataset.
    for test_loader in (original_test_loader, synthetic_test_loader):
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = global_model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

overall_test_accuracy = correct / total
print(f"Overall Test Accuracy on both original and synthetic test sets: {overall_test_accuracy:.4f}")

# Plot per-client training curves on a 2 x num_clients grid:
# top row = loss, bottom row = accuracy, one column per client.
plt.figure(figsize=(12, 6))
epoch_axis = range(epochs)
for i in range(num_clients):
    for row, (history, metric) in enumerate(
        ((train_loss_history, 'Loss'), (train_acc_history, 'Accuracy'))
    ):
        # Subplot indices are 1-based and run row by row.
        plt.subplot(2, num_clients, row * num_clients + i + 1)
        plt.plot(epoch_axis, history[i], label=f'Client {i} {metric}')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.title(f'Client {i} Training {metric}')
        plt.legend()

plt.tight_layout()
plt.show()

# Calculate overall epsilon value
# NOTE(review): this splits `epsilon` evenly across the clients and then sums
# the shares again, so the printed number is just the `epsilon` hyperparameter
# (up to float rounding). No noise addition or gradient clipping is visible in
# the training code of this file, so the value should not be read as a
# measured differential-privacy guarantee. Confirm the intent.
epsilon_values = [epsilon / num_clients] * num_clients
overall_epsilon = sum(epsilon_values)
print(f"Overall epsilon value: {overall_epsilon}")

# Forecast emotions for the test dataset (simulating future dataset)
def forecast(model, tokenizer, dataset, device):
    """Predict a class index for every item of ``dataset``.

    The model is switched to eval mode and the dataset is read in order, in
    batches of the module-level ``batch_size``, with gradients disabled.

    Args:
        model: Classifier whose output exposes ``.logits``.
        tokenizer: Not used by this function; kept for the existing call signature.
        dataset: Dataset yielding dicts with ``'input_ids'`` and ``'attention_mask'``.
        device: Device the input tensors are moved to.

    Returns:
        A NumPy array of predicted class indices, in dataset order.
    """
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)
    predicted_indices = []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            # Index of the highest logit in each row.
            best = torch.max(logits, dim=1).indices
            predicted_indices.extend(best.cpu().numpy())
    return np.array(predicted_indices)

# Simulated "future" dataset: a fixed-seed 10% sample of the combined data.
# The sample is drawn once and both columns are read from it.
simulated_future_sample = combined_data.sample(frac=0.1, random_state=42)
simulated_future_texts = simulated_future_sample['Text']
simulated_future_labels = label_encoder.transform(simulated_future_sample['Emotion'])
simulated_future_dataset = TextDataset(simulated_future_texts, simulated_future_labels, tokenizer)

# Predict with the aggregated model and map class indices back to emotion names.
predictions = forecast(global_model, tokenizer, simulated_future_dataset, device)
predicted_emotions = label_encoder.inverse_transform(predictions)
print("Predicted Emotions for Simulated Test Dataset:", predicted_emotions)

# Score the predictions against the known labels of the sampled rows.
test_true_emotions = label_encoder.inverse_transform(simulated_future_labels)
accuracy = (predicted_emotions == test_true_emotions).mean()
weighted = {'average': 'weighted'}  # weight per-class scores by class support
precision = precision_score(test_true_emotions, predicted_emotions, **weighted)
recall = recall_score(test_true_emotions, predicted_emotions, **weighted)
f1 = f1_score(test_true_emotions, predicted_emotions, **weighted)
cm = confusion_matrix(test_true_emotions, predicted_emotions)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"Forecast Test {metric_name}: {metric_value:.4f}")
print("Forecast Confusion Matrix:")
print(cm)

In [None]:
import pandas as pd

# Paths to the datasets
original_data_path = 'Emotion_final.csv'
# Disabled: the "obfuscated" data is taken from the in-memory combined_data
# below rather than re-read from the synthetic CSV.
# obfuscated_data_path = 'synthetic_dataset.csv'

# Load the datasets into pandas DataFrames
# original_data is re-read from disk, so its text is the raw, uncleaned version.
original_data = pd.read_csv(original_data_path)
# obfuscated_data is another name for combined_data, not a copy.
obfuscated_data = combined_data

In [None]:
##Membership Inference Attack is performed on the global model

from sklearn.metrics import roc_auc_score
import numpy as np

def membership_inference_attack(global_model, original_data, synthetic_data, tokenizer, device):
    """Confidence-based membership inference against ``global_model``.

    Each text is scored by the model's highest softmax probability. Rows of
    ``original_data`` are treated as members (label 1) and rows of
    ``synthetic_data`` as non-members (label 0); the ROC AUC of that score
    against the membership label is printed and returned. A value near 0.5
    means the confidence does not separate the two sets.

    Args:
        global_model: Classifier whose output exposes ``.logits``.
        original_data: Frame with a ``'Text'`` column, scored as members.
        synthetic_data: Frame with a ``'Text'`` column, scored as non-members.
        tokenizer: Callable turning one text into model inputs.
        device: Device the input tensors are moved to.

    Returns:
        The attack's ROC AUC as a float.
    """
    global_model.eval()

    def get_max_prob(texts):
        # Only the texts are needed: the attack score never looks at the true
        # label, so no 'Emotion' column is required.
        max_probs = []
        for text in texts:
            encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
            with torch.no_grad():
                outputs = global_model(**{k: v.to(device) for k, v in encoding.items()})
                probs = torch.softmax(outputs.logits, dim=1)
                max_probs.append(probs.max().item())
        return np.array(max_probs)

    original_max_probs = get_max_prob(original_data['Text'])
    synthetic_max_probs = get_max_prob(synthetic_data['Text'])

    # 1 = member (original), 0 = non-member (synthetic), in the same order as the scores.
    labels = np.concatenate([np.ones(len(original_max_probs)), np.zeros(len(synthetic_max_probs))])
    scores = np.concatenate([original_max_probs, synthetic_max_probs])

    auc = roc_auc_score(labels, scores)
    print(f"Membership Inference Attack AUC: {auc:.4f}")
    # Also hand the metric back so callers can store or compare it.
    return auc

# Example usage: original rows are scored as "members", synthetic rows as "non-members".
# NOTE(review): both emotion_data and synthetic_data were concatenated into
# combined_data, from which the client training subsets were sampled, so
# neither argument is a clean member or non-member set. Interpret the AUC
# with that in mind.
membership_inference_attack(global_model, emotion_data, synthetic_data, tokenizer, device)



In [None]:
##Membership Inference Attack is performed on the Local model scenario

from sklearn.metrics import roc_auc_score
import numpy as np

def membership_inference_attack(global_model, member_data, non_member_data, tokenizer, device):
    """Confidence-based membership inference: members vs. non-members.

    Every text in the ``'Text'`` column of both frames is scored with the
    model's highest softmax probability. The ROC AUC of that score against the
    membership label (1 for ``member_data``, 0 for ``non_member_data``) is
    printed.
    """
    global_model.eval()

    def top_class_confidence(texts):
        # One forward pass per text; collect the largest class probability.
        confidences = []
        with torch.no_grad():
            for text in texts:
                inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                logits = global_model(**inputs).logits
                confidences.append(torch.softmax(logits, dim=1).max().item())
        return np.array(confidences)

    # Score the member set first, then the non-member set.
    member_max_probs = top_class_confidence(member_data['Text'])
    non_member_max_probs = top_class_confidence(non_member_data['Text'])

    # Membership labels in the same order as the concatenated scores.
    membership = np.concatenate([
        np.ones(len(member_max_probs)),
        np.zeros(len(non_member_max_probs)),
    ])
    confidence_scores = np.concatenate([member_max_probs, non_member_max_probs])

    # AUC of "confidence predicts membership".
    auc = roc_auc_score(membership, confidence_scores)
    print(f"Membership Inference Attack AUC: {auc:.4f}")

# Example usage:

# Members must be rows the models were actually trained on. Each client model
# was fitted on the training part of
# train_test_split(client_data['Text'], client_data['Emotion'], test_size=0.2, random_state=42),
# so repeating that deterministic split recovers the true member rows and the
# held-out rows. A fresh split of the whole of emotion_data would label as
# "members" mostly rows that no client ever saw.
member_frames = []
non_member_frames = []
for client_data in clients_data:
    member_texts, held_out_texts, member_labels, held_out_labels = train_test_split(
        client_data['Text'], client_data['Emotion'], test_size=0.2, random_state=42
    )
    member_frames.append(pd.DataFrame({'Text': member_texts, 'Emotion': member_labels}))
    non_member_frames.append(pd.DataFrame({'Text': held_out_texts, 'Emotion': held_out_labels}))

# Pool the clients (the global model is their average). Drop repeated texts so
# rows shared between clients are not counted more than once.
member_data = pd.concat(member_frames, ignore_index=True).drop_duplicates(subset='Text')
non_member_data = pd.concat(non_member_frames, ignore_index=True).drop_duplicates(subset='Text')
# A text that any client trained on is a member, even if another client held it out.
non_member_data = non_member_data[~non_member_data['Text'].isin(member_data['Text'])]

# Perform the membership inference attack
membership_inference_attack(global_model, member_data, non_member_data, tokenizer, device)


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# Function to generate linkage attack data for local models
def generate_linkage_data(models, clients_data, tokenizer, device):
    """Collect each client model's class probabilities on that client's own data.

    For client ``i``, ``models[i]`` is run over ``clients_data[i]`` in order and
    every softmax output row is recorded together with the client index ``i``.

    Args:
        models: Sequence of classifiers, one per client, exposing ``.logits``.
        clients_data: Sequence of frames with ``'Text'`` and integer-encoded
            ``'Emotion'`` columns, index-aligned with ``models``.
        tokenizer: Tokenizer passed to ``TextDataset``.
        device: Device the input tensors are moved to.

    Returns:
        ``(all_predictions, all_labels)``: an array with one row of class
        probabilities per sample, and an array holding the index of the client
        each row came from.
    """
    all_predictions = []
    all_labels = []

    for i, client_data in enumerate(clients_data):
        # Convert the client's text and labels to dataset
        dataset = TextDataset(client_data['Text'], client_data['Emotion'], tokenizer)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        model = models[i]
        # The training loop leaves the models in train mode. Without eval(),
        # dropout stays active and the probabilities are noisy and
        # non-reproducible. The previous mode is restored afterwards so the
        # caller's models are left as they were found.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                for batch in data_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask)
                    probabilities = torch.softmax(outputs.logits, dim=1)

                    all_predictions.extend(probabilities.cpu().numpy())
                    # Assign the client's index as the label
                    all_labels.extend([i] * len(probabilities))
        finally:
            model.train(was_training)

    return np.array(all_predictions), np.array(all_labels)

# Generate linkage attack data: one row of class probabilities per sample,
# plus the index of the client the sample belongs to.
all_predictions, all_labels = generate_linkage_data(models, clients_data, tokenizer, device)

# Convert labels to one-hot encoding for AUC calculation
all_labels_one_hot = label_binarize(all_labels, classes=list(range(num_clients)))

# Calculate AUC for each client
# NOTE(review): the columns of all_predictions are softmax outputs over the
# emotion classes, so column i is the probability of emotion class i, not a
# score for "this sample belongs to client i". The AUC below therefore measures
# how well emotion-class-i confidence separates client i from the rest.
# Confirm that this is the intended linkage signal. It also assumes the
# classifier has at least num_clients output classes.
auc_scores = []
for i in range(num_clients):
    # Calculate AUC for the current client (one-vs-rest)
    auc = roc_auc_score(all_labels_one_hot[:, i], all_predictions[:, i])
    auc_scores.append(auc)
    print(f"AUC for Client {i}: {auc:.4f}")

# Calculate macro-average AUC (average AUC across all clients)
macro_auc = np.mean(auc_scores)
print(f"Macro-Average AUC: {macro_auc:.4f}")
