In [None]:
# Install necessary libraries
!pip install torch pandas scikit-learn torchtext opacus gensim matplotlib

# Verify GPU availability
import torch
print("GPU available: ", torch.cuda.is_available())
print("GPU name: ", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import torch
import pandas as pd
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from opacus import PrivacyEngine
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import gensim.downloader as api
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from nltk.corpus import stopwords
import re
import nltk

# Download stopwords
# Fetches the NLTK 'stopwords' corpus that clean_text reads through
# stopwords.words('english').
nltk.download('stopwords')

# Load and prepare the dataset
# The path is relative to the notebook's working directory. The code below reads
# the 'Text' and 'Emotion' columns of this frame.
df = pd.read_csv('Emotion_final.csv')

# Data cleaning function
_ENGLISH_STOPWORDS = None  # lazily-filled cache so the NLTK corpus is read only once


def _english_stopwords():
    """Return the NLTK English stopword list as a cached frozenset."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise a piece of text for tokenisation.

    Steps, in order: drop every character that is not an ASCII letter or
    whitespace, lowercase the result, then remove stopwords and re-join the
    remaining words with single spaces.

    Args:
        text: The string to clean.
        stop_words: Optional container of lowercase words to remove. When
            omitted, the NLTK English stopword list is used (loaded once and
            cached as a frozenset instead of being re-read for every word).

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    # Membership test against a set is O(1) per word.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply data cleaning: every entry of the 'Text' column is replaced by its
# clean_text() result (the column is overwritten in place).
df['Text'] = [clean_text(raw_text) for raw_text in df['Text']]

# Encode labels: fit the encoder on the 'Emotion' column, then replace the
# column with the encoder's integer codes.
label_encoder = LabelEncoder().fit(df['Emotion'])
df['Emotion'] = label_encoder.transform(df['Emotion'])

# Tokenization and Vocabulary Creation with pre-trained FastText embeddings
tokenizer = get_tokenizer('basic_english')
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

def get_vector(token):
    """Return the embedding for ``token`` as a tensor.

    The vector is read from the module-level ``fasttext_model``. If that lookup
    raises ``KeyError``, a zero vector of length ``fasttext_model.vector_size``
    is returned instead.
    """
    try:
        embedding = fasttext_model[token]
    except KeyError:
        return torch.zeros(fasttext_model.vector_size)
    return torch.tensor(embedding)

# Count every token the tokenizer produces over the 'Text' column.
counter = Counter(token for line in df['Text'] for token in tokenizer(line))

# Only the distinct tokens (the counter's keys) feed the vocabulary;
# "<unk>" is registered as a special token and used as the default index.
vocab = build_vocab_from_iterator([counter.keys()], specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Manually map the FastText vectors to the vocabulary: one row per vocabulary
# entry, in index order.
vectors = [get_vector(token) for token in vocab.get_itos()]
vocab.vectors = torch.stack(vectors)

def tokenize_and_pad(text_iter, tokenizer, vocab, max_length=50):
    """Convert texts to a fixed-width matrix of token ids.

    Args:
        text_iter: Iterable of strings.
        tokenizer: Callable mapping a string to a sequence of tokens.
        vocab: Mapping-like object; ``vocab[token]`` yields the integer id.
        max_length: Number of columns in the result. Longer sequences are
            truncated, shorter ones are right-padded with 0.

    Returns:
        A ``torch.long`` tensor of shape ``(number of texts, max_length)``.
    """
    id_sequences = []
    for item in text_iter:
        ids = [vocab[token] for token in tokenizer(item)]
        id_sequences.append(torch.tensor(ids, dtype=torch.long))

    padded_texts = torch.zeros((len(id_sequences), max_length), dtype=torch.long)
    for row_idx, sequence in enumerate(id_sequences):
        keep = min(len(sequence), max_length)
        padded_texts[row_idx, :keep] = sequence[:keep]
    return padded_texts

# Split data into training and testing (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Emotion'],
    test_size=0.2,
    random_state=42,
)

# Prepare data: padded token-id matrices for the texts, int64 tensors for the labels
train_data = tokenize_and_pad(list(X_train), tokenizer, vocab)
test_data = tokenize_and_pad(list(X_test), tokenizer, vocab)
train_labels = torch.tensor(y_train.to_numpy(), dtype=torch.long)
test_labels = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Define the Dataset class
class CustomTextDataset(Dataset):
    """Dataset that serves ``(text, label)`` pairs from two parallel containers.

    Args:
        texts: Indexable container of inputs.
        labels: Indexable container of targets; its length defines the
            dataset size.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        # Size is taken from the labels container only.
        return len(self.labels)

    def __getitem__(self, idx):
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]
        return sample_text, sample_label

# Manually split the dataset for federated clients: contiguous, non-overlapping
# slices of the training tensors, one per client.
num_clients = 5
client_data_size = len(train_data) // num_clients
client_datasets = []

start = 0
for i in range(num_clients):
    # Every client except the last receives exactly client_data_size rows;
    # the last one also absorbs the remainder of the integer division.
    if i < num_clients - 1:
        end = start + client_data_size
    else:
        end = len(train_data)
    client_texts, client_labels = train_data[start:end], train_labels[start:end]
    client_datasets.append(CustomTextDataset(client_texts, client_labels))
    start = end

# Model Definition with Dropout and Increased LSTM Layers
class SentimentAnalysisModel(nn.Module):
    """Frozen-embedding -> 3-layer LSTM -> linear classifier.

    The embedding table is initialised from the module-level ``vocab.vectors``
    and excluded from gradient updates. The classifier reads the LSTM output
    at the last time step of each sequence.

    NOTE(review): ``tokenize_and_pad`` right-pads with 0, so for inputs shorter
    than the padded width the last time step is a padding position - confirm
    this readout is intended.
    NOTE(review): ``federated_train`` passes this model to Opacus'
    ``make_private``; Opacus provides ``DPLSTM`` as its LSTM layer - confirm
    ``nn.LSTM`` is accepted by the installed Opacus version.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
        hidden_dim: LSTM hidden size.
        output_dim: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Overwrite the random initialisation with the pre-trained vectors, then freeze.
        self.embedding.weight.data.copy_(vocab.vectors)
        self.embedding.weight.requires_grad_(False)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=3,
            batch_first=True,
            dropout=0.3,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences."""
        sequence_out, _ = self.lstm(self.embedding(text))
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Prefer CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Global (server-side) model: 300-d embeddings, 256 hidden units,
# one output per encoded label class.
global_model = SentimentAnalysisModel(
    vocab_size=len(vocab),
    embed_dim=300,
    hidden_dim=256,
    output_dim=len(label_encoder.classes_),
).to(device)

# Define the evaluation function
def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    Switches the model to eval mode, runs it without gradient tracking and
    takes the highest-scoring class per sample as the prediction.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy, precision, recall, f1)``; the last three are
        weighted averages computed by scikit-learn.
    """
    model.eval()
    total_correct, total_samples = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predicted = torch.max(model(inputs), 1).indices
            total_correct += int((predicted == labels).sum())
            total_samples += labels.shape[0]
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = total_correct / total_samples
    precision, recall, f1 = (
        score_fn(all_labels, all_preds, average='weighted')
        for score_fn in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

def federated_train(model, client_datasets, epochs, device, apply_ldp=False):
    """Run federated averaging rounds with Opacus-wrapped local training.

    In each epoch, every client dataset trains a fresh ``SentimentAnalysisModel``
    that starts from the global weights and makes one pass over its data
    loader. The clients' state dicts are then averaged entry by entry, Gaussian
    noise (std 0.1) is added to every averaged entry, and the result is loaded
    into ``model`` in place.

    Args:
        model: Global model; read at the start of each client's training and
            overwritten at the end of each epoch.
        client_datasets: Iterable of datasets, one per client.
        epochs: Number of federated rounds.
        device: Device for the local models and batches.
        apply_ldp: When true, runs the gradient-noise branch after each
            client's training loop (see the review note on that branch).

    Returns:
        A list with one ``(epsilon, best_alpha)`` tuple per epoch, as reported
        by the privacy engine's accountant for ``delta=1e-5``.
    """
    criterion = nn.CrossEntropyLoss()
    privacy_metrics = []
    # A single engine (and therefore a single accountant) is shared by all
    # clients and all epochs.
    privacy_engine = PrivacyEngine(accountant="rdp")  # Use the new API
    for epoch in range(epochs):
        local_weights = []
        for client_data in client_datasets:
            # The local model is rebuilt from the module-level vocab / label_encoder
            # with the same hard-coded sizes used for the global model.
            local_model = SentimentAnalysisModel(len(vocab), 300, 256, len(label_encoder.classes_)).to(device)
            local_model.load_state_dict(model.state_dict())  # Start with global model weights
            optimizer = optim.Adam(local_model.parameters(), lr=0.001)
            data_loader = DataLoader(client_data, batch_size=32, shuffle=True)

            # Initialize PrivacyEngine
            # make_private returns wrapped versions of all three objects; the
            # wrapped ones are used from here on.
            local_model, optimizer, data_loader = privacy_engine.make_private(
                module=local_model,
                optimizer=optimizer,
                data_loader=data_loader,
                noise_multiplier=0.1,  # Reduce noise
                max_grad_norm=1.0
            )

            # Training loop
            # One pass over the client's (wrapped) data loader.
            for inputs, labels in data_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = local_model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            # Apply Local Differential Privacy (LDP) by adding noise to the local model updates before sending them to the server
            # NOTE(review): the noise is added to param.grad after the last
            # optimizer.step(); nothing in this function reads the gradients
            # afterwards, so the state_dict collected below looks unaffected
            # by this branch - confirm.
            if apply_ldp:
                for param in local_model.parameters():
                    if param.grad is not None:
                        param.grad += torch.normal(mean=0, std=0.1, size=param.grad.shape).to(device)

            # Adjust keys in the state_dict and gather the local weights
            # Every occurrence of '_module.' is removed from the key names so
            # they can be loaded into the unwrapped global model.
            adjusted_state_dict = {key.replace('_module.', ''): value for key, value in local_model.state_dict().items()}
            local_weights.append(adjusted_state_dict)

        # Aggregate weights and add noise for Central Differential Privacy (CDP)
        # NOTE(review): the noise is added to every state_dict entry, including
        # the embedding weights that the model marks requires_grad=False.
        global_weights = {k: torch.mean(torch.stack([w[k] for w in local_weights]), 0) for k in local_weights[0].keys()}
        for k, v in global_weights.items():
            global_weights[k] += torch.normal(mean=0, std=0.1, size=v.shape).to(device)  # CDP noise

        model.load_state_dict(global_weights)

        # Log privacy metrics
        epsilon, best_alpha = privacy_engine.accountant.get_privacy_spent(delta=1e-5)
        privacy_metrics.append((epsilon, best_alpha))
        print(f'Epoch {epoch+1} completed: (ε = {epsilon:.2f}, best α = {best_alpha})')

    return privacy_metrics



In [None]:
import pandas as pd
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from opacus import PrivacyEngine
from opacus.layers import DPLSTM
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import gensim.downloader as api
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

# Load and prepare the dataset
# NOTE(review): the CSV is read again here and this cell does not call
# clean_text, so 'Text' holds the raw strings - confirm that is intended.
df = pd.read_csv('Emotion_final.csv')

# Encode labels: fit the encoder on the 'Emotion' column, then replace the
# column with the encoder's integer codes.
label_encoder = LabelEncoder().fit(df['Emotion'])
df['Emotion'] = label_encoder.transform(df['Emotion'])

# Tokenization and Vocabulary Creation with pre-trained FastText embeddings
tokenizer = get_tokenizer('basic_english')
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

def get_vector(token):
    """Fetch the vector for ``token`` from the module-level ``fasttext_model``.

    Returns:
        A tensor built from ``fasttext_model[token]``, or an all-zero tensor of
        length ``fasttext_model.vector_size`` when the lookup raises ``KeyError``.
    """
    try:
        vector = torch.tensor(fasttext_model[token])
    except KeyError:
        vector = torch.zeros(fasttext_model.vector_size)
    return vector

# Tally the tokens of every text in the 'Text' column.
counter = Counter()
for line_tokens in map(tokenizer, df['Text']):
    counter.update(line_tokens)

# The vocabulary is built from the distinct tokens only; "<unk>" is a special
# token and also the index returned for unknown tokens.
vocab = build_vocab_from_iterator([counter.keys()], specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Manually map the FastText vectors to the vocabulary (row i <-> vocabulary index i)
vectors = list(map(get_vector, vocab.get_itos()))
vocab.vectors = torch.stack(vectors)

def tokenize_and_pad(text_iter, tokenizer, vocab, max_length=None):
    """Convert texts to a right-padded matrix of token ids.

    Args:
        text_iter: Iterable of strings.
        tokenizer: Callable mapping a string to a sequence of tokens.
        vocab: Mapping-like object; ``vocab[token]`` yields the integer id.
        max_length: Optional number of columns. When ``None`` (the default),
            the width is the length of the longest tokenised text in this
            call, so separate calls can return different widths. When given,
            longer sequences are truncated to it.

    Returns:
        A ``torch.long`` tensor of shape ``(number of texts, width)``, padded
        with 0. Empty input yields a tensor with zero rows instead of raising.
    """
    tokenized_texts = [
        torch.tensor([vocab[token] for token in tokenizer(item)], dtype=torch.long)
        for item in text_iter
    ]
    if max_length is None:
        # default=0 keeps max() from raising ValueError when there are no texts.
        max_length = max((len(t) for t in tokenized_texts), default=0)
    padded_texts = torch.zeros((len(tokenized_texts), max_length), dtype=torch.long)
    for i, t in enumerate(tokenized_texts):
        length = min(len(t), max_length)
        padded_texts[i, :length] = t[:length]
    return padded_texts

# Split data into training and testing (20% held out, seed 42)
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['Emotion'],
    test_size=0.2, random_state=42,
)

# Prepare data
# Texts become padded token-id matrices; labels become int64 tensors.
train_data = tokenize_and_pad(list(X_train), tokenizer, vocab)
test_data = tokenize_and_pad(list(X_test), tokenizer, vocab)
train_labels = torch.tensor(y_train.values, dtype=torch.long)
test_labels = torch.tensor(y_test.values, dtype=torch.long)

# Define the Dataset class
class CustomTextDataset(Dataset):
    """Pairs each entry of ``texts`` with the entry of ``labels`` at the same index."""

    def __init__(self, texts, labels):
        # Both containers are stored as given; no copy is made.
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Number of samples, taken from the labels container."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair at position ``idx``."""
        return (self.texts[idx], self.labels[idx])

# Manually split the dataset for federated clients
# Each client receives a contiguous slice of the training tensors; slices do
# not overlap and together cover all rows.
num_clients = 5
client_data_size = len(train_data) // num_clients
client_datasets = []

start = 0
for i in range(num_clients):
    # The last client takes everything that is left, including the remainder
    # of the integer division above.
    end = len(train_data) if i == num_clients - 1 else start + client_data_size
    client_texts = train_data[start:end]
    client_labels = train_labels[start:end]
    client_datasets += [CustomTextDataset(client_texts, client_labels)]
    start = end

# Model Definition with Dropout and Increased LSTM Layers
class SentimentAnalysisModel(nn.Module):
    """Frozen-embedding -> 3-layer DPLSTM -> dropout -> linear classifier.

    The embedding table is filled from the module-level ``vocab.vectors`` and
    excluded from gradient updates. The recurrent layer is Opacus' ``DPLSTM``.

    NOTE(review): ``tokenize_and_pad`` right-pads with 0, so for inputs shorter
    than the padded width the last time step read in ``forward`` is a padding
    position - confirm this readout is intended.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
        hidden_dim: Hidden size of the recurrent layer.
        output_dim: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Replace the random initialisation with the pre-trained vectors and freeze them.
        self.embedding.weight.data.copy_(vocab.vectors)
        self.embedding.weight.requires_grad_(False)
        self.lstm = DPLSTM(embed_dim, hidden_dim, num_layers=3, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences."""
        sequence_out, _ = self.lstm(self.embedding(text))
        last_step = sequence_out[:, -1, :]
        return self.fc(self.dropout(last_step))

# Use the GPU when one is visible to torch, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Server-side model: 300-d embeddings, 256 hidden units, one logit per label class.
global_model = SentimentAnalysisModel(
    vocab_size=len(vocab),
    embed_dim=300,
    hidden_dim=256,
    output_dim=len(label_encoder.classes_),
).to(device)

# Define the evaluation function
def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode (and left there), run without gradient
    tracking, and the highest-scoring class per sample is the prediction.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        weighted averages from scikit-learn; classes with no predicted or no
        true samples contribute 0 without emitting a warning. If the
        dataloader yields no samples, a ``RuntimeWarning`` is issued and all
        four values are 0.0 (instead of failing with ``ZeroDivisionError``).
    """
    import warnings

    model.eval()
    total_correct = 0
    total_samples = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total_samples == 0:
        # Every metric is undefined without samples; follow the same
        # "undefined -> 0" convention requested from scikit-learn below.
        warnings.warn(
            "evaluate() received no samples; returning 0.0 for every metric",
            RuntimeWarning,
        )
        return 0.0, 0.0, 0.0, 0.0

    accuracy = total_correct / total_samples
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    return accuracy, precision, recall, f1

def _with_gaussian_noise(state, noisy_keys, std):
    """Return a new state dict in which the entries named in ``noisy_keys`` get N(0, std^2) noise."""
    return {
        key: (value + std * torch.randn_like(value)) if key in noisy_keys else value
        for key, value in state.items()
    }


def _average_client_states(local_weights, trainable_keys):
    """Average the trainable entries across clients; pass all other entries through from the first client."""
    averaged = {}
    for key, first_value in local_weights[0].items():
        if key in trainable_keys:
            averaged[key] = torch.stack([w[key] for w in local_weights]).mean(dim=0)
        else:
            # Frozen parameters are never updated by local training, so every
            # client holds the same value; averaging would only add rounding error.
            averaged[key] = first_value
    return averaged


def federated_train(model, client_datasets, epochs, device, apply_ldp=False):
    """Run federated averaging rounds with Opacus-wrapped local training.

    In each epoch, every client trains a copy of the global model for one pass
    over its data under Opacus DP-SGD. The clients' weights are then averaged,
    server-side Gaussian noise is added, and the result is loaded into
    ``model`` in place.

    Noise is only ever added to trainable parameters. Frozen parameters (such
    as pre-trained embeddings with ``requires_grad=False``) are carried over
    unchanged, so they are not degraded round after round.

    Args:
        model: Global model; copied for each client and overwritten at the end
            of each epoch.
        client_datasets: Sequence of datasets, one per client.
        epochs: Number of federated rounds.
        device: Device for the local models and batches.
        apply_ldp: When true, each client adds Gaussian noise (std 0.1) to its
            trained weights before they are collected for aggregation.

    Returns:
        A list with one ``(epsilon, best_alpha)`` tuple per epoch, as reported
        by the privacy engine's accountant for ``delta=1e-5``.
    """
    import copy

    ldp_noise_std = 0.1
    cdp_noise_std = 0.1

    criterion = nn.CrossEntropyLoss()
    privacy_metrics = []
    # One engine (and one accountant) is shared by all clients and all epochs.
    privacy_engine = PrivacyEngine(accountant="rdp")  # Use the new API
    # State-dict keys of the parameters that local training actually updates.
    trainable_keys = {name for name, param in model.named_parameters() if param.requires_grad}

    for epoch in range(epochs):
        local_weights = []
        for client_data in client_datasets:
            # Start from an exact copy of the global model. This keeps the
            # architecture and requires_grad flags in sync with `model`
            # without relying on module-level globals or repeated sizes.
            local_model = copy.deepcopy(model).to(device)
            local_model.train()  # the copy inherits eval mode if `model` was evaluated before
            optimizer = optim.Adam(local_model.parameters(), lr=0.001)
            data_loader = DataLoader(client_data, batch_size=32, shuffle=True)

            # Initialize PrivacyEngine: make_private returns wrapped versions
            # of all three objects, which are used from here on.
            local_model, optimizer, data_loader = privacy_engine.make_private(
                module=local_model,
                optimizer=optimizer,
                data_loader=data_loader,
                noise_multiplier=0.1,  # Reduce noise
                max_grad_norm=1.0
            )

            # Training loop: one pass over the client's data.
            for inputs, labels in data_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = local_model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            # Adjust keys in the state_dict: the wrapped module prefixes
            # parameter names with '_module.', which the global model does not use.
            adjusted_state_dict = {
                key.replace('_module.', ''): value
                for key, value in local_model.state_dict().items()
            }

            # Apply Local Differential Privacy (LDP): perturb the weights that
            # are about to be shared. Noise on .grad after the last optimizer
            # step would never reach these weights.
            if apply_ldp:
                adjusted_state_dict = _with_gaussian_noise(adjusted_state_dict, trainable_keys, ldp_noise_std)

            local_weights.append(adjusted_state_dict)

        # Aggregate weights and add noise for Central Differential Privacy (CDP)
        global_weights = _average_client_states(local_weights, trainable_keys)
        global_weights = _with_gaussian_noise(global_weights, trainable_keys, cdp_noise_std)

        model.load_state_dict(global_weights)

        # Log privacy metrics
        epsilon, best_alpha = privacy_engine.accountant.get_privacy_spent(delta=1e-5)
        privacy_metrics.append((epsilon, best_alpha))
        print(f'Epoch {epoch+1} completed: (ε = {epsilon:.2f}, best α = {best_alpha})')

    return privacy_metrics

# Train and evaluate the model
# 20 federated rounds with the LDP branch switched on.
privacy_metrics = federated_train(
    model=global_model,
    client_datasets=client_datasets,
    epochs=20,
    device=device,
    apply_ldp=True,
)

# Held-out evaluation of the aggregated global model.
test_dataset = CustomTextDataset(test_data, test_labels)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)
accuracy, precision, recall, f1 = evaluate(
    model=global_model,
    dataloader=test_dataloader,
    device=device,
)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Plot Privacy vs Utility
# The x-axis is derived from the recorded metrics (one point per completed
# round), so it stays in step with the number of rounds passed to
# federated_train instead of repeating that number here.
epsilons = [metrics[0] for metrics in privacy_metrics]
epochs = range(1, len(epsilons) + 1)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)  # left panel of a 1x2 grid
plt.plot(epochs, epsilons, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Epsilon (ε)')
plt.title('Privacy (Epsilon) over Epochs')

