In [15]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
import time

In [16]:
# # Load and preprocess the data
# DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"
# dataset = pd.read_csv('C:/Users/IDEH/Desktop/Sentimental/Data/Sentiment140.csv',
#                       encoding=DATASET_ENCODING, names=DATASET_COLUMNS)


In [17]:
# sentiment_map = {0: 0, 4: 1}  # 0: negative, 1: positive
# dataset.loc[:, 'sentiment'] = dataset['sentiment'].map(sentiment_map)

In [18]:
# def preprocess(textdata):
#     processedText = []
#     wordLemm = WordNetLemmatizer()
#     urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
#     userPattern = '@[^\s]+'
#     alphaPattern = "[^a-zA-Z0-9]"
#     sequencePattern = r"(.)\1\1+"
#     seqReplacePattern = r"\1\1"

#     for tweet in tqdm(textdata, desc="Processing tweets", unit="tweet"):
#         tweet = tweet.lower()
#         tweet = re.sub(urlPattern, ' URL', tweet)
#         tweet = re.sub(userPattern, ' USER', tweet)
#         tweet = re.sub(alphaPattern, " ", tweet)
#         tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
#         tweetwords = ''
#         for word in tweet.split():
#             word = wordLemm.lemmatize(word)
#             tweetwords += (word + ' ')
#         processedText.append(tweetwords)

#     return processedText

# processed_text = preprocess(dataset['text'].values)

In [19]:
# Load the preprocessed data with a progress bar
preprocessed_df = pd.read_csv('C:/Users/IDEH/Desktop/Sentimental/Data/preprocessed_data.csv')

# Use tqdm to show progress for converting text column to list
processed_text = [text for text in tqdm(preprocessed_df['text'], desc="Loading text data")]


Loading text data: 100%|████████████████████████████████████████████████| 1600000/1600000 [00:00<00:00, 2019183.74it/s]


In [20]:
# Use tqdm to show progress for converting sentiment column to numpy array
Y = [sentiment for sentiment in tqdm(preprocessed_df['sentiment'], desc="Loading sentiment data")]
Y = np.array(Y)


Loading sentiment data: 100%|███████████████████████████████████████████| 1600000/1600000 [00:00<00:00, 2781176.20it/s]


In [21]:
# Tokenization and Padding
max_features = 10000
max_len = 20  # Define max_len
tokenizer = Tokenizer(num_words=max_features, split=' ')

# Use tqdm to show progress for fitting the tokenizer
tokenizer.fit_on_texts(tqdm(processed_text, desc="Fitting tokenizer"))

Fitting tokenizer: 100%|██████████████████████████████████████████████████| 1600000/1600000 [00:37<00:00, 42679.74it/s]


In [22]:
# Use tqdm to show progress for converting texts to sequences
X = [tokenizer.texts_to_sequences([text])[0] for text in tqdm(processed_text, desc="Converting texts to sequences")]


Converting texts to sequences: 100%|██████████████████████████████████████| 1600000/1600000 [00:30<00:00, 52811.59it/s]


In [23]:
# Use tqdm to show progress for padding sequences
X = pad_sequences(tqdm(X, desc="Padding sequences"), maxlen=max_len)

Padding sequences: 100%|████████████████████████████████████████████████| 1600000/1600000 [00:00<00:00, 3155161.44it/s]


In [24]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [25]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1280000, 20), (320000, 20), (1280000,), (320000,))

In [26]:
# Create 10 clients
num_clients = 10
clients_data = []
samples_per_label = 10  # Number of samples per label per client

for i in tqdm(range(num_clients), desc="Creating clients"):
    client_X, client_y = [], []
    for label in [0, 1]:
        idx = np.where(y_train == label)[0]
        if len(idx) >= samples_per_label:
            selected_idx = np.random.choice(idx, size=samples_per_label, replace=False)
        else:
            selected_idx = np.random.choice(idx, size=len(idx), replace=False)  # Select all available samples
            print(f"Not enough samples for label {label} in client {i + 1}, selected {len(selected_idx)} samples")
        client_X.extend(X_train[selected_idx])
        client_y.extend(y_train[selected_idx])
    clients_data.append((np.array(client_X), np.array(client_y)))

Creating clients:  80%|████████████████████████████████████████████████████             | 8/10 [00:00<00:00, 38.16it/s]

Not enough samples for label 1 in client 1, selected 0 samples
Not enough samples for label 1 in client 2, selected 0 samples
Not enough samples for label 1 in client 3, selected 0 samples
Not enough samples for label 1 in client 4, selected 0 samples
Not enough samples for label 1 in client 5, selected 0 samples
Not enough samples for label 1 in client 6, selected 0 samples
Not enough samples for label 1 in client 7, selected 0 samples
Not enough samples for label 1 in client 8, selected 0 samples


Creating clients: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 38.15it/s]

Not enough samples for label 1 in client 9, selected 0 samples
Not enough samples for label 1 in client 10, selected 0 samples





Initialize Variables:<br/>
num_clients = 10: Specifies the number of clients.
clients_data = []: Initializes an empty list to store the data for each client.
Loop Through Each Client:<br/>
for i in range(num_clients): Loops through each client (from 0 to 9).<br/>
Initialize Client Data:<br/>
client_X, client_y = [], []:
<br/> Initializes empty lists to store the features (client_X) and labels (client_y) for the current client.
Loop Through Each Label:<br/>
for label in [0, 1]:<br/> Loops through each label (0 and 1).<br/>
Select Data for Each Label:<br/>
idx = np.where(y_train == label)[0]:<br/> Finds the indices of all samples in y_train that have the current label.<br/>
selected_idx = np.random.choice(idx, size=10, replace=False): Randomly selects 10 indices from the found indices without replacement.
Add Selected Data to Client Data:<br/>
client_X.extend(X_train[selected_idx]): Adds the selected features from X_train to client_X.<br/>
client_y.extend(y_train[selected_idx]): Adds the selected labels from y_train to client_y.<br/>
Store Client Data:<br/>
clients_data.append((np.array(client_X), np.array(client_y))): Converts client_X and client_y to numpy arrays and appends them as a tuple to clients_data.

In [28]:
clients_data[1]

(array([[   0,    0,    0,    0,    0,    0,    0,   23,  105,  142,   72,
         1684,  326,   16,    1,   65,   68,   98, 1530, 2612],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    1,   54,
            5,  616,    7,  176, 3293,   24,  934,   91,   56],
        [ 331,    8,   23,  177,   25,   24, 2913,   55, 4824,    1,  148,
           72,  104, 2913,   62,   93,    3,    4, 2062,  173],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            1,   20,    5,  622, 7497,   16,    7,  376, 2599],
        [   0,    0,    0,    0,    0,    0,    0,  317,  236,   22,    5,
          159,  289,   73,   62,  114,   41,  247,   16,   45],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,  199,  331,    8, 4118,   82,    5,  509],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    2,   86,   69,    3,  236,  241,  150,   45],
        [   0,    0,    0, 

In [27]:
def create_model():
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [31]:
from tqdm import tqdm

def federated_learning(X_train, y_train, X_test, y_test, num_clients=10, num_rounds=5, local_epochs=3):
    # Create client data
    clients_data = []
    samples_per_client = len(X_train) // num_clients
    for i in tqdm(range(num_clients), desc="Creating clients"):
        start = i * samples_per_client
        end = (i + 1) * samples_per_client
        clients_data.append((X_train[start:end], y_train[start:end]))

    # Initialize global model
    global_model = create_model()

    for round in tqdm(range(num_rounds), desc="Federated learning rounds"):
        print(f"Round {round + 1}/{num_rounds}")
        
        client_models = []
        
        for i, (client_X, client_y) in enumerate(tqdm(clients_data, desc="Training clients", leave=False)):
            print(f"Training on client {i + 1}/{num_clients}")
            client_model = tf.keras.models.clone_model(global_model)
            client_model.set_weights(global_model.get_weights())
            client_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile the model
            client_model.fit(client_X, client_y, epochs=local_epochs, batch_size=32, verbose=0)
            client_models.append(client_model)
        
        # Average the models
        averaged_weights = [np.zeros_like(w) for w in global_model.get_weights()]
        for client_model in tqdm(client_models, desc="Averaging models", leave=False):
            client_weights = client_model.get_weights()
            for i in range(len(client_weights)):
                averaged_weights[i] += client_weights[i]
        averaged_weights = [w / num_clients for w in averaged_weights]
        
        global_model.set_weights(averaged_weights)
        
        # Evaluate global model
        test_loss, test_accuracy = global_model.evaluate(X_test, y_test, verbose=0)
        print(f"Test Accuracy: {test_accuracy:.4f}")
    
    return global_model



In [32]:
# Run Federated Learning
start_time = time.time()
final_model = federated_learning(X_train, y_train, X_test, y_test)
end_time = time.time()
print(f"Total training time: {end_time - start_time:.2f} seconds")


Creating clients: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]
Federated learning rounds:   0%|                                                                 | 0/5 [00:00<?, ?it/s]

Round 1/5



Training clients:   0%|                                                                         | 0/10 [00:00<?, ?it/s][A

Training on client 1/10



Federated learning rounds:   0%|                                                                 | 0/5 [05:22<?, ?it/s][A


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm

def federated_learning(X_train, y_train, X_test, y_test, num_clients=10, num_rounds=5, local_epochs=3, batch_size=32):
    # Create client data
    clients_data = []
    samples_per_client = len(X_train) // num_clients
    for i in range(num_clients):
        start = i * samples_per_client
        end = (i + 1) * samples_per_client
        clients_data.append((X_train[start:end], y_train[start:end]))

    # Initialize global model
    global_model = create_model()

    # Main federated learning loop
    with tqdm(total=num_rounds, desc="Federated Learning Rounds") as pbar_rounds:
        for round in range(num_rounds):
            client_models = []
            
            # Client training loop
            with tqdm(total=num_clients, desc=f"Round {round+1} Client Training", leave=False) as pbar_clients:
                for i, (client_X, client_y) in enumerate(clients_data):
                    client_model = tf.keras.models.clone_model(global_model)
                    client_model.set_weights(global_model.get_weights())
                    client_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
                    
                    # Train client model
                    history = client_model.fit(
                        client_X, client_y, 
                        epochs=local_epochs, 
                        batch_size=batch_size, 
                        verbose=0,
                        validation_split=0.1
                    )
                    
                    client_models.append(client_model)
                    pbar_clients.update(1)
                    pbar_clients.set_postfix({'Client Acc': f"{history.history['accuracy'][-1]:.4f}"})

            # Average the models
            averaged_weights = [np.zeros_like(w) for w in global_model.get_weights()]
            for client_model in client_models:
                client_weights = client_model.get_weights()
                for i in range(len(client_weights)):
                    averaged_weights[i] += client_weights[i]
            averaged_weights = [w / num_clients for w in averaged_weights]
            
            global_model.set_weights(averaged_weights)
            
            # Evaluate global model
            test_loss, test_accuracy = global_model.evaluate(X_test, y_test, verbose=0)
            pbar_rounds.update(1)
            pbar_rounds.set_postfix({'Test Acc': f"{test_accuracy:.4f}"})

    return global_model

# Run Federated Learning
start_time = time.time()
final_model = federated_learning(X_train, y_train, X_test, y_test, num_clients=10, num_rounds=5, local_epochs=3, batch_size=64)
end_time = time.time()
print(f"Total training time: {end_time - start_time:.2f} seconds")

Federated Learning Rounds:   0%|                                                                 | 0/5 [00:00<?, ?it/s]
Round 1 Client Training:   0%|                                                                  | 0/10 [00:00<?, ?it/s][A