In [26]:
import numpy as np
import pandas as pd
import re
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Make data directory if it doesn't exist
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/sentiment140-subset.csv.zip -P data
!unzip -n -d data data/sentiment140-subset.csv.zip

# Load the dataset
df = pd.read_csv("data/sentiment140-subset.csv")
dataset = df[['polarity', 'text']]

--2024-08-08 06:09:48--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/sentiment140-subset.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17927149 (17M) [application/zip]
Saving to: ‘data/sentiment140-subset.csv.zip’


2024-08-08 06:09:49 (114 MB/s) - ‘data/sentiment140-subset.csv.zip’ saved [17927149/17927149]

Archive:  data/sentiment140-subset.csv.zip
  inflating: data/sentiment140-subset.csv  


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   polarity  500000 non-null  int64 
 1   text      500000 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.6+ MB


In [4]:
# Balance the dataset
minority_count = dataset['polarity'].value_counts().min()
balanced_df = pd.concat([
    dataset[dataset['polarity'] == 0].sample(minority_count, replace=False),
    dataset[dataset['polarity'] == 1].sample(minority_count, replace=False),
]).sample(frac=1).reset_index(drop=True)

In [5]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499450 entries, 0 to 499449
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   polarity  499450 non-null  int64 
 1   text      499450 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.6+ MB


In [7]:
# Preprocessing function
def preprocess(textdata):
    processedText = []
    wordLemm = WordNetLemmatizer()
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = '@[^\s]+'
    alphaPattern = "[^a-zA-Z0-9]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    for tweet in tqdm(textdata, desc="Processing tweets", unit="tweet"):
        tweet = str(tweet).lower()
        tweet = re.sub(urlPattern, ' URL', tweet)
        tweet = re.sub(userPattern, ' USER', tweet)
        tweet = re.sub(alphaPattern, " ", tweet)
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        tweetwords = ''
        for word in tweet.split():
            word = wordLemm.lemmatize(word)
            tweetwords += (word + ' ')
        processedText.append(tweetwords)
    return processedText

processed_text = preprocess(balanced_df['text'].values)


Processing tweets: 100%|██████████| 499450/499450 [00:45<00:00, 10971.76tweet/s]


In [14]:
processed_text[1]

'man not even a 10 minute drunken phone convo with one of my bmas can make me feel better sad time man i love you '

In [15]:
# Use tqdm to show progress for converting sentiment column to numpy array
Y = [sentiment for sentiment in tqdm(balanced_df['polarity'], desc="Loading sentiment data")]
Y = np.array(Y)


Loading sentiment data: 100%|██████████| 499450/499450 [00:00<00:00, 1477466.53it/s]


In [8]:
# Tokenization and Padding
max_features = 10000
max_len = 20  # Define max_len
tokenizer = Tokenizer(num_words=max_features, split=' ')

# Use tqdm to show progress for fitting the tokenizer
tokenizer.fit_on_texts(tqdm(processed_text, desc="Fitting tokenizer"))

Fitting tokenizer: 100%|██████████| 499450/499450 [00:12<00:00, 39716.55it/s]


In [9]:
# Use tqdm to show progress for converting texts to sequences
X = [tokenizer.texts_to_sequences([text])[0] for text in tqdm(processed_text, desc="Converting texts to sequences")]


Converting texts to sequences: 100%|██████████| 499450/499450 [00:11<00:00, 42832.23it/s]


In [10]:
# Use tqdm to show progress for padding sequences
X = pad_sequences(tqdm(X, desc="Padding sequences"), maxlen=max_len)

Padding sequences: 100%|██████████| 499450/499450 [00:00<00:00, 1407795.75it/s]


In [16]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [17]:
# Create 10 clients
num_clients = 10
clients_data = []
samples_per_label = 10  # Number of samples per label per client

for i in tqdm(range(num_clients), desc="Creating clients"):
    client_X, client_y = [], []
    for label in [0, 1]:
        idx = np.where(y_train == label)[0]
        if len(idx) >= samples_per_label:
            selected_idx = np.random.choice(idx, size=samples_per_label, replace=False)
        else:
            selected_idx = np.random.choice(idx, size=len(idx), replace=False)  # Select all available samples
            print(f"Not enough samples for label {label} in client {i + 1}, selected {len(selected_idx)} samples")
        client_X.extend(X_train[selected_idx])
        client_y.extend(y_train[selected_idx])
    clients_data.append((np.array(client_X), np.array(client_y)))

Creating clients: 100%|██████████| 10/10 [00:00<00:00, 64.66it/s]


In [18]:
clients_data[1]

(array([[   0,    0,    2,   85,   29,   92,  861, 1035,    5,  360,  181,
         1565,   27,  498,   59,  157,   11,    5,  270,  139],
        [   0,    0,    0,    0,    0,    2,   54,    6,    1,   23, 1673,
           11,    9,    8,    1,  154,   15,   71, 6678,  129],
        [ 290,   15,   38,    7,  396,   79,    1,   66,  465,    3,   20,
           96,  878,  546,    4,  278, 2248, 1808,   59, 2223],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    2,  148, 4598,  544,   18, 2229],
        [   0,    0,    0,    0,    0,    0,    0,    0,    1,   66, 4843,
            7, 7703,  530, 1452,    6,  524, 1871,   58,  139],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    2,    1,   68,    3,   25,   24,    9],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,   10,   26,   45],
        [  11,   62,   96, 

In [19]:
def create_model():
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [30]:
def federated_learning(X_train, y_train, X_test, y_test, num_clients=1, num_rounds=1, local_epochs=1):
    # Split the data among clients
    client_data = np.array_split(X_train, num_clients)
    client_labels = np.array_split(y_train, num_clients)

    # Initialize the global model
    global_model = create_model()
    global_model.build(input_shape=(None, X_train.shape[1]))  # Ensure the model is built

    for round_num in range(num_rounds):
        print(f"Round {round_num+1}/{num_rounds}")

        # Initialize the list to store client weights
        client_weights_list = []

        for client_num in range(num_clients):
            print(f"Training on client {client_num+1}/{num_clients}")

            # Create a new model for each client
            client_model = create_model()
            client_model.build(input_shape=(None, X_train.shape[1]))  # Ensure the model is built
            client_model.set_weights(global_model.get_weights())

            # Train the client model
            client_model.fit(client_data[client_num], client_labels[client_num], epochs=local_epochs, verbose=0)

            # Append the client weights to the list
            client_weights_list.append(client_model.get_weights())

        # Average the client weights
        averaged_weights = [np.zeros_like(w) for w in client_weights_list[0]]
        for client_weights in client_weights_list:
            for i in range(len(client_weights)):
                averaged_weights[i] += client_weights[i]
        averaged_weights = [w / num_clients for w in averaged_weights]

        # Set the global model weights to the averaged weights
        global_model.set_weights(averaged_weights)

    # Evaluate the global model
    loss, accuracy = global_model.evaluate(X_test, y_test)
    print(f"Final model accuracy: {accuracy}")

    return global_model

In [31]:
# Run Federated Learning
start_time = time.time()
final_model = federated_learning(X_train, y_train, X_test, y_test)
end_time = time.time()
print(f"Total training time: {end_time - start_time:.2f} seconds")

Round 1/1
Training on client 1/1
[1m3122/3122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.8106 - loss: 0.4145
Final model accuracy: 0.8095705509185791
Total training time: 333.32 seconds
