In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import warnings
warnings.filterwarnings('ignore')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
from bs4 import BeautifulSoup

In [None]:
# Load the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

**Short description of data**

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

**Ratio between classes**

In [None]:
# Pass the column by keyword: seaborn deprecated positional data vectors, and in
# newer releases the first positional argument is `data`, not `x`.
sns.countplot(x=df.sentiment)

**Check for null values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Sentiment label -> integer target used by the models (positive = 1, negative = 0).
classes = dict(positive=1, negative=0)

**Cleaning data -- remove HTML tags, URLs, special symbols, multiple spaces, leading spaces, single characters and stopwords**

In [None]:
# English stop words. They live under a dedicated name because later cells rebind
# the generic name `stop` to a timestamp, which would break clean_data on re-run.
STOP_WORDS = set(stopwords.words('english'))
stop = STOP_WORDS  # backward-compatible alias for anything still reading `stop`


def clean_data(text):
    """Normalise one raw review into a space-separated string of lowercase words.

    Steps, in order:
      1. strip HTML markup, keeping a space where a tag was so that the words on
         either side of e.g. ``<br />`` are not glued together;
      2. drop URLs (anything starting with ``http`` up to the next whitespace);
      3. replace every non-word character with a space;
      4. drop stand-alone single ASCII letters, wherever they occur;
      5. lowercase, then keep only purely alphabetic tokens that are not
         English stop words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The kept tokens joined by single spaces (may be empty).
    """
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    # \b...\b matches a lone letter at the start, at the end, or next to other
    # lone letters; the previous whitespace-delimited pattern missed those cases.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # split() already discards runs of whitespace, so no separate collapsing
    # step (or per-token strip) is required.
    tokens = (word.lower() for word in text.split())
    return " ".join(
        token for token in tokens
        if token.isalpha() and token not in STOP_WORDS
    )

In [None]:
# Clean every review in place and replace the text labels with their integer codes.
df['review'] = df['review'].apply(clean_data)
df['sentiment'] = df['sentiment'].map(classes)

**Word cloud for positive reviews**

In [None]:
# Word cloud of the cleaned text of all positive reviews (sentiment == 1).
plt.figure(figsize=(15, 15))
word_cloud = WordCloud(width=1600, height=800, max_words=2000)
word_cloud.generate(" ".join(df.loc[df['sentiment'] == 1, 'review']))
plt.imshow(word_cloud, interpolation='bilinear')

**Word cloud for negative reviews**

In [None]:
# Word cloud of the cleaned text of all negative reviews (sentiment == 0).
plt.figure(figsize=(15, 15))
word_cloud = WordCloud(width=1600, height=800, max_words=2000)
word_cloud.generate(" ".join(df.loc[df['sentiment'] == 0, 'review']))
plt.imshow(word_cloud, interpolation='bilinear')

In [None]:
# Cleaned review texts as a plain Python list (model inputs before vectorisation).
X = df['review'].tolist()

In [None]:
# Integer sentiment targets as a NumPy array.
y = df['sentiment'].to_numpy()

In [None]:
def dtm2wid(dtm, maxlen):
    """Convert a document-term matrix into fixed-length word-id sequences.

    For each row, every stored column index ``j`` becomes the word id ``j + 1``
    (id 0 is reserved for padding), repeated as many times as the stored count.
    Sequences shorter than ``maxlen`` are left-padded with 0; longer ones keep
    only their last ``maxlen`` ids.

    Parameters
    ----------
    dtm : scipy sparse matrix
        Row-iterable sparse matrix whose rows expose ``.indices`` and ``.data``
        (e.g. the CSR output of ``CountVectorizer``). Assumes each row stores a
        column index at most once, as canonical CSR does.
    maxlen : int
        Length of every output sequence.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(dtm.shape[0], maxlen)`` and dtype int64. The
        dtype is fixed so that empty documents cannot turn the result into floats.
    """
    x = np.zeros((dtm.shape[0], maxlen), dtype=np.int64)
    if maxlen == 0:
        # Nothing to fill; also avoids seq[-0:], which would select everything.
        return x

    for row_idx, row in enumerate(dtm):
        word_ids = (row.indices + 1).astype(np.int64)
        counts = row.data.astype(np.int64)
        seq = np.repeat(word_ids, counts)
        tail = seq[-maxlen:]  # truncate down to maxlen, keeping the end
        if tail.size:
            # Right-align so the padding zeros sit at the front of the row.
            x[row_idx, maxlen - tail.size:] = tail
    return x


**Make document-term matrix, word id sequences for train and test**

In [None]:
def prepare_data(X, y, maxlen=2000, test_size=0.25, random_state=0,
                 max_features=800000):
    """Split the corpus and build both model input representations.

    The texts are split into train/test parts, a binary 1-3-gram
    ``CountVectorizer`` is fitted on the training part only, and both parts are
    turned into document-term matrices and into padded word-id sequences.

    Parameters
    ----------
    X : list of str
        Cleaned documents.
    y : array-like
        Targets aligned with ``X``.
    maxlen : int, default 2000
        Length of each word-id sequence produced by ``dtm2wid``.
    test_size : float, default 0.25
        Fraction of the data held out for testing.
    random_state : int, default 0
        Seed for the train/test split.
    max_features : int, default 800000
        Vocabulary size cap passed to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words,
        maxlen)`` where ``num_words`` is the vocabulary size plus one (id 0 is
        the padding id).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    vectorizer = CountVectorizer(ngram_range=(1, 3), binary=True,
                                 token_pattern=r'\w+',
                                 max_features=max_features)
    dtm_train = vectorizer.fit_transform(X_train)
    dtm_test = vectorizer.transform(X_test)  # reuse the training vocabulary

    num_words = len(vectorizer.vocabulary_) + 1  # +1 for the padding id 0

    x_train = dtm2wid(dtm_train, maxlen)
    x_test = dtm2wid(dtm_test, maxlen)

    return dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen
    

**Function for computing Naive Bayes Log-Count Ratios**

These ratios capture the probability of a word appearing in a document in one class (i.e., positive) versus another (i.e., negative).

In [None]:
def pr(dtm, y, y_i):
    """Smoothed per-feature frequency within the documents of class ``y_i``.

    Sums the rows of ``dtm`` whose label equals ``y_i`` column-wise, then
    returns ``(column_sum + 1) / (number_of_such_rows + 1)``.
    """
    in_class = (y == y_i)
    feature_counts = dtm[in_class].sum(0)
    n_docs = in_class.sum()
    return (feature_counts + 1) / (n_docs + 1)

# nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, 
#                                                #y_train, 0))
# print(nbratios)
# nbratios = np.squeeze(np.asarray(nbratios))
# print(nbratios)

In [None]:
from keras import backend as K
from keras import regularizers
from keras.models import Model, Sequential
from keras.layers.core import Activation
from keras.layers import Input, Embedding, Flatten, dot, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import time

**Building NBSVM/NBLR model**

NBSVM is an approach to text classification proposed by Wang and Manning that takes a linear model such as SVM (or logistic regression) and infuses it with Bayesian probabilities by replacing word count features with Naive Bayes log-count ratios.

In [None]:
# def NBSVM(num_words, maxlen, nbratios=None):
#     embedding_mat = np.zeros((num_words, 1))
#     for i in range(1, num_words): # skip 0, the padding value
#         if nbratios is not None:
#             # if log-count ratios are supplied, then it's NBSVM
#             embedding_mat[i] = nbratios[i-1]
#         else:
#             # if log-count ratios are not supplied, 
#             # this reduces to a logistic regression
#             embedding_mat[i] = 1
            
#     input_layer = Input(shape=(maxlen,))
#     nb_layer = Embedding(num_words, 1, input_length=maxlen, weights=[embedding_mat],trainable=False)(input_layer)
#     x = Embedding(num_words, 1, input_length=maxlen, 
#                   embeddings_initializer='glorot_normal')(input_layer)
#     x = dot([nb_layer,x], axes=1)
#     x = Flatten()(x)
#     x = Dropout(0.5)(x)
#     x = Dense(64, activation='relu')(x)
#     x = Dense(2, kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.0001))(x) 
#     x = Activation('linear')(x)
    
#     model = Model(inputs=input_layer, outputs=x)
#     model.compile(loss='squared_hinge',
#                        optimizer='adadelta', metrics=['accuracy'])

#     return model

In [None]:
def NBLR(num_words, maxlen, nbratios=None):
    """Build and compile the Naive-Bayes logistic-regression Keras model.

    The model holds two 1-dimensional embeddings over the same word ids: a
    frozen one carrying the NB log-count ratios (or 1 for every word when no
    ratios are given) and a trainable one initialised with ``glorot_normal``.
    Their dot product along the sequence axis is flattened and passed through a
    sigmoid.

    Parameters
    ----------
    num_words : int
        Number of word ids, including the padding id 0.
    maxlen : int
        Length of the input word-id sequences.
    nbratios : array-like or None
        One log-count ratio per vocabulary entry; entry ``i - 1`` is assigned to
        word id ``i`` and entries beyond ``num_words - 1`` are ignored. With
        ``None`` the model reduces to a plain logistic regression.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy, Adam (learning rate 0.001) and the
        accuracy metric.
    """
    # Row 0 (the padding id) stays 0, so padding adds nothing to the dot product.
    embedding_mat = np.zeros((num_words, 1))
    if nbratios is not None:
        # Log-count ratios supplied: this is NBLR.
        ratios = np.asarray(nbratios).reshape(-1)
        embedding_mat[1:, 0] = ratios[:max(num_words - 1, 0)]
    else:
        # No ratios: every word gets weight 1, i.e. plain logistic regression.
        embedding_mat[1:, 0] = 1

    input_layer = Input(shape=(maxlen,))
    nb_layer = Embedding(num_words, 1, input_length=maxlen, weights=[embedding_mat],trainable=False)(input_layer)
    x = Embedding(num_words, 1, input_length=maxlen,
                  embeddings_initializer='glorot_normal')(input_layer)
    x = dot([nb_layer,x], axes=1)
    x = Flatten()(x)
    x = Activation('sigmoid')(x)

    model = Model(inputs=input_layer, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model

**Start learning with SVM**

In [None]:
# start = time.time()
# dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen = prepare_data(X,y)
# model = NBSVM(num_words, maxlen)
# history = model.fit(x_train, y_train,
#           batch_size=32,
#           epochs=6,
#           validation_data=(x_test, y_test))

# figure, axis = plt.subplots(2, 1, constrained_layout = True)
# axis[0].plot(history.history['loss'])
# axis[0].plot(history.history['val_loss'])
# axis[0].set_title('model loss')
# axis[0].set_xlabel('loss')
# axis[0].set_ylabel('epoch')
# axis[0].legend(['train', 'val'], loc='upper left')

# axis[1].plot(history.history['accuracy'])
# axis[1].plot(history.history['val_accuracy'])
# axis[1].set_title('model accuracy')
# axis[1].set_xlabel('accuracy')
# axis[1].set_ylabel('epoch')
# axis[1].legend(['train', 'val'], loc='upper left')

# plt.show()
# stop = time.time()
# print(f'Begin: {start}, finish: {stop}, summary time: {stop - start}')

**Start learning with Logistic regression**

In [None]:
# Train the plain logistic-regression variant (no NB ratios) and plot its curves.
start = time.time()
dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen = prepare_data(X,y)
model = NBLR(num_words, maxlen)
history = model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(x_test, y_test))

# One panel per metric: training curve first, validation curve second.
figure, axis = plt.subplots(2, 1, constrained_layout = True)
for ax, metric in zip(axis, ('loss', 'accuracy')):
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title('model ' + metric)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend(['train', 'val'], loc='upper left')

plt.show()
# Named `finish`, not `stop`: `stop` is the stop-word set that clean_data reads,
# and rebinding it to a float breaks any later call to clean_data.
finish = time.time()
print(f'Begin: {start}, finish: {finish}, summary time: {finish - start}')

**Start learning with NBSVM**

In [None]:
# start = time.time()
# dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen = prepare_data(X,y)
# nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
# nbratios = np.squeeze(np.asarray(nbratios))
# model = NBSVM(num_words, maxlen, nbratios=nbratios)
# history = model.fit(x_train, y_train,
#           batch_size=32,
#           epochs=10,
#           validation_data=(x_test, y_test))
# figure, axis = plt.subplots(2, 1, constrained_layout = True)

# axis[0].plot(history.history['loss'])
# axis[0].plot(history.history['val_loss'])
# axis[0].set_title('model loss')
# axis[0].set_xlabel('loss')
# axis[0].set_ylabel('epoch')
# axis[0].legend(['train', 'val'], loc='upper left')

# axis[1].plot(history.history['accuracy'])
# axis[1].plot(history.history['val_accuracy'])
# axis[1].set_title('model accuracy')
# axis[1].set_xlabel('accuracy')
# axis[1].set_ylabel('epoch')
# axis[1].legend(['train', 'val'], loc='upper left')

# plt.show()
# stop = time.time()
# print(f'Begin: {start}, finish: {stop}, summary time: {stop - start}')


**Start learning with NBLR**

In [None]:
# Train the NBLR variant: the frozen embedding carries the NB log-count ratios
# computed from the training document-term matrix.
start = time.time()
dtm_train, dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen = prepare_data(X,y)
nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
nbratios = np.squeeze(np.asarray(nbratios))
model = NBLR(num_words, maxlen, nbratios=nbratios)
history = model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(x_test, y_test))

# One panel per metric: training curve first, validation curve second.
figure, axis = plt.subplots(2, 1, constrained_layout = True)
for ax, metric in zip(axis, ('loss', 'accuracy')):
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title('model ' + metric)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend(['train', 'val'], loc='upper left')

plt.show()
# Named `finish`, not `stop`: `stop` is the stop-word set that clean_data reads,
# and rebinding it to a float breaks any later call to clean_data.
finish = time.time()
print(f'Begin: {start}, finish: {finish}, summary time: {finish - start}')

**Here comes Federated Learning with Flower simulation**

In [None]:
!pip install -U flwr["simulation"]

In [None]:
import flwr as fl

In [None]:
class FlowerClient(fl.client.NumPyClient):
    """Flower NumPy client wrapping one Keras model and one data partition."""

    def __init__(self, model, x_train, y_train, x_val, y_val) -> None:
        """Store the model together with this client's train and validation data."""
        self.model = model
        self.x_train, self.y_train = x_train, y_train
        self.x_val, self.y_val = x_val, y_val

    def get_parameters(self, config=None):
        """Return the model's current weights.

        ``config`` is optional so the method can be called both without
        arguments and with a configuration dictionary; it is not used.
        """
        return self.model.get_weights()

    def fit(self, parameters, config):
        """Load ``parameters``, train for one local epoch, and report the result.

        Returns the updated weights, the number of local training examples and
        an empty metrics dictionary.
        """
        self.model.set_weights(parameters)
        history = self.model.fit(self.x_train, self.y_train, epochs=1, verbose=1, batch_size=32)
        print(f"History during fit round: {history.history}")
        print(f"Len of self x_train {len(self.x_train)}")
        return self.model.get_weights(), len(self.x_train), {}

    def evaluate(self, parameters, config):
        """Load ``parameters`` and evaluate them on the local validation data.

        Returns the loss, the number of validation examples and a metrics
        dictionary holding the accuracy.
        """
        self.model.set_weights(parameters)
        loss, acc = self.model.evaluate(self.x_val, self.y_val, verbose=2)
        # Plain Python floats keep the reported metrics independent of the
        # numeric type returned by model.evaluate.
        return float(loss), len(self.x_val), {"accuracy": float(acc)}

In [None]:
def _partition_bounds(cid, n_items, num_clients):
    """Return the ``(start, stop)`` slice bounds of client ``cid``'s equal-sized shard."""
    size = n_items // num_clients
    index = int(cid)
    return index * size, (index + 1) * size


def client_fn(cid: str, use_nbratios: bool = True) -> fl.client.Client:
    """Create the Flower client that owns partition ``cid`` of the data.

    The train and test sets returned by ``prepare_data(X, y)`` are each cut into
    ``NUM_CLIENTS`` contiguous, equal-sized shards (any remainder is left
    unused); the client with id ``cid`` receives shard number ``int(cid)``.

    Parameters
    ----------
    cid : str
        Client id; must parse as an integer shard index.
    use_nbratios : bool, default True
        When True the model's frozen embedding holds NB log-count ratios
        computed from this client's own training shard. When False the model is
        built without ratios (plain logistic regression).

    Returns
    -------
    FlowerClient
        Client wrapping a freshly built ``NBLR`` model and the shard's data.
    """
    # NOTE(review): prepare_data re-vectorises the whole corpus on every call,
    # i.e. once per client creation; consider caching its result.
    dtm_train, _dtm_test, x_train, y_train, x_test, y_test, num_words, maxlen = prepare_data(X, y)

    lo, hi = _partition_bounds(cid, len(x_train), NUM_CLIENTS)
    x_train_cid = x_train[lo:hi]
    dtm_train_cid = dtm_train[lo:hi]
    y_train_cid = y_train[lo:hi]

    lo, hi = _partition_bounds(cid, len(x_test), NUM_CLIENTS)
    x_test_cid = x_test[lo:hi]
    y_test_cid = y_test[lo:hi]

    nbratios = None
    if use_nbratios:
        nbratios = np.log(pr(dtm_train_cid, y_train_cid, 1)
                          / pr(dtm_train_cid, y_train_cid, 0))
        nbratios = np.squeeze(np.asarray(nbratios))

    model = NBLR(num_words, maxlen, nbratios=nbratios)
    return FlowerClient(model, x_train_cid, y_train_cid, x_test_cid, y_test_cid)
    
    
    

In [None]:
from typing import List, Tuple, Optional, Dict
class SaveModelStrategy(fl.server.strategy.FedAvg):
    """FedAvg that also prints the example-weighted mean client accuracy each round.

    NOTE(review): despite its name, this class does not currently save any model
    weights; it only customises ``aggregate_evaluate``.
    """

    def aggregate_evaluate(
        self,
        rnd: int,
        results: List[Tuple[fl.server.client_proxy.ClientProxy, fl.common.EvaluateRes]],
        failures: List[BaseException],
    ) -> Optional[float]:
        """Print the weighted accuracy, then defer to FedAvg's aggregation.

        Parameters
        ----------
        rnd : int
            Current round number (only used in the printed message).
        results : list of (client proxy, evaluate result) pairs
            Each result must expose ``num_examples`` and ``metrics["accuracy"]``.
        failures : list of BaseException
            Passed through unchanged to the base class.

        Returns
        -------
        ``None`` when ``results`` is empty, otherwise whatever
        ``FedAvg.aggregate_evaluate`` returns.
        """
        if not results:
            return None

        # Weigh accuracy of each client by number of examples used
        total_examples = sum(r.num_examples for _, r in results)
        if total_examples > 0:
            weighted_accuracy = sum(
                r.metrics["accuracy"] * r.num_examples for _, r in results
            )
            accuracy_aggregated = weighted_accuracy / total_examples
            print(f"Round {rnd} accuracy aggregated from client results: {accuracy_aggregated}")
        # With zero reported examples there is nothing to average; skip the
        # message instead of dividing by zero.

        # Call aggregate_evaluate from base class (FedAvg)
        return super().aggregate_evaluate(rnd, results, failures)



**Here comes federated learning with logistic regression**

In [None]:
NUM_CLIENTS = 10

# Run 10 federated rounds over NUM_CLIENTS simulated clients with the default
# strategy settings.
# NOTE(review): by default client_fn builds its model from per-client NB
# log-count ratios, so this run uses NB ratios as well.
fl.simulation.start_simulation(
    num_rounds=10,
    num_clients=NUM_CLIENTS,
    client_fn=client_fn,
    strategy=SaveModelStrategy(),
)

**Here comes FL for NBLR**

In [None]:
NUM_CLIENTS = 10

# Run 10 federated rounds over NUM_CLIENTS simulated clients; the strategy is
# created with min_fit_clients and min_eval_clients both set to 3.
fl.simulation.start_simulation(
    num_rounds=10,
    num_clients=NUM_CLIENTS,
    client_fn=client_fn,
    strategy=SaveModelStrategy(min_eval_clients=3, min_fit_clients=3),
)

