## Import libraries

In [None]:
import numpy as np
from numpy import sqrt, argmax
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve
import matplotlib.pyplot as plt
# import tqdm

import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, Dataset

import syft as sy

# import opacus
# from opacus import PrivacyEngine

import warnings
from pprint import pprint
warnings.filterwarnings("ignore")

import pandas as pd
import re
from nltk.corpus import stopwords

# from syft.frameworks.torch.nn import GRU
from handcrafted_GRU import GRU
# from opacus.layers import DPGRU

from torch.utils.data import DataLoader, TensorDataset

# Pick CUDA when torch reports it as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device:{device}")

# Print the installed PyTorch version so a run can be reproduced later.
print(f"Torch Ver: {torch.__version__}")
# print(f"Opacus Ver: {opacus.__version__}")
# print(f"Syft Ver: {sy.__version__}")


In [None]:
# Indonesian stop words from NLTK, stored as a set; clean_text drops every word found in it.
STOPWORDS = set(stopwords.words('indonesian'))
# print(f"STOPWORDS:\n {STOPWORDS}")

def clean_text(text):
    """Normalise one raw message before tokenisation.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is deleted, and words contained in ``STOPWORDS`` are
    dropped. The remaining words are returned joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    kept_words = filter(lambda word: word not in STOPWORDS, letters_only.split())
    return ' '.join(kept_words)

def tokenize(text, word_to_idx):
    """Map each whitespace-separated word of *text* to its integer id.

    Words missing from ``word_to_idx`` are encoded as ``0``. The ids are
    returned as a list in the order the words appear.
    """
    return [word_to_idx.get(word, 0) for word in text.split()]

def pad_and_truncate(messages, max_length=30):
    """Pack token sequences into a fixed-width integer matrix.

    Returns an array of shape ``(len(messages), max_length)``. Sequences
    shorter than ``max_length`` are left-padded with zeros; longer ones
    keep only their first ``max_length`` tokens. Empty sequences yield an
    all-zero row.
    """
    features = np.zeros((len(messages), max_length), dtype=int)
    for row, sms in enumerate(messages):
        count = len(sms)
        if count == 0:
            continue
        # Right-align the tokens; over-long sequences start at column 0.
        start = max(max_length - count, 0)
        features[row, start:] = sms[:max_length]
    return features

In [None]:
if __name__ == '__main__':
    # Build the training arrays from the raw CSV and store them on disk.
    data = pd.read_csv('dataset_sms_spam_v1.csv', sep=',', names=['Teks', 'Label'], encoding= 'unicode_escape')
    # Shuffle the rows so the later tail split is not tied to the file order.
    data = data.sample(frac = 1)
    # Lowercase, remove unnecessary characters with a regex, remove stop words.
    data.Teks = data.Teks.apply(clean_text)
    words = set((' '.join(data.Teks)).split())
    # Enumerate the vocabulary in sorted order: iterating a set of strings
    # directly gives an order that can change between interpreter runs, which
    # would make the word -> id mapping impossible to rebuild for the saved
    # arrays. Id 0 stays reserved for padding / unknown words.
    word_to_idx = {word: i for i, word in enumerate(sorted(words), start=1)}
    tokens = data.Teks.apply(lambda x: tokenize(x, word_to_idx))
    inputs = pad_and_truncate(tokens)
    # NOTE(review): the comparison is against the string '1', so it only
    # matches if the Label column is read as strings - confirm against the CSV.
    labels = np.array((data.Label == '1').astype(int))

    np.save('labels.npy', labels)
    np.save('inputs.npy', inputs)

## Loading data

In [None]:
# Reload the arrays written by the preprocessing cell so the following cells
# can run without repeating the preprocessing.
original_inputs = np.load('inputs.npy')
original_labels = np.load('labels.npy')

## Training model with Federated learning

### Training and model hyperparameters

In [None]:
# Training params
# Size of the embedding table: largest token id in the saved training inputs,
# plus one because ids start at 0. This reads `original_inputs` (loaded from
# disk) rather than the global `inputs`, which only exists if the
# preprocessing cell was run and is rebound by later cells.
TRAIN_VOCAB_SIZE = int(original_inputs.max()) + 1
EPOCHS = 30
CLIP = 5 # gradient clipping - to avoid gradient explosion (frequent in RNNs)
lr = 0.1
BATCH_SIZE = 30

# Model params
EMBEDDING_DIM = 50
HIDDEN_DIM = 10
DROPOUT = 0.2

# Privacy Engine hyperparameters (only referenced by the commented-out Opacus code)
MAX_GRAD_NORM = 1.2
NOISE_MULTIPLIER = 1.3
EPSILON = 50.0
# Delta must be smaller than the inverse of the number of samples,
# e.g. 100 samples require a leak probability (delta) < 1/100.
DELTA = 1e-5

### Train and Test Split

In [None]:
# Tensor versions of the arrays loaded from disk.
inputs, labels = torch.tensor(original_inputs), torch.tensor(original_labels)

# Fraction of the samples reserved for testing (20%).
pct_test = 0.2

# Negative offset marking where the test tail starts.
pct_test_count = -int(len(labels)*pct_test)

# Everything before the offset is training data, everything from it on is test data.
train_inputs, train_labels = inputs[:pct_test_count], labels[:pct_test_count]
test_inputs, test_labels = inputs[pct_test_count:], labels[pct_test_count:]

# NumPy slices of the same test tail, used for local model evaluation.
original_test_inputs = original_inputs[pct_test_count:]
original_test_labels = original_labels[pct_test_count:]

SAMPLE_SIZE = len(labels)

# Report the slice boundaries and the sample counts.
print(f"Train Labels: [:{pct_test_count}]")
print(f"Train Inputs: [:{pct_test_count}]")
print(f"Test Labels: [{pct_test_count}:]")
print(f"Test Inputs: [{pct_test_count}:]")
print(f"Length Labels: {len(labels)}")
print(f"20% of Length Labels: {len(labels)*pct_test}")
print(SAMPLE_SIZE)


### VirtualWorkers

In [None]:
# Hook that extends the Pytorch library to enable all computations with pointers of tensors sent to other workers
hook = sy.TorchHook(torch)

# Creating 2 virtual workers Syft v0.2.0
anne = sy.VirtualWorker(hook, id="anne")
bob = sy.VirtualWorker(hook, id="bob")

workers = [anne, bob]

# this is done to have the local worker (you on your notebook!) have a registry
# of objects like every other workers, which is disabled by default but needed here
# sy.local_worker.is_client_worker = False


# threshold indexes for dataset split (one half for Bob, other half for Anne)
train_idx = int(len(train_labels)/2)
test_idx = int(len(test_labels)/2)


# Sending toy datasets to virtual workers
bob_train_dataset = sy.BaseDataset(train_inputs[:train_idx], train_labels[:train_idx]).send(bob)
anne_train_dataset = sy.BaseDataset(train_inputs[train_idx:], train_labels[train_idx:]).send(anne)
bob_test_dataset = sy.BaseDataset(test_inputs[:test_idx], test_labels[:test_idx]).send(bob)
anne_test_dataset = sy.BaseDataset(test_inputs[test_idx:], test_labels[test_idx:]).send(anne)


# # Creating federated datasets, an extension of Pytorch TensorDataset class
federated_train_dataset = sy.FederatedDataset([bob_train_dataset, anne_train_dataset])
federated_test_dataset = sy.FederatedDataset([bob_test_dataset, anne_test_dataset])

# Creating federated datasets, an extension of Pytorch TensorDataset class for TRAINING METHOD #1
bob_federated_train_dataset = sy.FederatedDataset([bob_train_dataset])
anne_federated_train_dataset = sy.FederatedDataset([anne_train_dataset])
bob_federated_test_dataset = sy.FederatedDataset([bob_test_dataset])
anne_federated_test_dataset = sy.FederatedDataset([anne_test_dataset])

# Creating federated dataloaders, an extension of Pytorch DataLoader class
federated_train_loader = sy.FederatedDataLoader(federated_train_dataset, shuffle=True, batch_size=BATCH_SIZE)
federated_test_loader = sy.FederatedDataLoader(federated_test_dataset, shuffle=True, batch_size=BATCH_SIZE)

# Original Torch DataLoader to test Local Model
# class CustomTextDataset(Dataset):
#         def __init__(self, text, labels):
#                 self.labels = labels
#                 self.text = text

#         def __len__(self):
#                 return len(self.labels)
                
#         def __getitem__(self, idx):
#                 label = self.labels[idx]
#                 text = self.text[idx]
#                 sample = {"Text": text, "Class": label}
#                 return sample


# Pair every held-out input row with its label so a plain torch DataLoader can batch them.
merged_test_dataset = list(zip(original_test_inputs, original_test_labels))
# print(f"Input:{original_test_inputs[1]}\t Label:{original_test_labels[1]}")
# print(merged_test_dataset[0])

def collate_batch(batch):
    """Split a batch of 2-tuples into two parallel lists.

    Returns ``(firsts, seconds)``: the first element of every pair, then
    the second element of every pair, both in batch order. For
    ``merged_test_dataset`` the pairs are ``(input_row, label)``, so the
    result is ``(inputs, labels)``.
    """
    pairs = [(first, second) for first, second in batch]
    firsts = [pair[0] for pair in pairs]
    seconds = [pair[1] for pair in pairs]
    return firsts, seconds

# Create Pandas DF
# print(original_test_inputs[1])
# text_labels_df = pd.DataFrame({'Text': original_test_inputs, 'Labels': original_test_labels})
# Define dataset object
# TD = CustomTextDataset(text_labels_df['Text'], text_labels_df['Labels'])

# Local (non-federated) loader over the held-out pairs; collate_batch returns each batch as two Python lists.
original_test_dataloader = DataLoader(merged_test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# print('\nFirst iteration of data set: ', next(iter(TD)), '\n')



### Import GRU Model

In [None]:
# Initiating the model
# torch.set_default_tensor_type('torch.cuda.FloatTensor')
# model = GRU(vocab_size=TRAIN_VOCAB_SIZE, hidden_dim=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, dropout=DROPOUT)
# torch.set_default_tensor_type('torch.FloatTensor')

def make_model():
    """Return a new GRU model built from the notebook's hyperparameters."""
    return GRU(
        vocab_size=TRAIN_VOCAB_SIZE,
        hidden_dim=HIDDEN_DIM,
        embedding_dim=EMBEDDING_DIM,
        dropout=DROPOUT,
    )
    
# Model kept locally; method #1 broadcasts its weights to the workers and writes the aggregate back into it.
local_model = make_model()

# Per-worker containers, filled by the worker setup loop in the same order as `workers`.
models, train_dataloaders, test_dataloaders, optimizers, privacy_engines = [], [], [], [], []

### Attaching model, dataloaders, optimizers, and privacy engine to each worker

In [None]:
# Give every worker its own model copy, optimizer and single-worker dataloaders.
for worker in workers:
    model = make_model()
    # The optimizer is created from the parameters before the model is sent to the worker.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    model.send(worker)
    # Pick the federated datasets that hold this worker's share of the data.
    if(worker == anne):
        train_dataset = anne_federated_train_dataset
        test_dataset = anne_federated_test_dataset
    elif(worker == bob):
        train_dataset = bob_federated_train_dataset
        test_dataset = bob_federated_test_dataset


    train_dataloader = sy.FederatedDataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = sy.FederatedDataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    # privacy_engine = PrivacyEngine(model, 
    #                             batch_size=BATCH_SIZE, 
    #                             sample_size=SAMPLE_SIZE, 
    #                             alphas=range(2,32), 
    #                             noise_multiplier=NOISE_MULTIPLIER, 
    #                             max_grad_norm=MAX_GRAD_NORM)

    # Disable Privacy Engine
    # privacy_engine.attach(optimizer)

    # Append in worker order so index i refers to the same worker in every list.
    models.append(model)
    train_dataloaders.append(train_dataloader)
    test_dataloaders.append(test_dataloader)
    optimizers.append(optimizer)
    # privacy_engines.append(privacy_engine)

### Functions to aggregate remote models and to send new updates

In [None]:
def send_new_models(local_model, models):
    """Overwrite every remote model's parameters with those of ``local_model``.

    For each model in ``models``, parameters are paired positionally with
    the local ones; each local parameter is sent to the ``location`` of
    its remote counterpart and the remote parameter is set to that value.
    Runs under ``torch.no_grad()``.
    """
    with torch.no_grad():
        for worker_model in models:
            param_pairs = zip(local_model.parameters(), worker_model.parameters())
            for source_param, target_param in param_pairs:
                destination = target_param.location
                target_param.set_(source_param.send(destination))

def federated_aggregation(local_model, models):
    """Set each ``local_model`` parameter to the mean of the remote ones.

    Parameters are matched positionally across ``local_model`` and every
    model in ``models``. For each position, copies of the remote
    parameters are fetched, summed and divided by the number of models,
    and the result is written into the local parameter. Runs under
    ``torch.no_grad()``.
    """
    with torch.no_grad():
        param_streams = [local_model.parameters()]
        param_streams.extend(remote.parameters() for remote in models)
        for grouped in zip(*param_streams):
            target, replicas = grouped[0], grouped[1:]
            total = torch.zeros(*replicas[0].shape)
            for replica in replicas:
                # Fetch a copy so the worker keeps its own parameter.
                total += replica.copy().get()
            total /= len(replicas)
            target.set_(total)
        

### Training Method #1

In [None]:
def train(epoch):
    """Run one federated training round (training method #1).

    Pushes the current ``local_model`` weights to every per-worker model,
    trains each of those models for one pass over its own dataloader,
    prints the mean training loss per worker, and finally averages the
    worker models back into ``local_model``.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    # 1. Send new version of the model
    send_new_models(local_model, models)

    # 2. Train remotely the models
    for i, worker in enumerate(workers):
        # The per-worker lists are index-aligned with `workers`.
        train_dataloader = train_dataloaders[i]
        model = models[i]
        optimizer = optimizers[i]
        
        model.train()
        criterion = nn.BCELoss() # for two class classification
        losses = []   
    
        # NOTE: this loop rebinds `i`; the worker index above has already been used.
        for i, (data, target) in enumerate(train_dataloader):            
            data = data.to(torch.long)
            # h = torch.Tensor(np.zeros((BATCH_SIZE, HIDDEN_DIM))).send(worker)
            # Zero initial hidden state, created locally and sent to the worker.
            # NOTE(review): it always has BATCH_SIZE rows, even if the last batch is
            # smaller - presumably the GRU copes with that; confirm in handcrafted_GRU.
            h = torch.Tensor(torch.zeros(BATCH_SIZE, HIDDEN_DIM)).send(worker)  
            
            # Call zero grad to clear previous gradients before every training pass.
            optimizer.zero_grad()

            # print(f"Data:{data}\nTarget: {target}\n")

            # print(f"Worker: {worker}\nWorker Objects: {worker.object_store._objects}")

            output, _ = model(data.to(torch.long), h)
            loss = criterion(output.squeeze(), target.float())
            loss.backward()

            # # Clipping the gradient to avoid explosion
            # nn.utils.clip_grad_norm_(model.parameters(), CLIP)

            # Fetch the loss value from the worker for local reporting.
            losses.append(loss.get()) 
            optimizer.step()

        # NOTE(review): presumably drops the objects registered on the local worker
        # during this pass - confirm against the PySyft documentation.
        sy.local_worker.clear_objects()
        

        train_loss = sum(losses) / len(losses)
        print(
            f"[{worker.id}]\t"
            f"Train Epoch: {epoch} \t"
            f"Train Loss: {train_loss:.4f} ")

    # 3. Federated aggregation of the updated models
    federated_aggregation(local_model, models)


def eval(epoch):
    """Evaluate ``local_model`` on the local held-out dataloader.

    Runs the model over ``original_test_dataloader`` without gradients,
    collects the predictions, labels and per-batch BCE losses, then
    prints the ROC AUC and the mean loss.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    # 4. Evaluate the model
    local_model.eval()

    bce = nn.BCELoss()
    predicted_probs, true_labels, batch_losses = [], [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in original_test_dataloader:
            hidden = torch.Tensor(np.zeros((BATCH_SIZE, HIDDEN_DIM)))
            output, _ = local_model(torch.LongTensor(batch_inputs), hidden)
            flat_output = output.squeeze()
            label_tensor = torch.LongTensor(batch_labels)

            batch_losses.append(bce(flat_output, label_tensor.float()))
            predicted_probs.extend(flat_output.numpy())
            true_labels.extend(label_tensor.numpy().astype(int))

    score = roc_auc_score(true_labels, predicted_probs)
    eval_loss = sum(batch_losses) / len(batch_losses)

    print(
        f"Eval Epoch: {epoch} \t"
        f"AUC: {score:.3%} \t"
        f"Eval Loss: {eval_loss:.4f} ")



In [None]:
# Training method #1: one federated round followed by a local evaluation, repeated EPOCHS times.
for epoch in range(EPOCHS):
    train(epoch)
    eval(epoch)

### Training Method #2

For now, PySyft does not support optimizers with momentum. Therefore, we are going to stick with the classical [Stochastic Gradient Descent](https://pytorch.org/docs/stable/optim.html#torch.optim.SGD) optimizer.

As our task consists of a binary classification, we are going to use the [Binary Cross Entropy Loss](https://pytorch.org/docs/stable/nn.html#torch.nn.BCELoss).

In [None]:
# Defining loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

### Privacy Engine using Opacus

For each epoch we are going to compute the training and validation losses, as well as the [Area Under the ROC Curve](https://scikit-learn.org/stable/modules/model_evaluation.html#roc-metrics) score, because the target dataset is unbalanced (only 13% of labels are positive).

In [None]:
# Mean loss per epoch, kept for the loss plot.
train_losses = []
test_losses = []

# For Early Stopping: stop once the validation loss has risen `patience` epochs in a row.
last_loss = 100
patience = 3
trigger_times = 0

for e in range(EPOCHS):
    
    ######### Training ##########

    losses = []
    # Batch loop
    for inputs, labels in federated_train_loader:
        # Location of current batch
        worker = inputs.location
        # Initialize hidden state and send it to worker
        h = torch.Tensor(np.zeros((BATCH_SIZE, HIDDEN_DIM))).send(worker)
        # Send model to current worker
        model.send(worker)
        # Setting accumulated gradients to zero before backward step
        optimizer.zero_grad()
        # Output from the model
        print(f"Inputs: {inputs}\nInput size: {inputs.shape}\nh: {h}")
        output, _ = model(inputs.to(torch.long), h)
        print(f"Output:{output}")
        # Calculate the loss and perform backprop
        print(f"Output Shape: {output.shape} Labels Shape: {labels.shape}")
        # The labels are stored as integers but BCELoss needs float targets, so
        # cast them - the same call training method #1 uses.
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # Clipping the gradient to avoid explosion
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        # Optimizer step
        optimizer.step() 
        # Get the model back to the local worker
        model.get()
        losses.append(loss.get())
    
    # epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(DELTA)
    
    ######## Evaluation ##########
    
    # Model in evaluation mode
    model.eval()

    with torch.no_grad():
        test_preds = []
        test_labels_list = []
        eval_losses = []

        for inputs, labels in federated_test_loader:
            # get current location
            worker = inputs.location
            # Initialize hidden state and send it to worker
            h = torch.Tensor(np.zeros((BATCH_SIZE, HIDDEN_DIM))).send(worker)    
            # Send model to worker
            model.send(worker)
            output, _ = model(inputs.to(torch.long), h)
            # Squeeze the output so the loss sees the same shapes as in training;
            # the unsqueezed output does not line up with the 1-D label vector.
            loss = criterion(output.squeeze(), labels.float())
            eval_losses.append(loss.get())
            preds = output.squeeze().get()
            test_preds += list(preds.numpy())
            test_labels_list += list(labels.get().numpy().astype(int))
            # Get the model back to the local worker
            model.get()

    # Check test preds
    score = roc_auc_score(test_labels_list, test_preds)

    train_loss = sum(losses)/len(losses)
    eval_loss = sum(eval_losses)/len(eval_losses)
    
    train_losses.append(train_loss.item())
    test_losses.append(eval_loss.item())
    
    print("Epoch {}/{}...  \
    AUC: {:.3%}...  \
    Training loss: {:.5f}...  \
    Validation loss: {:.5f}".format(e+1, EPOCHS, score, train_loss, eval_loss))
    
    # Early Stopping: count consecutive epochs whose validation loss is worse than the previous epoch's
    if eval_loss > last_loss:
        trigger_times += 1
        print(f"Trigger Times: {trigger_times}")
        
        if trigger_times >= patience:
            print("EARLY STOPPING! START ING TEST PROCESS...")
            break
    else:
        print(f"Trigger Times: 0")
        trigger_times = 0
    
    last_loss = eval_loss
    
    # Back to training mode for the next epoch
    model.train()

### Plot Losses

In [None]:
# Plot the per-epoch mean training loss (red) and test loss (blue) recorded during training.
print(f"Train Losses: {train_losses}")
plt.plot(train_losses, 'r')
plt.plot(test_losses, 'b')
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
# NOTE: the y-axis is labelled 'Train Loss' although both curves are drawn.
plt.ylabel('Train Loss')
plt.show()

### Saving model for inference

In [None]:
# Save
PATH = "state_dict_model.pt"
torch.save(model.state_dict(), PATH)

### Ask for input and pre-process text

In [None]:
# text = "Selamat Anda Mendapatkan 50 JUTA!"
# ctext = clean_text(text)
# print(ctext)
# words = ctext.split()
# print(words)
# words = set((' '.join(words)).split())
# token_holder = []
# tokens = tokenize(text, word_to_idx)
# print(f"Tokens:{tokens}")
# # print(word_to_idx)
# token_holder.append(tokens)
# inference_input = pad_and_truncate(token_holder)
# print(f"\nPadded Inputs: {inference_input}")

In [None]:
# Pre-process the manually collected hold-out set for the trained model.
data = pd.read_csv('manually_collected_sms_600.csv', sep=',', names=['Teks', 'Label'])
data = data.sample(frac = 1)
# Lowercase, remove unnecessary characters with a regex, remove stop words
data.Teks = data.Teks.apply(clean_text)
# Encode with the vocabulary built from the TRAINING data (`word_to_idx` from the
# preprocessing cell). Building a new vocabulary from this file would assign
# unrelated ids to the words, so the embedding rows learned during training
# would be looked up for the wrong words. tokenize() maps unseen words to 0.
# If `word_to_idx` is missing (e.g. after a kernel restart) this raises a
# NameError instead of silently using a mismatched mapping.
tokens = data.Teks.apply(lambda x: tokenize(x, word_to_idx))
inputs = pad_and_truncate(tokens)
# TRAIN_VOCAB_SIZE comes from the ids present in the padded training inputs, so a
# training word that only occurred beyond the truncation point can have a larger
# id. Treat such ids as unknown (0) so the embedding lookup stays in range.
inputs[inputs >= TRAIN_VOCAB_SIZE] = 0
# NOTE(review): the comparison is against the string '1', so it only matches
# if the Label column is read as strings - confirm against the CSV.
labels = np.array((data.Label == '1').astype(int))

np.save('test_labels.npy', labels)
np.save('test_inputs.npy', inputs)

test_inputs = torch.tensor(np.load('test_inputs.npy'))
test_labels = torch.tensor(np.load('test_labels.npy'))


In [None]:
# Testing params
VOCAB_SIZE = int(test_inputs.max()) + 1
TEST_VOCAB_SIZE = TRAIN_VOCAB_SIZE
lr = 0.001
BATCH_SIZE = 30

# Model params
EMBEDDING_DIM = 50
HIDDEN_DIM = 10
DROPOUT = 0.2

### Load Model

In [None]:
# Rebuild the GRU with the training-time hyperparameters and load the saved weights into it.
PATH = "state_dict_model.pt"
model = GRU(vocab_size=TEST_VOCAB_SIZE, hidden_dim=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, dropout=DROPOUT)
model.load_state_dict(torch.load(PATH))
# Switch to evaluation mode before inference.
model.eval()

In [None]:
# Evaluate the reloaded model on the hold-out set: ROC AUC, the threshold that
# maximises the G-mean, and the accuracy at that threshold.
from sklearn.metrics import f1_score
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

test_dataset = TensorDataset(test_inputs, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_losses = []

# Number of individual predictions printed as a sanity check. The previous
# bound was the hard-coded `len(test_preds) - 1140`, which printed nothing for
# any test set of 1140 rows or fewer.
PREVIEW_COUNT = 3

with torch.no_grad():
    test_preds = []
    test_labels_list = []
    eval_losses = []

    for inputs, labels in test_loader:
        h = torch.Tensor(np.zeros((BATCH_SIZE, HIDDEN_DIM)))

        # Cast the token ids to long explicitly, as every other forward pass in
        # this notebook does, instead of relying on the dtype of the saved array.
        output, _ = model(inputs.to(torch.long), h)
        loss = criterion(output.squeeze(), labels.float())
        eval_losses.append(loss)
        preds = output.squeeze()
        test_preds += list(preds.numpy())
        test_labels_list += list(labels.numpy().astype(int))


roc_acc_score = roc_auc_score(test_labels_list, test_preds)

# Calculate ROC Curve
fpr, tpr, thresholds = roc_curve(test_labels_list, test_preds)
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# Index of largest G-means
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
threshold = thresholds[ix]

# Print how many samples were tested
print(f"Amount of test data: {len(test_labels_list)}")


# # Plot ROC Curve
# plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
# plt.plot(fpr, tpr, marker='.', label='Logistic')
# # axis labels
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.legend()
# # show the plot
# plt.show()

    
print(f"ROC Accuracy Score: {roc_acc_score}")

# Turn the probabilities into 0/1 labels using the selected threshold.
# The list is converted to an array first so the comparison is explicitly element-wise.
test_preds_thresholded = np.where(np.asarray(test_preds) > threshold, 1, 0)
for i in range(min(PREVIEW_COUNT, len(test_preds))):
    print("Test Preds Prob: {}    \
    Test Preds Label: {}  \
    True Label: {}  \
    ".format(test_preds[i], test_preds_thresholded[i], test_labels_list[i]))

acc_score = accuracy_score(test_labels_list, test_preds_thresholded)
print(f"\nAccuracy Score: {acc_score}")

# f1_score = f1_score(test_labels_list, test_preds_thresholded)
# print(f"F1 Score: {f1_score}")
