# Does Ensemble Adversarial Training Improve the Robustness of NLP Models?
An empirical analysis leveraging multiple neural architectures and the IMDB Review dataset.

**Author:** [Melih Catal ](https://github.com/melihcatal)

# Setup

## Packages & Data to reproduce the research

If you would like to reproduce the results please run the following cell to get the parameters

In [1]:
# in case of ssl error due to OpenAttack run the following cell
import urllib.request
from urllib.request import urlopen
import ssl
import json
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
!unzip reproduce_data.zip # unzip the data if you have

unzip:  cannot find or open reproduce_data.zip, reproduce_data.zip.zip or reproduce_data.zip.ZIP.


In [3]:
!mkdir reproduce_data # to save your work

In [None]:
# requirements: torchtext, tqdm OpenAttack
!pip install torchtext tqdm OpenAttack

## Libraries

In [5]:
# Standard libraries
import re
import os
import json
import glob
import requests
import tarfile
import numpy as np
import copy
import pandas as pd
from tqdm.auto import trange, tqdm
from functools import partial
from sklearn.model_selection import train_test_split
from collections import Counter

# PyTorch and utilities
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torchtext.data.utils import ngrams_iterator, get_tokenizer
from torchtext.vocab import GloVe, vocab as torch_vocab
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader

# Other libraries
import OpenAttack
import torchtext
import datasets
from datasets import load_from_disk, Dataset as HuggingFaceDataset


## Util Functions

### Checkpoint & Configuration Management

In [6]:
def save_json(data, path):
    with open(path, "w") as f:
        json.dump(data, f)

def load_json(path):
    with open(path, "r") as f:
        loaded_data = json.load(f)
    return loaded_data

### Data Retrieval & Preprocessing

In [7]:
def download_extract_data(url:str, file_name:str, extract_path:str):
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 8192
    print("Downloading...")
    with open(file_name, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=block_size), total=total_size//block_size, unit='KB', unit_scale=True):
            file.write(chunk)
    print("Extracting...")
    with tarfile.open(file_name, "r:gz") as file:
        file.extractall(path=extract_path)
    print("Done!")



In [8]:
# get text and labels
def read_imdb_split(split_dir):
    split_dir = os.path.join("aclImdb", split_dir)
    print(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        dir_name = os.path.join(split_dir, label_dir)
        print(dir_name)
        for filename in glob.glob(os.path.join(dir_name, "*.txt")):
            with open(filename, "r") as file:
                texts.append(file.read())
            labels.append(0 if label_dir == "neg" else 1)

    return texts, labels

In [9]:
# clean data
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text

In [10]:
# utility function build vocab out of given set of texts
def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    # Add special tokens to the counter with a decreasing count to ensure their order - Unknown is always at index 0
    special_tokens = ["<UNK>", "<PAD>", "<SOS>", "<EOS>", "<SEP>", "<CLS>", "<MASK>"]
    for idx, token in enumerate(special_tokens):
        counter[token] = float('inf') - idx

    vocab = torch_vocab(counter, min_freq=min_freq)

    return vocab



In [11]:
def create_embedding_matrix(vocab, embeddings, embed_size) :
    embedding_matrix = torch.FloatTensor(len(vocab), embed_size).zero_()

    for i, word in enumerate(vocab):
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            assert len(embedding_vector) == embed_size, f"Embedding vector for {word} has size {len(embedding_vector)}, expected {embed_size}"
            embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = torch.rand(embed_size)  # Assign a random vector

    return embedding_matrix


### Data Processing

In [12]:
# to split dataset
def split_dataset(dataset, split_percentage=0.5):
    if not isinstance(dataset, IMDBDataset ):
      raise("Unsupported data type. You should have IMDBDataset or pandas Data Frame")

    total_size = len(dataset)
    train_size = int(split_percentage * total_size)
    val_size = total_size - train_size

    indices = list(range(total_size))
    np.random.shuffle(indices)

    train_indices = indices[:train_size]
    val_indices = indices[train_size:]

    train_data = dataset.data.iloc[train_indices].reset_index(drop=True)
    val_data = dataset.data.iloc[val_indices].reset_index(drop=True)

    train_dataset = IMDBDataset(train_data, dataset.get_vocab(), dataset.max_len)
    val_dataset = IMDBDataset(val_data, dataset.get_vocab(), dataset.max_len)

    return train_dataset, val_dataset


In [13]:
# to split data into n models so each model will have their own portion
def split_into_n(dataset, n_splits=3):
    total_size = len(dataset)
    split_size = total_size // n_splits

    indices = list(range(total_size))
    np.random.shuffle(indices)

    datasets = []
    for i in range(n_splits):
        start_idx = i * split_size
        end_idx = (i + 1) * split_size if i != n_splits - 1 else total_size
        subset_indices = indices[start_idx:end_idx]
        subset_data = dataset.data.iloc[subset_indices].reset_index(drop=True)
        subset_dataset = IMDBDataset(subset_data, dataset.vocab, dataset.max_len)
        datasets.append(subset_dataset)

    return datasets

In [14]:
# to combine two different datasets. Returns pandas dataframe
def combine_datasets(dataset1, dataset2,test_texts):
    # Extract texts and labels from the two datasets
    texts1 = [text for text in dataset1.data["text"]]
    labels1 = dataset1.labels.tolist()

    texts2 = [text for text in dataset2.data["text"]]
    labels2 = dataset2.labels.tolist()

    # Combine texts and labels
    combined_texts = texts1 + texts2
    combined_labels = labels1 + labels2

    # Create a new dataframe with the combined data
    combined_data = {
        'text': combined_texts,
        'label': combined_labels
    }

    #new_vocab = build_vocab(combined_texts + test_texts)

    combined_data_df = pd.DataFrame(combined_data)

    # Shuffle the combined dataframe
    combined_data_df = combined_data_df.sample(frac=1).reset_index(drop=True)
    return combined_data_df

    # Create a new IMDBDataset instance with the combined data
    #combined_dataset = IMDBDataset(combined_data_df, new_vocab, 100)

    #return combined_data_df, new_vocab




### Training & Testing

In [15]:
# train
def train_one_epoch(model, iterator, criterion=None, optimizer=None):
    # by default optimizer is adam
    optimizer = optimizer if optimizer is not None else optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

    # default cross entropy loss
    criterion = criterion if criterion is not None else nn.CrossEntropyLoss()

    model.train()
    epoch_loss = 0
    epoch_acc = 0

    # Use tqdm to display progress
    for texts, labels in tqdm(iterator, desc="Training", total=len(iterator), unit="batch"):
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        acc = (outputs.argmax(dim=-1) == labels).float().mean()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def train(model, iterator, criterion = None, optimizer= None, epochs=5):
    model = model.to(device)
    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer=optimizer, criterion=criterion)
        print(f"Epoch: {epoch+1} - Loss: {train_loss:.4f}, Accuracy: {train_acc*100:.2f}%")



def test(model, iterator, verbose=False):
    """
    Tests the model and computes the accuracy.
    """
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    model = model.to(device)

    with torch.no_grad():
        for texts, labels in tqdm(iterator, desc="Testing", total=len(iterator)):
            texts = texts.to(device)
            labels = labels.to(device)

            outputs = model(texts)
            predicted_labels = outputs.argmax(dim=-1)

            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

            if verbose:
                print("For input texts, prediction is ", predicted_labels, " correct was ", labels)

    accuracy = correct_predictions / total_predictions
    return accuracy


### Attacks

In [16]:
# Prepares a dataset to the format OpenAttack needs. OpenAttack expects x and y columns
def prepare_attack_data(test_dataset, num_samples=-1):
    # get a copy of test data
    df = test_dataset.data.copy()

    # If not all samples and num_samples is less than the length of the dataframe
    if num_samples != -1 and num_samples < len(df):
        df = df.head(num_samples)

    # prepare for the attack
    df.rename(columns={'text': 'x',
                       'label': 'y'}, inplace=True)
    df.reset_index(inplace=True, drop=True)

    attack_data_dict = df.to_dict(orient='records')
    attack_data = HuggingFaceDataset.from_dict({col: df[col].tolist() for col in df.columns})

    return attack_data


In [17]:
# to run the attack
def attack(model, attack_data):
    model = model.to(device)
    victim = model

    # Initialize attacker
    attacker = OpenAttack.attackers.TextFoolerAttacker()

    # Create an attack evaluation instance
    attack_eval = OpenAttack.AttackEval(attacker, victim)

    # Store original and adversarial samples, along with metrics
    samples = {
        "x_orig": [],  # Original x
        "y_orig": [],  # Original y
        "x_adv": [],   # Adversarial x
        "y_adv": [],   # Adversarial y
        "metrics": []  # Attack metrics
    }

    total_attacks = 0
    successful_attacks = 0

    # Execute the attack
    for result in tqdm(attack_eval.ieval(attack_data), total=len(attack_data)):
        total_attacks += 1
        samples["x_orig"].append(result["data"]["x"])
        samples["y_orig"].append(result["data"]["y"])
        samples["metrics"].append(result["metrics"])

        if result["success"]:
            successful_attacks += 1
            samples["x_adv"].append(result["result"])
            samples["y_adv"].append(1 - result["data"]["y"])  # Flip the label
        else:
            samples["x_adv"].append(None)
            samples["y_adv"].append(None)

    # Calculate success rate
    attack_success_rate = successful_attacks / total_attacks
    print(f"Attack success rate: {attack_success_rate * 100:.2f}%")

    model = model.to("cpu")
    torch.cuda.empty_cache()

    # Return the samples as a dataset
    return datasets.Dataset.from_dict(samples)


## Global Variables

In [18]:
# default tokenizer
tokenizer = get_tokenizer("basic_english")

# use cuda if available - change this if you have other preferences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# the directory to save your work
save_dir = "./reproduce_data"

## Dataset

We are using IMDB review dataset for sentiment analysis classification task

In [19]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, data, vocab, max_len=100):
        """
        Initializes the IMDBDataset.

        Args:
        - data (dict): A dictionary containing "text" and "label" keys.
        - vocab (Vocab object): Vocabulary object for indexing.
        - max_len (int, optional): Maximum length for tokenized texts. Defaults to 100.
        """
        super().__init__()
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

        # Get stoi dictionary
        stoi = self.vocab.get_stoi()

        # Preprocess texts
        tokenized_texts = [text.split()[:max_len] for text in data["text"]]
        self.indexed_texts = [[stoi.get(token, stoi["<UNK>"]) for token in text] for text in tokenized_texts]

        # Preprocess labels
        self.labels = torch.tensor(data["label"].tolist())

    def __len__(self):
        """Returns the length of the dataset."""
        return len(self.labels)

    def __getitem__(self, index):
        """
        Fetches the indexed item from the dataset.

        Args:
        - index (int): Index of the item to fetch.

        Returns:
        - tuple: A tuple containing the text tensor and its corresponding label.
        """
        text_tensor = torch.tensor(self.indexed_texts[index])
        return text_tensor, self.labels[index]

    def get_vocab(self):
        """Returns the vocabulary object used in the dataset."""
        return copy.deepcopy(self.vocab)

    @staticmethod
    def collate_fn(pad_index, batch):
        """
        Custom collate function to pad sequences in a batch.
        """
        texts, labels = zip(*batch)
        texts = pad_sequence(texts, batch_first=True, padding_value=pad_index)
        return texts, torch.stack(labels)



## Models
There are 3 different models:
1. CNN
2. LSTM
3. RNN

All models are subclass of `BaseModel` which is a subclass of `OpenAttack.Classifier` and `nn.Module`

`OpenAttack.Classifier` expects `get_pred` and `get_prob` methods to be implemented by the subclasses. To being able to be compatiable with them we have a dedicated `pre_process` method that converts the data into the structure `OpenAttack.Classifier` works

### BaseModel

In [20]:
class BaseModel(OpenAttack.Classifier,nn.Module):
    """
    Base Model for the models that have been used in the module
    """
    def __init__(self):
        super(BaseModel, self).__init__()

    def pre_process(self, data):
        max_len = 100
        tokenized_texts = [text.split()[:max_len] for text in data]
        indexed_texts = [[self.stoi.get(token, 0) for token in text] for text in tokenized_texts]
        torch_data = [torch.tensor(indexed_text) for indexed_text in indexed_texts]
        return torch.stack(torch_data).to(device)

    def get_pred(self, input_):
        data = self.pre_process(input_)
        logits = self(data)
        return logits.argmax(dim=1).cpu().detach().numpy()

    def get_prob(self, input_):
        data = self.pre_process(input_)
        logits = self(data)
        return F.softmax(logits, dim=1).cpu().detach().numpy()


### CNN Model

In [21]:
# Basic CNN Model
class CNNModel(BaseModel):
    def __init__(self, stoi, vocab_size, embed_size, num_classes, kernel_sizes, num_channels, padding_idx=None):
        super(CNNModel, self).__init__()
        self.stoi = stoi
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.conv = nn.ModuleList([
            nn.Conv1d(in_channels=embed_size, out_channels=c, kernel_size=k)
            for c, k in zip(num_channels, kernel_sizes)
        ])
        self.fc = nn.Linear(sum(num_channels), num_classes)
        self.stoi = stoi

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        out = [F.relu(conv(x)).max(dim=-1)[0] for conv in self.conv]
        out = torch.cat(out, dim=-1)
        return self.fc(out)


### RNN Model

In [22]:
class RNNModel(BaseModel):
    def __init__(self, stoi, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, pretrained_embeddings=None):
        super(RNNModel, self).__init__()
        self.stoi = stoi
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            self.embedding.weight.requires_grad = False
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.rnn(x)
        output = self.dropout(output[:, -1, :])
        return self.fc(output)


### LSTM Model

In [23]:
class LSTMModel(BaseModel):
    def __init__(self, stoi, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, pretrained_embeddings=None):
        super(LSTMModel, self).__init__()
        self.stoi = stoi
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.dropout(output[:, -1, :])
        return self.fc(output)


# Implementation

### Data Retrieval

In [24]:
# download the data
data_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_extract_data(data_url, "aclImdb_v1.tar.gz", ".")



Downloading...


  0%|          | 0.00/10.3k [00:00<?, ?KB/s]

Extracting...
Done!


### Train & Test Data Generation

In [25]:
# load the data based on it's portion
train_texts, train_labels = read_imdb_split("train")
test_texts, test_labels = read_imdb_split("test")

# combine all the data
all_texts = train_texts + test_texts
all_labels = train_labels + test_labels

aclImdb/train
aclImdb/train/pos
aclImdb/train/neg
aclImdb/test
aclImdb/test/pos
aclImdb/test/neg


In [50]:
# build vocab for imdb dataset
all_texts = [" ".join(tokenizer(clean_text(text))) for text in all_texts]

imdb_ds_vocab = build_vocab(all_texts)

# split the data into train and test 80% - 20%
all_data = pd.DataFrame({"text": all_texts, "label": all_labels})
# For the scope of current project we are reducing the total data size from 50000 to 5000
subset_data = all_data.sample(frac=0.2)
train_data, test_data = train_test_split(subset_data, test_size=0.2, random_state=42)


In [44]:
len(all_data)

50000

In [51]:
len(subset_data)

10000

In [52]:
# build dataset
train_dataset = IMDBDataset(train_data, imdb_ds_vocab)
test_dataset = IMDBDataset(test_data, imdb_ds_vocab)


## Clean Accuracy

As the first step of our research we are training our target model, which is the `LSTM` model on clean data to get a baseline performance accuracy score.

In addition to our target model, we also train our models, namely `RNN` and `CNN`models to being able to use them during ensemble adversarial training later

In [53]:
# set hyperparams
vocab_size = len(imdb_ds_vocab)
embedding_dim = 100
hidden_dim = 256
num_layers = 2
num_classes = 2
kernel_sizes = [3, 4, 5]
num_channels = [100, 100, 100]

# initialize embedding. This is optional if you would like to use pretrained glove embeddings
# glove_embeddings = GloVe(name="6B", dim=embedding_dim)

# vocab_items = imdb_ds_vocab.get_itos()
# embedding_matrix = create_embedding_matrix(vocab_items, glove_embeddings, embed_size=embedding_dim)
# pretrained_embeddings = torch.tensor(embedding_matrix)

# get iterators

# Get the padding index
pad_index = imdb_ds_vocab.get_stoi()["<PAD>"]
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=partial(IMDBDataset.collate_fn, pad_index))
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=partial(IMDBDataset.collate_fn, pad_index))


##### Models

In [54]:
# Define models
lstm_model_base = LSTMModel(stoi = imdb_ds_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

cnn_model_base = CNNModel(stoi = imdb_ds_vocab.get_stoi(), vocab_size=vocab_size, embed_size = embedding_dim, num_classes = num_classes, kernel_sizes= kernel_sizes, num_channels=num_channels)

rnn_model_base = RNNModel(stoi = imdb_ds_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

### Load Existing

You can easily load model weights that we have trained before and see them in the action without running all the pipeline!

If you would like to do this please run this cell instead of Manual Run

In [None]:
lstm_model_base.load_state_dict(torch.load(f'{save_dir}/lstm_model_base.pth'))
cnn_model_base.load_state_dict(torch.load(f'{save_dir}/cnn_model_base.pth'))
rnn_model_base.load_state_dict(torch.load(f'{save_dir}/rnn_model_base.pth'))

In [None]:
print(test(model=lstm_model_base, iterator=test_loader))
print(test(model=cnn_model_base, iterator=test_loader))
print(test(model=rnn_model_base, iterator=test_loader))


### Manual Run

You can run the following cells if you manually want to continue

In [55]:
train(model= lstm_model_base, iterator= train_loader, epochs= 10)
# check clean model accuracy on clean data
test(model=lstm_model_base, iterator=test_loader)

Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.6937, Accuracy: 50.41%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6917, Accuracy: 52.12%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6898, Accuracy: 54.83%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6827, Accuracy: 56.62%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.6516, Accuracy: 61.68%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.6319, Accuracy: 64.76%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.6098, Accuracy: 67.12%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.5896, Accuracy: 69.14%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.5748, Accuracy: 70.76%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.5556, Accuracy: 72.10%


Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.6945

In [56]:
train(model= cnn_model_base, iterator= train_loader, epochs= 10)
# check clean model accuracy on clean data
test(model=cnn_model_base, iterator=test_loader)

Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.7093, Accuracy: 52.74%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6521, Accuracy: 64.42%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6145, Accuracy: 72.95%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.5800, Accuracy: 77.60%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.5441, Accuracy: 81.79%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.5105, Accuracy: 84.34%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.4766, Accuracy: 86.66%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.4418, Accuracy: 89.19%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.4126, Accuracy: 90.45%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.3826, Accuracy: 92.33%


Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.729

In [57]:
train(model= rnn_model_base, iterator= train_loader, epochs= 10)
test(model=rnn_model_base, iterator=test_loader)

Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.7074, Accuracy: 49.96%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.7011, Accuracy: 51.55%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6942, Accuracy: 52.91%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6926, Accuracy: 54.05%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.6907, Accuracy: 54.67%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.6836, Accuracy: 55.41%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.6841, Accuracy: 55.81%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.6776, Accuracy: 56.95%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.6683, Accuracy: 59.13%


Training:   0%|          | 0/125 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.6472, Accuracy: 62.69%


Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.583

##### Checkpoint
This was a computationally high task. You may want to save the results for later use. Please uncomment the following cell if you want to do so.

In [35]:
torch.save(lstm_model_base.state_dict(), f'{save_dir}/lstm_model_base.pth')
torch.save(cnn_model_base.state_dict(), f'{save_dir}/cnn_model_base.pth')
torch.save(rnn_model_base.state_dict(), f'{save_dir}/rnn_model_base.pth')


### Clean Model Robustness

In [58]:
attack_data = prepare_attack_data(test_dataset)
victim = lstm_model_base

attacker = OpenAttack.attackers.TextFoolerAttacker()
attack_eval = OpenAttack.AttackEval(attacker, victim)

attack_eval.eval(attack_data, visualize=False, progress_bar=True)


100%|██████████| 2000/2000 [12:49<00:00,  2.60it/s]


{'Total Attacked Instances': 2000,
 'Successful Instances': 1998,
 'Attack Success Rate': 0.999,
 'Avg. Running Time': 0.33669511711597444,
 'Total Query Exceeded': 0.0,
 'Avg. Victim Model Queries': 292.89}

## Traiditional Adversarial Training

Here the target `LSTM` model generates adversarial samples for it's training data using TextFooler attack. These adversarial samples will be combined with the clean dataset to create a mix of adversarial and clean training dataset.

The proportion of adversarial samples and clean samples is 1/2. This means half of the training data contains adversarial samples and half of them is clean samples.

There two different ways of adversarial training:
1. **Generating adversarial samples on the fly, during training loop:** In addition to training on clean samples, the training algorithm generates adversarial samples leveraging the respective gradient of the model. This can give a better adaption of adversarial samples to the target model.
2. **Generating adversarial samples prior to training:** In this option, adversarial samples are generated using the pretrained model (on clean dataset) and then such samples are combined with the original clean data to train another model in hope of making it robust. This step can be considered as one form of data augmentation, where the purpose is generating more samples out of the existing ones.

We opt the second option because of time and resource constraints. Also, the efficiency and robustness difference between these two are still open question that can be investigated further.

### Generating Adversarial Samples

#### Load Existing Data


In [None]:
filtered_lstm_adv_data = load_from_disk("filtered_lstm_adv_data") # uncomment if you want to use pre attacked data

#### Manual Run

In [59]:
# here we attack the target model using the training data to generate adversarial samples
lstm_adv_samples = []
attack_data = prepare_attack_data(train_dataset)
victim = lstm_model_base
results = attack(victim, attack_data)
lstm_adv_samples.append(results)


  0%|          | 0/8000 [00:00<?, ?it/s]

Attack success rate: 99.96%


In [61]:
len(lstm_adv_samples[0])

8000

In [67]:
# Filter out rows where 'x_adv' is not None
filtered_lstm_adv_samples = lstm_adv_samples[0].filter(lambda example: example['x_adv'] is not None)

# make sure we have data :)
assert len(filtered_lstm_adv_samples) > 0

Filter:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [68]:
len(filtered_lstm_adv_samples)

7997

In [69]:
filtered_lstm_adv_samples_df = pd.DataFrame(filtered_lstm_adv_samples)

In [115]:
# update labels to int
columns_to_convert = ['y_orig', 'y_adv']
filtered_lstm_adv_samples_df[columns_to_convert] = filtered_lstm_adv_samples_df[columns_to_convert].astype(int)

In [116]:
filtered_lstm_adv_samples_df.head()

Unnamed: 0,x_orig,y_orig,x_adv,y_adv,metrics
0,full of spoilers this is a pretty fast and enj...,1,full of spoilers this is a pretty fast and enj...,0,"{'Query Exceeded': False, 'Running Time': 0.44..."
1,attend the tale of sweeney todd the strangest ...,1,attend the tale of sweeney todd the strangest ...,0,"{'Query Exceeded': False, 'Running Time': 0.28..."
2,i saw this series in in london tv and was blow...,1,i saw this series in in london tv and was blow...,0,"{'Query Exceeded': False, 'Running Time': 0.33..."
3,to me bollywood movies are not generally up to...,0,to me bollywood movies are not generally up to...,1,"{'Query Exceeded': False, 'Running Time': 0.42..."
4,this film is really bad so bad that even chris...,0,this film is really unsound so uncollectible t...,1,"{'Query Exceeded': False, 'Running Time': 0.76..."


#### Checkpoint
This was a computationally high task. You may want to save the results for later use. Please uncomment the following cell if you want to do so.

In [117]:
filtered_lstm_adv_samples_df.to_csv(f'{save_dir}/filtered_lstm_adv_samples_df.csv') # check point


### Adversarial Training



##### Dataset Generation

In [127]:
# get adversarial texts
lstm_adv_texts = filtered_lstm_adv_samples_df['x_adv'].tolist()
# add new adversarial text too, other remains the same
all_texts_lstm_adv = train_texts + test_texts + lstm_adv_texts


In [129]:
# build vocab for this new combined data
all_texts_lstm_adv = [" ".join(tokenizer(clean_text(text))) for text in all_texts_lstm_adv]

lstm_adv_vocab = build_vocab(all_texts_lstm_adv)

In [138]:
# now we need to merge adv samples into training samples

# Step 1: Rename columns
adversarial_samples_to_merge_lstm = filtered_lstm_adv_samples_df.rename(columns={'x_adv': 'text', 'y_orig': 'label'})

# Step 2: Drop unnecessary columns
adversarial_samples_to_merge_lstm = adversarial_samples_to_merge_lstm[['text', 'label']]

# Step 3: Concatenate dataframes
combined_lstm_adv_train_data = pd.concat([train_data, adversarial_samples_to_merge_lstm], ignore_index=True)

# Step 4: Shuffle the dataframe
combined_lstm_adv_train_data = combined_lstm_adv_train_data.sample(frac=1).reset_index(drop=True)

assert len(combined_lstm_adv_train_data) == len(adversarial_samples_to_merge_lstm) + len(train_data)

In [139]:
combined_lstm_adv_train_data.head()

Unnamed: 0,text,label
0,i own this movie and i love canadian movies bu...,1
1,not having read nabokov and recognise nothing ...,1
2,if only eddie murphy were born years later the...,1
3,i really wanted to like this movie great cast ...,0
4,i watched this movie or part of it in hope tha...,0


In [146]:
# drop duplications - these are the failed attacks, we don't need them since they are the same with the original sample
combined_lstm_adv_train_data = combined_lstm_adv_train_data.drop_duplicates()

In [148]:
# build dataset

# use new concatenated train data
train_dataset = IMDBDataset(combined_lstm_adv_train_data, lstm_adv_vocab)

# test data remains the same
test_dataset = IMDBDataset(test_data, lstm_adv_vocab)

# set hyperparams
vocab_size = len(lstm_adv_vocab)
embedding_dim = 100
hidden_dim = 256
num_layers = 2
num_classes = 2
kernel_sizes = [3, 4, 5]
num_channels = [100, 100, 100]


# Get the padding index
pad_index = lstm_adv_vocab.get_stoi()["<PAD>"]
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=partial(IMDBDataset.collate_fn, pad_index))
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=partial(IMDBDataset.collate_fn, pad_index))


##### Models

In [149]:
# Define models

lstm_model_robust_1 = LSTMModel(stoi = lstm_adv_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

cnn_model_robust_1 = CNNModel(stoi = lstm_adv_vocab.get_stoi(), vocab_size=vocab_size, embed_size = embedding_dim, num_classes = num_classes, kernel_sizes= kernel_sizes, num_channels=num_channels)

rnn_model_robust_1 = RNNModel(stoi = lstm_adv_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

##### Training

In [150]:
# LSTM Robust 1 training
train(model= lstm_model_robust_1, iterator= train_loader, epochs= 10)


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.6926, Accuracy: 51.74%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6872, Accuracy: 54.98%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6606, Accuracy: 60.00%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6240, Accuracy: 65.24%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.5884, Accuracy: 69.34%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.5495, Accuracy: 72.09%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.5024, Accuracy: 75.81%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.4566, Accuracy: 78.85%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.4117, Accuracy: 81.96%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.3718, Accuracy: 83.90%


In [151]:
# CNN Robust 1 training
train(model= cnn_model_robust_1, iterator= train_loader, epochs= 10)


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.6785, Accuracy: 58.26%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6054, Accuracy: 74.27%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.5460, Accuracy: 80.58%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.4887, Accuracy: 86.22%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.4315, Accuracy: 90.31%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.3782, Accuracy: 93.18%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.3257, Accuracy: 95.28%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.2774, Accuracy: 97.04%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.2344, Accuracy: 98.16%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.1956, Accuracy: 98.95%


In [152]:
# RNN Robust 1 training
train(model= rnn_model_robust_1, iterator= train_loader, epochs= 10)


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.7010, Accuracy: 51.51%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6928, Accuracy: 52.99%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6868, Accuracy: 54.93%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6807, Accuracy: 57.02%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.6724, Accuracy: 58.36%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.6606, Accuracy: 60.23%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.6425, Accuracy: 63.27%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.6233, Accuracy: 65.24%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.6020, Accuracy: 67.58%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.5843, Accuracy: 69.79%


##### Testing

In [153]:
# LSTM check robust model accuracy on clean test data
test(model=lstm_model_robust_1, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.7485

In [154]:
# CNN check robust model accuracy on clean test data
test(model=cnn_model_robust_1, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.763

In [155]:
# RNN check robust model accuracy on clean test data
test(model=rnn_model_robust_1, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.589

## Ensemble Adversarial Training

Ensemble Adversaril Training aims to generalize machine learning robustness by introducing adversarial samples crafted from other models in addition to target model. We are using the three models we defined above, namely `CNN`, `RNN` and `LSTM` models to generate adversarial samples. Once such dataset is generated successfully we combine them with the clean training data. The model trains on this combined dataset.

There are some decision choices. Such as the contribution of adversarial samples on combined training dataset. We stick 1/2 percentage. This means half of the training data comes from adversarial samples and the remaining halve is clean training data.

Another question is the contribution of each model onto the generation of adversarial samples. We have equal distribution to have better generalization and to avoid overfitting. This means each model contributes to the 1/3 of the adversarial samples dataset.

### Generating Adversarial Samples

#### Load Existing

You can skip the manual run by running the following cell. Please make sure you have the data and don't run the manual section since it will override the parameters

In [None]:
adv_samples= [load_from_disk("adv_samples[0]"), load_from_disk("adv_samples[1]") ,load_from_disk("adv_samples[2]")]  # to load from disk

#### Manual Run

In [84]:
len(train_dataset)

8000

In [85]:
# initalizing adversarial sample generation models and data

models= [cnn_model_base, rnn_model_base, lstm_model_base]

# each model will have their own data
model_data = split_into_n(train_dataset, len(models))
assert len(model_data) == len(models)

adv_samples = []
for idx, model in enumerate(models):
    model = model.to(device)
    print(f"running for model : {idx+1}/{len(models)}" )
    current_clean_data = model_data[idx]
    attack_data = prepare_attack_data(current_clean_data)
    victim = model
    results = attack(victim, attack_data)
    adv_samples.append(results)

running for model : 1/3


  0%|          | 0/2666 [00:00<?, ?it/s]

Attack success rate: 99.74%
running for model : 2/3


  0%|          | 0/2666 [00:00<?, ?it/s]

Attack success rate: 98.35%
running for model : 3/3


  0%|          | 0/2668 [00:00<?, ?it/s]

Attack success rate: 99.96%


In [92]:
ensemble_adv_samples = pd.concat([pd.DataFrame(sample) for sample in adv_samples])


In [113]:
# clean na data
ensemble_adv_samples.dropna(inplace=True)

# update labels to int
columns_to_convert = ['y_orig', 'y_adv']
ensemble_adv_samples[columns_to_convert] = ensemble_adv_samples[columns_to_convert].astype(int)


In [114]:
ensemble_adv_samples.head()

Unnamed: 0,x_orig,y_orig,x_adv,y_adv,metrics
0,the amazing mr williams stars melvyn douglas w...,1,the amazing mister tennessee asterisk melvyn s...,0,"{'Query Exceeded': False, 'Running Time': 1.11..."
1,the plot of this enjoyable mgm musical is cont...,1,the plot of this enjoyable mgm musical is cont...,0,"{'Query Exceeded': False, 'Running Time': 0.40..."
2,i never saw this movie until i bought the tape...,1,i never saw this movie until i bought the tape...,0,"{'Query Exceeded': False, 'Running Time': 0.67..."
3,it takes a serbian or at least a balkan famili...,1,it train a serbian or at least a balkan famili...,0,"{'Query Exceeded': False, 'Running Time': 0.99..."
4,okay now i am pretty sure that my summary got ...,0,okay now i am pretty sure that my summary got ...,1,"{'Query Exceeded': False, 'Running Time': 0.32..."


In [111]:
len(ensemble_adv_samples)

7948

##### Checkpoint
This was a computationally high task. You may want to save the results for later use. Please uncomment the following cell if you want to do so.

In [118]:
ensemble_adv_samples.to_csv(f'{save_dir}/ensemble_adv_samples.csv')

### Adversarial Training

##### Dataset Generation

In [156]:
ensemble_adv_samples.head()

Unnamed: 0,x_orig,y_orig,x_adv,y_adv,metrics
0,the amazing mr williams stars melvyn douglas w...,1,the amazing mister tennessee asterisk melvyn s...,0,"{'Query Exceeded': False, 'Running Time': 1.11..."
1,the plot of this enjoyable mgm musical is cont...,1,the plot of this enjoyable mgm musical is cont...,0,"{'Query Exceeded': False, 'Running Time': 0.40..."
2,i never saw this movie until i bought the tape...,1,i never saw this movie until i bought the tape...,0,"{'Query Exceeded': False, 'Running Time': 0.67..."
3,it takes a serbian or at least a balkan famili...,1,it train a serbian or at least a balkan famili...,0,"{'Query Exceeded': False, 'Running Time': 0.99..."
4,okay now i am pretty sure that my summary got ...,0,okay now i am pretty sure that my summary got ...,1,"{'Query Exceeded': False, 'Running Time': 0.32..."


In [158]:
# get adversarial texts
ensembe_adv_texts = ensemble_adv_samples['x_adv'].tolist()

# add new adversarial text too, other remains the same
all_ensemble_adv_texts = train_texts + test_texts + ensembe_adv_texts


In [159]:
# build vocab for this new combined data
all_ensemble_adv_texts = [" ".join(tokenizer(clean_text(text))) for text in all_ensemble_adv_texts]

ensemble_adv_vocab = build_vocab(all_ensemble_adv_texts)

In [160]:
# now we need to merge adv samples into training samples

# Step 1: Rename columns
ensemble_adv_samples_to_merge = ensemble_adv_samples.rename(columns={'x_adv': 'text', 'y_orig': 'label'})

# Step 2: Drop unnecessary columns
ensemble_adv_samples_to_merge = ensemble_adv_samples_to_merge[['text', 'label']]

# Step 3: Concatenate dataframes
combined_ensemble_adv_data = pd.concat([train_data, ensemble_adv_samples_to_merge], ignore_index=True)

# Step 4: Shuffle the dataframe
combined_ensemble_adv_data = combined_ensemble_adv_data.sample(frac=1).reset_index(drop=True)

assert len(combined_ensemble_adv_data) == len(ensemble_adv_samples_to_merge) + len(train_data)

In [162]:
# drop duplications - these are the failed attacks, we don't need them since they are the same with the original sample
combined_ensemble_adv_data = combined_ensemble_adv_data.drop_duplicates()

In [164]:
# build dataset

# use new concatenated train data
train_dataset = IMDBDataset(combined_ensemble_adv_data, ensemble_adv_vocab)

# test data remains the same
test_dataset = IMDBDataset(test_data, ensemble_adv_vocab)

# set hyperparams
vocab_size = len(ensemble_adv_vocab)
embedding_dim = 100
hidden_dim = 256
num_layers = 2
num_classes = 2
kernel_sizes = [3, 4, 5]
num_channels = [100, 100, 100]


# Get the padding index
pad_index = ensemble_adv_vocab.get_stoi()["<PAD>"]
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=partial(IMDBDataset.collate_fn, pad_index))
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=partial(IMDBDataset.collate_fn, pad_index))


##### Models

In [165]:
# Define models

lstm_model_robust_2 = LSTMModel(stoi = ensemble_adv_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

cnn_model_robust_2 = CNNModel(stoi = ensemble_adv_vocab.get_stoi(), vocab_size=vocab_size, embed_size = embedding_dim, num_classes = num_classes, kernel_sizes= kernel_sizes, num_channels=num_channels)

rnn_model_robust_2 = RNNModel(stoi = ensemble_adv_vocab.get_stoi(), vocab_size = vocab_size, embedding_dim = embedding_dim, hidden_dim = hidden_dim, num_classes = num_classes, num_layers=num_layers)

##### Training

In [166]:
# LSTM Robust 2 training
train(model= lstm_model_robust_2, iterator= train_loader, epochs= 10)

Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.6923, Accuracy: 52.21%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6906, Accuracy: 53.19%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6835, Accuracy: 55.80%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6440, Accuracy: 62.83%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.6128, Accuracy: 66.64%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.5745, Accuracy: 70.41%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.5439, Accuracy: 73.02%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.5038, Accuracy: 75.80%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.4557, Accuracy: 79.09%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.4133, Accuracy: 81.66%


In [167]:
# CNN Robust 2 training
train(model= cnn_model_robust_2, iterator= train_loader, epochs= 10)

Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.6718, Accuracy: 59.71%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6029, Accuracy: 73.36%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.5428, Accuracy: 80.86%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.4850, Accuracy: 85.78%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.4299, Accuracy: 89.20%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.3773, Accuracy: 91.96%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.3294, Accuracy: 94.65%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.2836, Accuracy: 96.55%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.2427, Accuracy: 97.96%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.2027, Accuracy: 98.91%


In [168]:
# RNN Robust 2 training
train(model= rnn_model_robust_2, iterator= train_loader, epochs= 10)

Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 1 - Loss: 0.7046, Accuracy: 50.46%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 2 - Loss: 0.6925, Accuracy: 53.86%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 3 - Loss: 0.6883, Accuracy: 54.55%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 4 - Loss: 0.6812, Accuracy: 56.36%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 5 - Loss: 0.6744, Accuracy: 57.71%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 6 - Loss: 0.6648, Accuracy: 59.65%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 7 - Loss: 0.6489, Accuracy: 62.19%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 8 - Loss: 0.6243, Accuracy: 65.36%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 9 - Loss: 0.6139, Accuracy: 66.21%


Training:   0%|          | 0/250 [00:00<?, ?batch/s]

Epoch: 10 - Loss: 0.6016, Accuracy: 68.14%


##### Testing

In [169]:
# LSTM check robust model accuracy on clean test data - ensemble training
test(model=lstm_model_robust_2, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.7515

In [170]:
# CNN check robust model accuracy on clean test data - ensemble training
test(model=cnn_model_robust_2, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.7575

In [171]:
# RNN check robust model accuracy on clean test data - ensemble training
test(model=rnn_model_robust_2, iterator=test_loader)

Testing:   0%|          | 0/32 [00:00<?, ?it/s]

0.559

# Evaluation
Here we want to check the robustness of the our target model LTSM in different conditions:


1.   No Adversarial Training
2.   Traditional Adversarial Training
3. Ensemble Adversarial Training





##### No Adversarial Training

In [177]:
attack_data

Dataset({
    features: ['x', 'y'],
    num_rows: 2000
})

In [178]:
# test dataset should use it's respective vocab
test_dataset = IMDBDataset(test_data, imdb_ds_vocab)

attack_data = prepare_attack_data(test_dataset)

lstm_model_base = lstm_model_base.to(device)

# Victim is the base model - no adversarial training
victim = lstm_model_base
attacker = OpenAttack.attackers.TextFoolerAttacker()
attack_eval = OpenAttack.AttackEval(attacker, victim)

attack_eval.eval(attack_data, visualize=False, progress_bar=True)


100%|██████████| 2000/2000 [12:26<00:00,  2.68it/s]


{'Total Attacked Instances': 2000,
 'Successful Instances': 1998,
 'Attack Success Rate': 0.999,
 'Avg. Running Time': 0.3246043506860733,
 'Total Query Exceeded': 0.0,
 'Avg. Victim Model Queries': 292.89}

##### Traditional Adversarial Training

In [175]:
# test dataset should use it's respective vocab
test_dataset = IMDBDataset(test_data, lstm_adv_vocab)

attack_data = prepare_attack_data(test_dataset)

# Victim is the robust_1 model - traditional adversarial training
victim = lstm_model_robust_1

attacker = OpenAttack.attackers.TextFoolerAttacker()
attack_eval = OpenAttack.AttackEval(attacker, victim)

attack_eval.eval(attack_data, visualize=False, progress_bar=True)


100%|██████████| 2000/2000 [26:08<00:00,  1.27it/s]


{'Total Attacked Instances': 2000,
 'Successful Instances': 1997,
 'Attack Success Rate': 0.9985,
 'Avg. Running Time': 0.7145295497179032,
 'Total Query Exceeded': 0.0,
 'Avg. Victim Model Queries': 363.71}

##### Ensemble Adversarial Training

In [174]:
# test dataset should use it's respective vocab
test_dataset = IMDBDataset(test_data, ensemble_adv_vocab)

attack_data = prepare_attack_data(test_dataset)

# Victim is the robust_2 model - ensemble adversarial training
victim = lstm_model_robust_2

attacker = OpenAttack.attackers.TextFoolerAttacker()
attack_eval = OpenAttack.AttackEval(attacker, victim)

attack_eval.eval(attack_data, visualize=False, progress_bar=True)


100%|██████████| 2000/2000 [46:25<00:00,  1.39s/it]


{'Total Attacked Instances': 2000,
 'Successful Instances': 1999,
 'Attack Success Rate': 0.9995,
 'Avg. Running Time': 1.2853759752511977,
 'Total Query Exceeded': 0.0,
 'Avg. Victim Model Queries': 464.698}

In [179]:
!zip -r reproduce_data.zip reproduce_data/

  adding: reproduce_data/ (stored 0%)
  adding: reproduce_data/.ipynb_checkpoints/ (stored 0%)
  adding: reproduce_data/lstm_model_base.pth (deflated 6%)
  adding: reproduce_data/rnn_model_base.pth (deflated 6%)
  adding: reproduce_data/cnn_model_base.pth (deflated 6%)
  adding: reproduce_data/filtered_lstm_adv_samples_df.csv (deflated 78%)
  adding: reproduce_data/ensemble_adv_samples.csv (deflated 78%)


In [180]:
ensemble_adv_samples.head()

Unnamed: 0,x_orig,y_orig,x_adv,y_adv,metrics
0,the amazing mr williams stars melvyn douglas w...,1,the amazing mister tennessee asterisk melvyn s...,0,"{'Query Exceeded': False, 'Running Time': 1.11..."
1,the plot of this enjoyable mgm musical is cont...,1,the plot of this enjoyable mgm musical is cont...,0,"{'Query Exceeded': False, 'Running Time': 0.40..."
2,i never saw this movie until i bought the tape...,1,i never saw this movie until i bought the tape...,0,"{'Query Exceeded': False, 'Running Time': 0.67..."
3,it takes a serbian or at least a balkan famili...,1,it train a serbian or at least a balkan famili...,0,"{'Query Exceeded': False, 'Running Time': 0.99..."
4,okay now i am pretty sure that my summary got ...,0,okay now i am pretty sure that my summary got ...,1,"{'Query Exceeded': False, 'Running Time': 0.32..."
