<a href="https://colab.research.google.com/github/misha345a/E-commerce_Reviews_Classifier/blob/main/LSTM_Models_And_Attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goal
Train a deep learning model to classify customer reviews as Recommended or Not Recommended. <br> Then, attack the trained models using TextAttack to evaluate their robustness.

In [None]:
%%capture 

!pip install openpyxl -U

## Dataset

In [None]:
import pandas as pd
from tqdm.notebook import tqdm_notebook

# initiate tqdm for pandas.apply() functions
tqdm_notebook.pandas()

In [None]:
# widen the pandas display limits so long review texts and wide frames render in full
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 300)

In [None]:
# load the dataset of your choice (leave exactly one of the lines below uncommented);
# the paths point at the Colab /content directory, so the file must be present there
# dataset = pd.read_excel('/content/Raw_Dataset_(Cleaned).xlsx')
dataset = pd.read_excel('/content/Upsampled_Dataset.xlsx')
# dataset = pd.read_excel('/content/Augmented_Dataset.xlsx')

In [None]:
# check the class balance of the target column
# (1 is treated as "Recommended", 0 as "Not Recommended" further down)
dataset['Recommended IND'].value_counts()

1    18540
0    16104
Name: Recommended IND, dtype: int64

In [None]:
# fixed seed shared by the shuffle, the split and the attack for reproducible results
random_state = 42

# sample 100% of the rows without replacement, i.e. shuffle the rows
dataset = dataset.sample(random_state=random_state, frac=1)

In [None]:
from sklearn.model_selection import train_test_split

# separate the target column from the remaining columns
X = dataset.drop(columns='Recommended IND')
y = dataset['Recommended IND']

# hold out 20% of the rows as a test set; the other 80% is used for training
# NOTE(review): if the upsampled/augmented files contain duplicated reviews,
# copies of one review may land on both sides of this split - confirm
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=random_state,
)

## Tokenization

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from sklearn.metrics import classification_report

In [None]:
# utilize the most frequently appearing words in the corpus
num_words = 10000

# build the vocabulary from the training reviews only, so that no
# information about the held-out test set leaks into the preprocessing;
# the filter strips punctuation, tabs/newlines and digits
tokenizer = Tokenizer(num_words=num_words,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n1234567890')
corpus = X_train['Review Text'].tolist()
tokenizer.fit_on_texts(corpus)

In [None]:
# word -> integer index mapping learned by the tokenizer;
# its length is used to size the embedding layer of the model
word_index = tokenizer.word_index

In [None]:
# turn every training/test review into its list of integer word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split['Review Text'].tolist())
    for split in (X_train, X_test)
)

In [None]:
# the longest encoded training review defines the fixed input length
maxlen = max(map(len, X_train_seq))
print(f"Max sequence length: {maxlen}\n")

# bring every sequence to exactly maxlen entries (pad the short, truncate the long)
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_seq, X_test_seq)
)

# report the resulting array dimensions
print(f"{'Padded shape (training):':<25}", X_train_pad.shape)
print(f"{'Padded shape (test):':<25}", X_test_pad.shape)

Max sequence length: 117

Padded shape (training):  (27715, 117)
Padded shape (test):      (6929, 117)


## LSTM Neural Network

In [None]:
# LSTM network for binary sequence classification
model = Sequential([
    # map each word index to a trainable 50-dimensional vector
    Embedding(len(word_index) + 1, 50, input_length=maxlen),
    # single LSTM layer (not bidirectional) with input and recurrent dropout
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    # one sigmoid unit as the binary classifier head
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 117, 50)           545850    
                                                                 
 lstm_1 (LSTM)               (None, 100)               60400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 606,351
Trainable params: 606,351
Non-trainable params: 0
_________________________________________________________________


In [None]:
# training hyper-parameters
num_epochs = 5
batch_size = 512

# fit the network on the padded training sequences
model.fit(x=X_train_pad,
          y=y_train,
          batch_size=batch_size,
          epochs=num_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2b16bad110>

## Evaluation

In [None]:
# compute loss and accuracy on the held-out test set
model.evaluate(X_test_pad, y_test)

# threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
test_probabilities = model.predict(X_test_pad)
y_test_pred = (test_probabilities >= 0.5).astype("int32")



In [None]:
# per-class precision, recall and F1 of the thresholded test-set predictions
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3222
           1       0.97      0.89      0.93      3707

    accuracy                           0.93      6929
   macro avg       0.93      0.93      0.93      6929
weighted avg       0.93      0.93      0.93      6929



## Save Model (optional)

In [None]:
# save the entire model to the 'LSTM_Raw_Dataset' directory (relative to the working directory)
# NOTE(review): the name says "Raw" although the upsampled dataset is the one loaded above -
# consider naming the directory after the dataset actually used
model.save('LSTM_Raw_Dataset')

INFO:tensorflow:Assets written to: LSTM_Raw_Dataset/assets


INFO:tensorflow:Assets written to: LSTM_Raw_Dataset/assets


## Load Model (optional)

In [None]:
# reload a fresh Keras model from the saved model;
# the same relative path as in model.save() is used, so loading also
# works when the working directory is not /content
new_model = tf.keras.models.load_model('LSTM_Raw_Dataset')

In [None]:
# retrieve the maxlen variable of the model
model_config = new_model.get_config()  # kept available for inspection

# the model input has shape (batch, maxlen); reading the public input_shape
# avoids depending on the exact layout/key names of the config dictionary
maxlen = new_model.input_shape[1]
print(maxlen)

## Custom Predictions (Examples)

In [None]:
# 5 randomly selected reviews
reviews = ["this dress is perfection! so pretty and flattering.",
           "this is my new favorite top! looks and fits as described.",
           "i could wear this every day, it is stylish and comfortable",
           "material is too thin and quality is poor",
           "it is nice material but the design makes you look like a pregnant lady"]

In [None]:
def model_pred(text):
  """
  Use the trained LSTM to make a prediction on a new example.

  The review is encoded with the notebook-level `tokenizer`, padded to
  `maxlen` and scored by the notebook-level `model`. The predicted label is
  printed together with the model's confidence in that predicted label.

  Args:
    text: a single raw review string.

  Returns:
    None; the result is printed.
  """
  tokens = tokenizer.texts_to_sequences([text])
  tokens_pad = pad_sequences(tokens, maxlen=maxlen)

  # the sigmoid output is the probability of the positive ("Recommended") class
  prob_recommended = model.predict(tokens_pad)[0][0]

  if prob_recommended >= 0.5:
    label, confidence = "Recommended", prob_recommended
  else:
    # confidence in the negative prediction is the complement of the positive probability
    label, confidence = "Not Recommended", 1 - prob_recommended

  print(f"'{text}'\n{label} | {int(confidence*100)}% Confidence\n")

In [None]:
# score every example review in turn
for review in reviews:
  model_pred(review)

'this dress is perfection! so pretty and flattering.'
Recommended | 93% Confidence

'this is my new favorite top! looks and fits as described.'
Recommended | 75% Confidence

'i could wear this every day, it is stylish and comfortable'
Recommended | 95% Confidence

'material is too thin and quality is poor'
Not Recommended | 5% Confidence

'it is nice material but the design makes you look like a pregnant lady'
Not Recommended | 11% Confidence



## Textual Adversarial Attack

In [None]:
%%capture

!pip install textattack
!pip install tensorflow_text 

In [None]:
from textattack.models.wrappers import ModelWrapper
from textattack import AttackArgs
from textattack.datasets import Dataset
from textattack import Attacker
import numpy as np
import torch
import nltk
nltk.download('omw-1.4')

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [00:12<00:00, 39.9MB/s]
textattack: Unzipping file /root/.cache/textattack/tmp9q9l_kwf.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [None]:
class CustomTensorFlowModelWrapper(ModelWrapper):
  """
  Implementation of a model wrapper class to
  run TextAttack with a custom TensorFlow model.

  The wrapped model is used as a binary classifier with a single sigmoid
  output. Texts are encoded with the notebook-level `tokenizer` and padded
  to the notebook-level `maxlen`.
  """
  def __init__(self, model):
    # trained Keras model that is queried in __call__
    self.model = model

  def __call__(self, text_input_list):
    """
    Score a batch of raw texts.

    Args:
      text_input_list: list of text strings to classify.

    Returns:
      torch.Tensor with one row per input text and two columns:
      index 0 is the negative-class and index 1 the positive-class score.
    """
    # encode and pad the texts exactly like the training data
    tokens = tokenizer.texts_to_sequences(text_input_list)
    tokens_pad = pad_sequences(tokens, maxlen=maxlen)
    model_pred = self.model.predict(tokens_pad)

    # the sigmoid outputs are probabilities of the positive class;
    # drop the trailing unit dimension to get one value per text
    positive_probs = torch.as_tensor(model_pred, dtype=torch.float32)
    positive_probs = positive_probs.squeeze(dim=-1)

    # for each output, index 0 corresponds to the negative
    # and index 1 corresponds to the positive confidence
    return torch.stack((1 - positive_probs, positive_probs), dim=1)

In [None]:
# example output: one clearly negative and one clearly positive text
example_texts = [
    "this is negative text. bad terrible awful.",
    "this is positive text. great amazing love",
]
CustomTensorFlowModelWrapper(model)(example_texts)

tensor([[0.9619, 0.0381],
        [0.1963, 0.8037]])

In [None]:
# example of a text attack pair: t2 is t1 with a few words swapped for near-synonyms;
# an attack counts as successful when it fools the model into predicting the wrong label
t1 = 'i love the tie dye and the accent stitching. back detail is fun!'
t2 = 'i adore the tie colouring and the accent stitching. back detail is amusing!'
CustomTensorFlowModelWrapper(model)([t1,t2])

tensor([[0.1261, 0.8739],
        [0.4111, 0.5889]])

## Creating the Attack

In [None]:
# initialize the model wrapper with the trained LSTM
model_wrapper = CustomTensorFlowModelWrapper(model)

# textattack requires custom datasets to be presented as a list of
# (input, ground-truth label) pairs; zip() builds the pairs directly and
# avoids rebinding the builtin `input` as a notebook-level loop variable
data_pairs = list(zip(dataset['Review Text'], dataset['Recommended IND']))

new_dataset = Dataset(data_pairs, shuffle=True)

In [None]:
# building blocks of a TextAttack attack:
# goal function, constraints, transformation and search method
from textattack import Attack
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.goal_functions.classification import UntargetedClassification
from textattack.search_methods import GreedyWordSwapWIR
from textattack.transformations import WordSwapEmbedding

# transformation: embedding-based word swaps, capped at 50 candidates
transformation = WordSwapEmbedding(max_candidates=50)

# constraints applied to candidate perturbations; the embedding-distance
# constraint requires a minimum cosine similarity of 0.9
constraints = [
    RepeatModification(),
    StopwordModification(),
    WordEmbeddingDistance(min_cos_sim=0.9),
]

# search method: greedy word swap with the "delete" word-importance ranking
search_method = GreedyWordSwapWIR(wir_method="delete")

# goal: untargeted classification attack against the wrapped LSTM
goal_function = UntargetedClassification(model_wrapper)

# assemble the four components into the actual attack
attack = Attack(goal_function, constraints, transformation, search_method)

textattack: Unknown if model of class <class 'keras.engine.sequential.Sequential'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


In [None]:
from textattack.loggers import CSVLogger 
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker
from textattack import AttackArgs
from textattack.datasets import Dataset

# attack until 1000 successful attacks are reached;
# the shared random_state seeds the run
attack_args = AttackArgs(num_successful_examples=1000, 
                         random_seed=random_state)

# bind the attack, the (input, label) dataset and the run arguments together
attacker = Attacker(attack, new_dataset, attack_args)

# run the attack; the returned results are logged and displayed in the next cell
attack_results = attacker.attack_dataset()

In [None]:
# display the attack results and the differences
logger = CSVLogger(color_method='html')

# collect every attack result in the logger's dataframe
for result in attack_results:
    logger.log_attack_result(result)

# IPython.display is the public home of these helpers
# (importing them from IPython.core.display is deprecated)
from IPython.display import display, HTML

# escape=False keeps the logger's HTML colour markup so the changed words are highlighted
display(HTML(logger.df[['original_text', 'perturbed_text']].to_html(escape=False)))