In [1]:
import configparser
import numpy as np
import os
import sys
import tensorflow as tf

In [3]:
# set to latest model version number

def set_model_version_number():
    version_number = []
    global MODEL_VERSION
    global MODEL_PATH

    if os.path.exists(os.path.join(MODEL_SAVE_DIRECTORY,MODEL_NAME)):   
        for entry in os.listdir(os.path.join(MODEL_SAVE_DIRECTORY,MODEL_NAME)):
            version_number.append(entry)       
        MODEL_VERSION = version_number[-1]
        MODEL_PATH = os.path.join(MODEL_SAVE_DIRECTORY, MODEL_NAME, MODEL_VERSION)
        

In [4]:

config = configparser.ConfigParser()
config.read('config/main.conf')

DATASET = 1
MODEL_VERSION =  "0001"
DOWNLOAD_GOOGLE_LM = False

if DATASET == 1:
    set_dataset = "imdb"
if DATASET == 2:
    set_dataset = "s140"

DATASET_URL = (config[set_dataset]['DATASET_URL'])

DATASET_FOLDER = config[set_dataset]['DATASET_FOLDER']
DATASET_TAR_FILE_NAME = config[set_dataset]['DATASET_TAR_FILE_NAME']
DATASET_NAME = config[set_dataset]['DATASET_NAME']

MODEL_NAME = config[set_dataset]['MODEL_NAME']

CLEAN_DATA_FILE = os.path.join(DATASET_FOLDER,"normalized_dataset.csv")
TAR_FILE_PATH = os.path.join(DATASET_FOLDER,DATASET_TAR_FILE_NAME)
DATA_SET_LOCATION = os.path.join(DATASET_FOLDER,DATASET_NAME)

MODEL_SAVE_DIRECTORY = config[set_dataset]['MODEL_SAVE_DIRECTORY']
# Create the model save directory
if not os.path.exists(MODEL_SAVE_DIRECTORY):
    os.makedirs(MODEL_SAVE_DIRECTORY)
    
IMAGE_SAVE_FOLDER = config[set_dataset]['IMAGE_SAVE_FOLDER']
    
GLOVE_EMBEDDINGS = config[set_dataset]['GLOVE_EMBEDDINGS']
COUNTER_FITTED_VECTORS = config[set_dataset]['COUNTER_FITTED_VECTORS']

GLOVE_EMBEDDINGS_MATRIX = config[set_dataset]['GLOVE_EMBEDDINGS_MATRIX']
COUNTER_FITTED_EMBEDDINGS_MATRIX = config[set_dataset]['COUNTER_FITTED_EMBEDDINGS_MATRIX']

LM_URLS = config[set_dataset]['LM_URLS']
LM_DIRECTORY = config[set_dataset]['LM_DIRECTORY']

####### files required to reconstruct the final trained model ##############################
MODEL_PATH = os.path.join(MODEL_SAVE_DIRECTORY, MODEL_NAME, MODEL_VERSION)

set_model_version_number()

ASSESTS_FOLDER = os.path.join(MODEL_PATH,"assets")
MODEL_ASSETS_VOCABULARY_FILE = os.path.join(ASSESTS_FOLDER,"vocab")
MODEL_ASSETS_EMBEDDINGS_FILE = os.path.join(ASSESTS_FOLDER,"imdb_glove_embeddings_matrix")
MODEL_ASSETS_COUNTER_EMBEDDINGS_FILE = os.path.join(ASSESTS_FOLDER,"counter_embeddings_matrix")
MODEL_ASSETS_DISTANCE_MATRIX = os.path.join(ASSESTS_FOLDER,"distance_matrix.npy")
MODEL_ASSETS_SAVE_BEST_WEIGHTS = os.path.join(ASSESTS_FOLDER, "cp.ckpt")
MODEL_TRAINING_HISORTY_FILE = os.path.join(ASSESTS_FOLDER, "training_history.csv")



### load our pre trained sentiment model

In [5]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization # in Tensorflow 2.1 and above
import pickle 

MAX_VOCABULARY_SIZE = 50000
DIMENSION = 300
LEARNING_RATE = 1e-4


from manny_modules import tf_normalize_data as tfnd
from manny_modules import return_model as rmodel

saved_vocab = pickle.load(open(MODEL_ASSETS_VOCABULARY_FILE, 'rb'))
saved_word_index = dict(zip(saved_vocab, range(len(saved_vocab))))

saved_embeddings_matric = pickle.load(open(MODEL_ASSETS_EMBEDDINGS_FILE, 'rb'))


vectorizer_layer = TextVectorization(
    standardize=tfnd.normlize_data, 
    max_tokens=MAX_VOCABULARY_SIZE, 
    output_mode='int',
    output_sequence_length=300)

# build vocabulary, will also run the normalize_data() 
vectorizer_layer.set_vocabulary(saved_vocab)


saved_model = rmodel.create_model(vectorizer_layer,
                                  saved_embeddings_matric,
                                  saved_vocab,
                                  dimension=DIMENSION, 
                                  lrate=LEARNING_RATE)

# load the weights
saved_model.load_weights(MODEL_ASSETS_SAVE_BEST_WEIGHTS) # loads best weights saved during training

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb24c21c160>

### Test model - check predictions for unseen data

In [6]:
# check negative review
p_2 = [["Seriously, don't bother if you're over 12. This looks like a kids show designed purely to sell merchandise, theme park rides, etc. No logic, holes all over the shop, no characters motivation and really crap acting to top it off... just rubbish, really"]]
prob_positive = saved_model.predict(p_2)

print("Positive confidence: ",prob_positive, " Negative confidence: ", (1 - prob_positive ))


# check positive review
p = [["It's one thing to bring back elements, characters, settings and stories, and to flash them in front of the audience to cash in on the nostalgia and/or recognisable memorabilia but without using it to further the plot and other to do exactly the opposite. It was about time that Star Wars directives understood that it is too unique a product to be lend to corporate filmmakers. Star Wars needs to be understood and its uniqueness has to be acknowledged in order to make the new stories feel like they belong. This may sound too obvious but if you ever wondered why the new SW movies are so controversial this may be the reason.Like with 'Spider-Man: Into the Spider-verse (2018)' and their comicbook-industry experts participation, the creators behind The Mandalorian were experts of the industry, connoisseurs of the Star Wars Universe and even long time fans. So they were able to not only recapture the aesthetic of the grimy, battered Star Wars but also build upon it taking the most 'subtle' things into account. Things like the predominancy of puppets and practical effects over CGI, settings you can feel and touch over green screens and the abundancy of not only known elements previously seen in Star Wars, but a whole batch of new creatures, designs and overall plot elements that felt like they belong to this universe and had always been there. Exceeding expectations are not only the visual aspects but the narrative too. It might be too late for some story elements now, but it is of great importance that from now on you try to watch the unraveling of the story unspoiled. I was lucky to have seen the premiere of the show before the 'memefication' of a certain 'element' that went viral and became one of the biggest highlights of the show. But for me I saw the reveal of this element unspoiled and I was pleasantly shocked, a memory I'll always carry with me. The ability of these creators to generate such shock value and deep moments it's often baffling to me. This is proof that the creators behind the narrative are fully aware of the complexities of the universe they are tampering with and like an experienced surgeon, they are able to tweak, traverse and call back any Star Wars element as they please and with astonishing results."]]
prob_positive = saved_model.predict(p)

print("Positive confidence: ",prob_positive, " Negative confidence: ", (1 - prob_positive ))

Positive confidence:  [[0.01855881]]  Negative confidence:  [[0.9814412]]
Positive confidence:  [[0.9553545]]  Negative confidence:  [[0.04464549]]


In [776]:
test_n = "some string"
test_nn = [[test_n]]
print(test_nn)

[['some string']]


### load the distance matrix from disk (load this before running below tests)
- This is a large file (~20GB), so will take time to load

In [7]:

impodistance_matrixumpy as np

distance_matrix = np.load(MODEL_ASSETS_DISTANCE_MATRIX)



### test the distance matrix

In [24]:
target_word =  saved_word_index['england']

In [25]:
from manny_modules import nearest_neighbour as nn

nearest_neighbour, distance_to_neighbour = nn.closest_neighbours(target_word, distance_matrix, number_of_words_to_return=5, max_distance=None)

In [26]:
closest_word = [saved_vocab[x] for x in nearest_neighbour]

print("Words closest to `%s` are `%s` " % (saved_vocab[target_word], closest_word))

Words closest to `england` are `['british', 'britain', 'brits', 'uk', 'britons']` 


# Genetic Attack

### load dataset

In [1991]:
import pandas as pd

dtypes = {'sentiment': 'int', 'text': 'str'}
data_frame = pd.read_csv(CLEAN_DATA_FILE,dtype=dtypes)

# split the dataset
#train_data_raw, test_data_raw = train_test_split(data_frame, test_size= (1 - TRAINING_SPLIT), random_state = 7)

### create a sample data set from dataframe of size ```SAMPLE_SIZE```

In [1992]:
SAMPLE_SIZE = 1100

data_sample = data_frame.sample(n = SAMPLE_SIZE) 

# show the first 5 randonly selected data items
data_sample.head()

Unnamed: 0,sentiment,text
48645,1,over the years i've come to be a fan of direc...
12264,1,ladies and gentlemen we've really got ourselv...
35037,0,i have to admit i did not finish this movie be...
15373,1,this review contains some small yet significa...
5589,0,this kiyoshi kurosawa ghost movie is pretty wi...


### add a new column ```probs``` to our sample dataframe to store probability of text being positove (default values = 0)

In [1993]:
data_sample['probs'] = 0
data_sample['probs'] = data_sample['probs'].astype(float) # has to be of type float, to store probability values

data_sample = data_sample.reset_index(drop=True) # reindex so we start from 0 in the sample data set
data_sample.head()

Unnamed: 0,sentiment,text,probs
0,1,over the years i've come to be a fan of direc...,0.0
1,1,ladies and gentlemen we've really got ourselv...,0.0
2,0,i have to admit i did not finish this movie be...,0.0
3,1,this review contains some small yet significa...,0.0
4,0,this kiyoshi kurosawa ghost movie is pretty wi...,0.0


### now run each one against model and store probabilities

In [1994]:
for i in data_sample.index:
    p = saved_model.predict([data_sample.iloc[i]['text']])
    data_sample.at[i,'probs']= p

data_sample.head()

Unnamed: 0,sentiment,text,probs
0,1,over the years i've come to be a fan of direc...,0.846156
1,1,ladies and gentlemen we've really got ourselv...,0.058191
2,0,i have to admit i did not finish this movie be...,0.003571
3,1,this review contains some small yet significa...,0.973612
4,0,this kiyoshi kurosawa ghost movie is pretty wi...,0.015457


### add new columns to hold results after genetic attack

In [1995]:
# copy current sentiment values to new columns, we can then later go through and compare any values that have been changed during the GA attack
data_sample['ga_sentiment'] = data_sample['sentiment']

# create new ga_text to hold perturbed text
data_sample['ga_text'] = ""

# create ga_probs to hold new probability values after GA Attack, fill with current values
data_sample['ga_probs'] = data_sample['probs']

# create ga_num_changes to hold the number of words changed
data_sample['ga_num_changes'] = 0
data_sample['ga_num_changes'] = data_sample['ga_num_changes'].astype(int)

# create ga_lev_ratio to hold the Levenshtein ratio
data_sample['ga_lev_ratio'] = 0.0
data_sample['ga_lev_ratio'] = data_sample['ga_lev_ratio'].astype(float)

# add field to indicate if sentiment was flipped on review text
data_sample['ga_flipped_sentiment'] = 'N'
data_sample['ga_flipped_sentiment'] = data_sample['ga_flipped_sentiment'].astype(str)


# percentage of words changed in sentence
data_sample['ga_percent_change'] = 0.0
data_sample['ga_percent_change'] = data_sample['ga_percent_change'].astype(float)



data_sample.head()

Unnamed: 0,sentiment,text,probs,ga_sentiment,ga_text,ga_probs,ga_num_changes,ga_lev_ratio,ga_flipped_sentiment,ga_percent_change
0,1,over the years i've come to be a fan of direc...,0.846156,1,,0.846156,0,0.0,N,0.0
1,1,ladies and gentlemen we've really got ourselv...,0.058191,1,,0.058191,0,0.0,N,0.0
2,0,i have to admit i did not finish this movie be...,0.003571,0,,0.003571,0,0.0,N,0.0
3,1,this review contains some small yet significa...,0.973612,1,,0.973612,0,0.0,N,0.0
4,0,this kiyoshi kurosawa ghost movie is pretty wi...,0.015457,0,,0.015457,0,0.0,N,0.0


## check the predictions are correct, if not then drop row from data set
### we only want to keep correctly classified data items

In [1996]:
drop_indexes = []

for i in data_sample.index:
    if data_sample.iloc[i]['sentiment'] == 1 and data_sample.iloc[i]['probs'] > 0.5:
        continue
    if data_sample.iloc[i]['sentiment'] == 0 and data_sample.iloc[i]['probs'] <= 0.5:
        continue
    else:
        drop_indexes.append(i)
    

In [1997]:
data_sample = data_sample.drop(drop_indexes)
data_sample = data_sample.reset_index(drop=True) # reindex dataframe to start from 0

# check how many rows we dropped due to incorrect classification
print("Number of data items dropped from sample: ",(SAMPLE_SIZE - len(data_sample)))
print("Number of data items kept in sample: ",(len(data_sample)))

Number of data items dropped from sample:  52
Number of data items kept in sample:  1048


### GA functions 

In [1998]:
from random import randrange
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 

def prediction_probability(model, text_review):
    '''return probability of this string being a positive sentiment'''
    return model.predict([text_review])


def crossover(parent_one, parent_two):
    p_one = parent_one.split()
    p_two = parent_two.split()
    
    new_offspring = p_one.copy()
    text_len = min(len(new_offspring), len(p_two))
    # use random univform distribution to select which words to replace 
    # when creating the new offspring for our two parent strings
    for i in range(text_len):
        if np.random.uniform() < 0.5:
            new_offspring[i] = p_two[i]
    return ' '.join(new_offspring)
    
    
def mutation(model, text_review, current_prediction, target_label, max_perturbations, max_neighbours, saved_vocab):
    '''returns the string after swapping max_perturbations nearest neighbours
    of each string'''
    
    # keep track of list index of the which word we have already changed
    selected_index = []
    
    #split string so we can iterate over each word
    t_split = text_review.split()
    
    # select a random index value
    indx = randrange(len(t_split))
    
    for i in range(max_perturbations):
        # get a random index number for word list
        indx = randrange(len(t_split))
        
        found_in_vocab = False
        # skip over all stop words and any indexes we have already selected
        while t_split[indx] in stop_words or indx in selected_index  or not found_in_vocab:     
            
            indx = randrange(len(t_split))
            # if word is not found in vocabulary, skip it and try next word
            try:
                target_word = saved_word_index[t_split[indx]]
                found_in_vocab = True
            except KeyError:
                found_in_vocab = False
        
        # now we have a word that is not a stop word and has not already been selected
        selected_index.append(indx) # add to our list
        
        
        ### FITNESS TEST #########
        # we want to now get a list of the closest max_neighbours synonyms
        target_word = saved_word_index[t_split[indx]]
     
        nearest_neighbour, _ = nn.closest_neighbours(target_word, distance_matrix, number_of_words_to_return=max_neighbours)
        
        # create a list of the closest words returned
        closest_word = [saved_vocab[x] for x in nearest_neighbour]
        
        # now we need to substitute each word and find the new probability after each substitution
        # we need the original label and the target label we are aiming for
       
        original_word = t_split[indx]
        word_prob_dict = dict()
        for w in closest_word:
            if not w: # if we have an empty string then do nothing, we don't want to remove a word from the string
                continue
            t_split[indx] = w
            word_prob_dict[w] = model.predict([' '.join(t_split)])
            
        # didn't find any suitable words
        if len(word_prob_dict) == 0:
            continue
        
        # the word we decided to substitute is based on the probability returned by the model
        # if we have target_label == 1 then we are trying to go from negative to positive sentiment
        # therefore we want to keep the highest probability returned
        # NB the probability returned by our model is the probability that the review is positive, higher values == more positive sentiment
        if target_label == 1:
            sub_word = max(word_prob_dict, key=word_prob_dict.get)
            t_split[indx] = sub_word
            
        # if our target_label == 0 i.e. negative, then we are going from positive to negative
        # hence we want to keep only the lowest value
        else:
            sub_word = min(word_prob_dict, key=word_prob_dict.get)
            t_split[indx] = sub_word
        
        ### END FITNESS TEST ##########
        
        # add selected index to selected_index list
        selected_index.append(indx)

    return ' '.join(t_split)
    

def generate_population(model, text_review, population_size, current_prediction, target_label, max_perturbations, max_neighbours, saved_vocab):
    
    '''return list of strings of size population_size'''
    
    population = []
    
    for i in range(population_size):
        t = mutation(model, text_review, current_prediction, target_label, max_perturbations, max_neighbours, saved_vocab)
        population.append(t)
    return population



In [None]:
import Levenshtein as lev

POPULATION_SIZE = 50 # max population size to create
MAXIMUM_ITERATIONS = 25 # stopping condition for loop if we do not find an optimal solution
MAX_PERTURBATIONS = 5 # maximum number of changes to make for each population member
MAX_NEIGHBOURS = 4 # maximum number of neighbouring words to return and check against



def population_probs_df(population_list, model):
    # dataframe to store population text and probabilities
    population_probs = pd.DataFrame(columns=['ga_text','ga_probs'])
    

    # make sure columns have the correct types
    population_probs['ga_text'] = population_probs['ga_text'].astype(str)
    population_probs['ga_probs'] = population_probs['ga_probs'].astype(float)
    
    for i in range(len(population_list)):
        new_row = {'ga_text':population_list[i], 'ga_probs':model.predict([population_list[i]])}
        #append row to the dataframe
        population_probs = population_probs.append(new_row, ignore_index=True)
    
    # sort the array by probabilities column, i.e column 2 
    return  population_probs.sort_values('ga_probs') # return sorted df, sorted by probability lowest to highest
    

def found_solution(target_label, population_df):
    
    if target_label == 1:
        if population_df.iloc[-1]['ga_probs'] > 0.5:
            return True
    
    if target_label == 0:
        if population_df.iloc[0]['ga_probs'] <= 0.5:
            return True 
    return False


def number_of_changes_made(review_before, review_after):
    word_count = 0
    r_before = review_before.split()
    r_after = review_after.split()
    for i in range(len(r_before)):
        if r_before[i] == r_after[i]:
            pass
        else:
            word_count += 1
    return word_count

data_sample_len = len(data_sample)
for i in range (data_sample_len):
    
    print("####### Data Item: ",i+1," #######")
    target_label = 0 if data_sample.iloc[i]['sentiment'] == 1 else 1
    current_prediction = data_sample.iloc[i]['probs']
    
    p = generate_population(saved_model, data_sample.iloc[i]['text'], POPULATION_SIZE, current_prediction, target_label, MAX_PERTURBATIONS, MAX_NEIGHBOURS, saved_vocab)
    
    population_dataframe = population_probs_df(p, saved_model)

    
    ## GA Attack START
    # need to run through and do crossover and mutation, recheck the label and if it has flipped then we stop, other wise keep going
    # also on each iteration keep updating the ga_text and ga_prob and if label has changed update the ga_sentiment column and stop
    for j in range(MAXIMUM_ITERATIONS):
        
        
        # first check if we have found a solution, if yes then we are done so save results and break
        # and move onto next
        if found_solution(target_label, population_dataframe):
            #save solution
            if target_label == 1:   
                data_sample.at[i, "ga_text"] = population_dataframe.iloc[-1]['ga_text']
                data_sample.at[i, "ga_probs"] = population_dataframe.iloc[-1]['ga_probs']
                data_sample.at[i, "ga_sentiment"] = 1
                break
            if target_label == 0:
                data_sample.at[i, "ga_text"] = population_dataframe.iloc[0]['ga_text']
                data_sample.at[i, "ga_probs"] = population_dataframe.iloc[0]['ga_probs']
                data_sample.at[i, "ga_sentiment"] = 0
                break
        
        # limit max percentage change to 20% of each review
        if round(1 - (lev.ratio(data_sample.at[0,'text'],data_sample.at[0,'ga_text'])), 2) > 0.20:
            data_sample.at[i, "ga_text"] = population_dataframe.iloc[-1]['ga_text']
            data_sample.at[i, "ga_probs"] = population_dataframe.iloc[-1]['ga_probs']
            break
            
        
        if target_label == 1:
            parent_one = population_dataframe.iloc[-1]['ga_text']
            parent_two = population_dataframe.iloc[-2]['ga_text']
        else: 
            parent_one = population_dataframe.iloc[0]['ga_text']
            parent_two = population_dataframe.iloc[1]['ga_text']
        
        
        if target_label == 1 and (data_sample.iloc[i]['ga_probs'] < population_dataframe.iloc[-1]['ga_probs']):
            data_sample.at[i, "ga_text"] = population_dataframe.iloc[-1]['ga_text']
            data_sample.at[i, "ga_probs"] = population_dataframe.iloc[-1]['ga_probs']
            
        if target_label == 0 and (data_sample.iloc[i]['ga_probs'] > population_dataframe.iloc[0]['ga_probs']):
            data_sample.at[i, "ga_text"]  = population_dataframe.iloc[0]['ga_text']
            data_sample.at[i, "ga_probs"] = population_dataframe.iloc[0]['ga_probs']
        
        # 
        t = crossover(parent_one, parent_two)
        
        # we didn't find a solution yet, so we do crossover and generate a new population of possible solutions
        p = generate_population(saved_model, t, POPULATION_SIZE, current_prediction, target_label, MAX_PERTURBATIONS, MAX_NEIGHBOURS, saved_vocab)
        population_dataframe = population_probs_df(p, saved_model)
    print("\tNumber of words swapped: ", number_of_changes_made(data_sample.at[i, "text"], data_sample.at[i, "ga_text"]))
    print("\tProb. before and after: ", data_sample.at[i, "probs"]," : ", data_sample.at[i, "ga_probs"])

    
       


####### Data Item:  1  #######
	Number of words swapped:  5
	Prob. before and after:  0.8461564183235168  :  0.4633117616176605
####### Data Item:  2  #######
	Number of words swapped:  25
	Prob. before and after:  0.0035706618800759315  :  0.7263316512107849
####### Data Item:  3  #######
	Number of words swapped:  18
	Prob. before and after:  0.9736121296882629  :  0.3686012327671051
####### Data Item:  4  #######
	Number of words swapped:  11
	Prob. before and after:  0.015457245521247387  :  0.5674644708633423
####### Data Item:  5  #######
	Number of words swapped:  19
	Prob. before and after:  0.9906110167503357  :  0.3801567852497101
####### Data Item:  6  #######
	Number of words swapped:  12
	Prob. before and after:  0.9230806231498718  :  0.3661811649799347
####### Data Item:  7  #######
	Number of words swapped:  24
	Prob. before and after:  0.012578213587403297  :  0.5830695033073425
####### Data Item:  8  #######
	Number of words swapped:  10
	Prob. before and after:  0.04

### save the results after running GA Attack

In [1990]:
# save the final set of results
data_sample.to_csv('imdb_dataset/ga_results.csv', index = False)

# load saved file, so we can remove any results that were not processed due to Jupyter notebook crash
dtypes = {'sentiment': 'int', 
          'text': 'str', 
          'probs': 'float', 
          'ga_sentiment': 'int', 
          'ga_text': 'str',
          'ga_probs': 'float', 
          'ga_num_changes': 'int', 
          'ga_lev_ratio': 'float', 
          'ga_flipped_sentiment': 'str',
         'ga_percent_change': 'float'}

data_sample = pd.read_csv('imdb_dataset/ga_results.csv', dtype=dtypes)

# drop any rows with nan value - i.e. data items not processed due to Jupyter notebook crash
# so we don't have to re-run the whole GA Attack again
data_sample = data_sample.dropna()

# save the final set of results
data_sample.to_csv('imdb_dataset/ga_results.csv', index = False)


### number of test data items processed during GA Attack

In [1915]:
print("Number of data items processed in GA Attack: ", len(data_sample))

Number of data items processed in GA Attack:  616


In [1916]:
data_sample.head()

Unnamed: 0,sentiment,text,probs,ga_sentiment,ga_text,ga_probs,ga_num_changes,ga_lev_ratio,ga_flipped_sentiment
0,1,it is first and foremost a chick flick it is ...,0.917392,0,it is first and foremost a chick flick it is a...,0.316359,0,0.0,N
1,1,i'm guessing that we all no matter if we are ...,0.623475,0,i'm guessing that we all no matter if we are f...,0.368994,0,0.0,N
2,0,today i had a real craving for a scifi movie a...,0.004375,1,today i had a veritable craving for a scifi mo...,0.594624,0,0.0,N
3,1,the kennel murder case starts off at a run and...,0.973507,0,the pounds murder lawsuit starts off at a run ...,0.131199,0,0.0,N
4,1,one reason pixar has endured so well and been...,0.985845,0,one reason pixar has experimented so well and ...,0.262389,0,0.0,N


### calculate Levenshtein ratio statistics for before and after GA Attack

In [1917]:
import Levenshtein as lev

# calculate Levenshtein ratio for text and ga_text
# text == initial review text, ga_text == review text after GA Attack
# value closer to 1.0 indicates more similarity, i.e. less changes made to original text
for i in range (len(data_sample)):
    data_sample.at[i,'ga_lev_ratio'] = lev.ratio(data_sample.at[i,'text'],data_sample.at[i,'ga_text'])
    

data_sample.head()


Unnamed: 0,sentiment,text,probs,ga_sentiment,ga_text,ga_probs,ga_num_changes,ga_lev_ratio,ga_flipped_sentiment
0,1,it is first and foremost a chick flick it is ...,0.917392,0,it is first and foremost a chick flick it is a...,0.316359,0,0.915202,N
1,1,i'm guessing that we all no matter if we are ...,0.623475,0,i'm guessing that we all no matter if we are f...,0.368994,0,0.9775,N
2,0,today i had a real craving for a scifi movie a...,0.004375,1,today i had a veritable craving for a scifi mo...,0.594624,0,0.925351,N
3,1,the kennel murder case starts off at a run and...,0.973507,0,the pounds murder lawsuit starts off at a run ...,0.131199,0,0.879293,N
4,1,one reason pixar has endured so well and been...,0.985845,0,one reason pixar has experimented so well and ...,0.262389,0,0.9633,N


### check which review we successfully flipped the sentiment and set ```ga_flipped_sentiment``` column

In [1945]:
flipped_count = 0
for i in range(len(data_sample)):
    if  data_sample.at[i,'sentiment'] != data_sample.at[i,'ga_sentiment']:
        data_sample.at[i,'ga_flipped_sentiment'] = 'Y'
        flipped_count += 1
print("Percentage of reviews where sentiment was changed after attack: ", round(flipped_count/len(data_sample) * 100, 2), "%")

Percentage of reviews where sentiment was changed after attack:  99.84 %


### make sure all reviews are of the same length before and after GA Attack
i.e. make sure we didn't remove any words

In [1920]:
len_diff_count = 0
# check to make sure no words were completely removed form reviews
for i in range(len(data_sample)):
    if  len(data_sample.at[i,'text'].split()) != len(data_sample.at[i,'ga_text'].split()):
        len_diff_count += 1
print(len_diff_count)

0


### count how many words were changed for each review, average number of word changed and percentage of change to review

In [1963]:
word_total_count = 0
percent_modified_total = 0.0
for i in range(len(data_sample)):
    review_before = data_sample.at[i,'text'].split()
    review_after = data_sample.at[i,'ga_text'].split()
    word_count = 0
    for j in range(len(review_before)):
        if review_before[j] == review_after[j]:
            pass
        else:
            word_count += 1
    data_sample.at[i, 'ga_num_changes'] = word_count
    percent_modified_total += (word_count / len(review_before))
    data_sample.at[i, 'ga_percent_change'] = round(word_count / len(review_before), 2)
    word_total_count += word_count
            
data_sample.head()

print("Avg. num of words changed changes made: ", (int)(word_total_count / len(data_sample)))
print("Avg. percentage modified: ", (round(percent_modified_total / len(data_sample), 2) * 100),"%")
      

Avg. num of words changed changes made:  18
Avg. percentage modified:  10.0 %


In [1978]:
data_sample.head(20)
if round(1 - (lev.ratio(data_sample.at[1,'text'],data_sample.at[1,'ga_text'])), 2) > 0.01:
    print("over")

over


In [1988]:
over_20_percent = 1

for i in range(len(data_sample)):
    if data_sample.at[i,'ga_percent_change'] > 0.20:
        over_20_percent += 1
print(over_20_percent)

print((len(data_sample) - over_20_percent)/len(data_sample))

48
0.922077922077922


In [1989]:
data_sample.head()

Unnamed: 0,sentiment,text,probs,ga_sentiment,ga_text,ga_probs,ga_num_changes,ga_lev_ratio,ga_flipped_sentiment,ga_percent_change
0,1,it is first and foremost a chick flick it is ...,0.917392,0,it is first and foremost a chick flick it is a...,0.316359,11,0.915202,Y,0.09
1,1,i'm guessing that we all no matter if we are ...,0.623475,0,i'm guessing that we all no matter if we are f...,0.368994,12,0.9775,Y,0.01
2,0,today i had a real craving for a scifi movie a...,0.004375,1,today i had a veritable craving for a scifi mo...,0.594624,25,0.925351,Y,0.1
3,1,the kennel murder case starts off at a run and...,0.973507,0,the pounds murder lawsuit starts off at a run ...,0.131199,17,0.879293,Y,0.19
4,1,one reason pixar has endured so well and been...,0.985845,0,one reason pixar has experimented so well and ...,0.262389,30,0.9633,Y,0.05
