In [43]:
import nltk
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from spamassassin_client import SpamAssassin
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet as wn
from util import run_sa, evaluate

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# Read the spam-indicative vocabulary (one entry per line of the file) into a
# set so the attack functions can do fast membership checks on tokens.
with open('spammy_words.txt', 'r') as word_file:
    spammy_lines = word_file.read().splitlines()
spammy_words = set(spammy_lines)
len(spammy_words)

232

In [45]:
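# Default to the first synset of the word (ordered by popularity); this won't guarantee correct usage in context, but should be good enough.
# Could BERT be used for this as well, e.g. with a "A synonym for ___ is ___" prompt?
# TODO(review): the original note here was cut off ("get nex") - possibly "get next synset/lemma when the first is unusable"; confirm with the author.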
def replace_word(word):
    """Look up a WordNet synonym for ``word``.

    Only the first synset returned by ``wn.synsets(word)`` is consulted. The
    first lemma of that synset whose name differs from ``word`` (compared
    case-insensitively) is used, with underscores replaced by spaces.

    Args:
        word: Token to find a synonym for.

    Returns:
        A ``(success, replacement)`` tuple: ``(True, synonym)`` when a
        differing lemma exists, otherwise ``(False, word)`` with the token
        returned unchanged.

    Raises:
        Any error raised by the WordNet lookup itself (for example a missing
        corpus) is propagated instead of being reported as "no synonym", so
        that a broken setup does not silently turn the attack into a no-op.
    """
    synsets = wn.synsets(word)
    if not synsets:
        # Unknown to WordNet: nothing to substitute.
        return False, word

    lowered = word.lower()
    for lemma in synsets[0].lemmas():
        candidate = lemma.name()
        if candidate.lower() != lowered:
            # Multi-word lemmas are joined with "_" in WordNet names.
            return True, candidate.replace("_", " ")

    # Every lemma of the first synset is the word itself.
    return False, word

In [46]:
def poison_word(word):
    """Obfuscate ``word`` by inserting ``..`` between each of its characters.

    Example: ``"secret"`` becomes ``"s..e..c..r..e..t"``. Empty and
    single-character inputs are returned unchanged.
    """
    separator = ".."
    characters = [ch for ch in word]
    return separator.join(characters)

In [47]:
def get_subj_body_tokens(header, body):
    """Tokenize the subject line and the body of an email.

    The first line of ``header`` must contain ``Subject:``; everything after
    that marker on the line is the subject text. All remaining header lines
    are returned untouched so the email can be rebuilt later.

    Args:
        header: Header section of the email, subject line first.
        body: Body section of the email.

    Returns:
        ``(subj_tokens, body_tokens, header_leftovers)`` where the token lists
        come from ``nltk.word_tokenize`` applied to the stripped text, and
        ``header_leftovers`` is the header text after the first line (an empty
        string when the header consists of the subject line only).

    Raises:
        ValueError: If the first header line does not contain ``Subject:``.
    """
    # partition() yields an empty remainder for a one-line header, where
    # split("\n", 1)[1] would raise IndexError.
    subj, _, header_leftovers = header.partition("\n")

    _, marker, subj_content = subj.partition("Subject:")
    if not marker:
        raise ValueError(
            "first header line must contain 'Subject:', got: %r" % subj
        )

    subj_tokens = nltk.word_tokenize(subj_content.strip())
    body_tokens = nltk.word_tokenize(body.strip())

    return subj_tokens, body_tokens, header_leftovers

In [48]:
def reconsruct_email(subj_tokens, header_leftovers, body_tokens):
    """Rebuild an email string from tokenized subject/body and raw headers.

    The subject and body token lists are detokenized with
    ``TreebankWordDetokenizer`` and stripped. The result is
    ``"Subject: <subject>\\n<header_leftovers>"`` (stripped of surrounding
    whitespace), a blank line, then the body text.
    """
    detokenizer = TreebankWordDetokenizer()
    subject_text = detokenizer.detokenize(subj_tokens).strip()
    body_text = detokenizer.detokenize(body_tokens).strip()

    # strip() drops the dangling newline when there are no leftover headers.
    header_block = "\n".join((subject_text, header_leftovers)).strip()
    return "\n\n".join(("Subject: " + header_block, body_text))

In [49]:
def _synonym_swap_tokens(tokens, fallback_poison):
    """Swap every token found in ``spammy_words`` for a synonym.

    Returns ``(new_tokens, repl, num_successful)`` where ``repl`` holds one
    ``(original, replacement, success)`` tuple per spammy token encountered.
    """
    new_tokens = []
    repl = []
    num_successful = 0

    for token in tokens:
        if token not in spammy_words:
            new_tokens.append(token)
            continue

        success, w_new = replace_word(token)
        if success:
            num_successful += 1
        elif fallback_poison:
            # No synonym available: obfuscate the word instead.
            w_new = poison_word(token)

        repl.append((token, w_new, success))
        new_tokens.append(w_new)

    return new_tokens, repl, num_successful


def synonym_replace_attack(email, fallback_poison=False):
    """Replace spammy words in an email's subject and body with synonyms.

    The email is split into header and body at the first blank line. Tokens
    of the subject and body that appear in ``spammy_words`` are passed to
    ``replace_word``; the email is then rebuilt with ``reconsruct_email``.

    Args:
        email: Raw email text (headers, blank line, body). An email without a
            blank-line separator is treated as having an empty body.
        fallback_poison: When True, spammy words for which no synonym is found
            are obfuscated with ``poison_word`` instead of being left as-is.

    Returns:
        ``(num_potential, num_successful, repl, new_email)``:
        the number of spammy tokens seen, the number replaced by a synonym,
        a list of ``(original, replacement, success)`` tuples (subject tokens
        first, then body tokens), and the rebuilt email text.
    """
    # partition() leaves the body empty when there is no "\n\n" separator,
    # where split("\n\n", 1)[1] would raise IndexError.
    header, _, body = email.partition("\n\n")
    subj_tokens, body_tokens, header_leftovers = get_subj_body_tokens(
        header.strip(), body.strip()
    )

    new_subj_tokens, subj_repl, subj_successful = _synonym_swap_tokens(
        subj_tokens, fallback_poison
    )
    new_body_tokens, body_repl, body_successful = _synonym_swap_tokens(
        body_tokens, fallback_poison
    )

    repl = subj_repl + body_repl
    num_potential = len(repl)  # one entry is recorded per spammy token seen
    num_successful = subj_successful + body_successful

    new_email = reconsruct_email(new_subj_tokens, header_leftovers, new_body_tokens)
    return num_potential, num_successful, repl, new_email

In [50]:
def _poison_spammy_tokens(tokens):
    """Obfuscate every token found in ``spammy_words`` with ``poison_word``.

    Returns ``(new_tokens, repl)`` where ``repl`` holds one
    ``(original, poisoned, True)`` tuple per spammy token encountered.
    """
    new_tokens = []
    repl = []

    for token in tokens:
        if token in spammy_words:
            poisoned = poison_word(token)
            repl.append((token, poisoned, True))
            new_tokens.append(poisoned)
        else:
            new_tokens.append(token)

    return new_tokens, repl


def poisoning_attack(email):
    """Obfuscate spammy words in an email's subject and body.

    The email is split into header and body at the first blank line. Tokens
    of the subject and body that appear in ``spammy_words`` are rewritten
    with ``poison_word``; the email is then rebuilt with ``reconsruct_email``.

    Args:
        email: Raw email text (headers, blank line, body). An email without a
            blank-line separator is treated as having an empty body.

    Returns:
        ``(num_replacements, repl, new_email)``: the number of tokens
        poisoned, a list of ``(original, poisoned, True)`` tuples (subject
        tokens first, then body tokens), and the rebuilt email text.
    """
    # partition() leaves the body empty when there is no "\n\n" separator,
    # where split("\n\n", 1)[1] would raise IndexError.
    header, _, body = email.partition("\n\n")
    subj_tokens, body_tokens, header_leftovers = get_subj_body_tokens(
        header.strip(), body.strip()
    )

    new_subj_tokens, subj_repl = _poison_spammy_tokens(subj_tokens)
    new_body_tokens, body_repl = _poison_spammy_tokens(body_tokens)

    repl = subj_repl + body_repl
    num_replacements = len(repl)  # every recorded entry is one poisoned token

    new_email = reconsruct_email(new_subj_tokens, header_leftovers, new_body_tokens)
    return num_replacements, repl, new_email

In [51]:
# Load the spam corpus; the 'text' column holds the raw emails passed to the
# attack functions and the 'label' column is paired with them for evaluation.
good_spam_df = pd.read_csv('data/goodSpam/goodSpam.csv')
good_spam_text, good_spam_labels = good_spam_df['text'], good_spam_df['label']

In [52]:
# Run the three attack variants over every email in the corpus.
# Each list element is the full tuple returned by the attack function.
syn_repl_att_ret = list(map(synonym_replace_attack, good_spam_text))
poison_att_ret = list(map(poisoning_attack, good_spam_text))
both_att_ret = [
    synonym_replace_attack(email, fallback_poison=True)
    for email in good_spam_text
]

In [53]:
# Keep only the rewritten email text, which is the last element of each
# attack result (4-tuples for the synonym attacks, 3-tuples for poisoning).
syn_repl_text = [new_email for _, _, _, new_email in syn_repl_att_ret]
poison_text = [new_email for _, _, new_email in poison_att_ret]
both_att_text = [new_email for _, _, _, new_email in both_att_ret]

In [56]:
# Spot-check one attacked email (synonym replacement with poisoning fallback).
sample_index = 5
print(both_att_text[sample_index])

Subject: yakubu lawal @ international trust bank plc.
Message-ID: <GTUBE1.1010101@example.net>
Date: Wed, 23 Jul 2003 23:30:00 +0200
From: Sender <sender@example.net>
To: Recipient <recipient@example.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

dr yakubu lawal, head, operations manager, international trust bank plc . plot 84 / 88, ajose adeogun street, victoria island, lagos Federal Republic of Nigeria . attention: confidential letterfirst, i must solicit your confidence in thistransaction, this is by virtue of its nature as beingutterly confidential and top s..e..c..r..e..t . though i knowthat a transaction of this magnitude will make any oneapprehensive and worried, but i am assuring you thatall will be well at the end of the transaction . for the purpose of introduction, i am dr yakubu lawal, head, operations manager, international trust bank plc, in charge of personal and corporate accountsabove us $5, 000, 000 . 00 (five millio

In [54]:
# Baseline: score the unmodified emails against their labels.
baseline_pairs = zip(good_spam_text, good_spam_labels)
_, pred, scores = evaluate(baseline_pairs)
# Mean of the per-email scores returned by evaluate().
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           1       1.00      1.00      1.00       160

    accuracy                           1.00       160
   macro avg       1.00      1.00      1.00       160
weighted avg       1.00      1.00      1.00       160

[[160]]
AVG Score: 5.527500000000001


In [None]:
# Poisoning attack: score the emails whose spammy words were obfuscated.
# NOTE(review): the recorded output for this cell reports support 268 while the
# baseline cell reports 160 - the output looks stale; re-run after Restart & Run All.
poison_pairs = zip(poison_text, good_spam_labels)
_, pred, scores = evaluate(poison_pairs)
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.83      0.91       268

    accuracy                           0.83       268
   macro avg       0.50      0.41      0.45       268
weighted avg       1.00      0.83      0.91       268

[[  0   0]
 [ 46 222]]
AVG Score: 4.9619402985074625


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Synonym-replacement attack: score the emails with spammy words swapped for synonyms.
syn_repl_pairs = zip(syn_repl_text, good_spam_labels)
_, pred, scores = evaluate(syn_repl_pairs)
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.82      0.90       268

    accuracy                           0.82       268
   macro avg       0.50      0.41      0.45       268
weighted avg       1.00      0.82      0.90       268

[[  0   0]
 [ 48 220]]
AVG Score: 5.068283582089553


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Combined attack: synonym replacement, falling back to poisoning when no synonym is found.
both_att_pairs = zip(both_att_text, good_spam_labels)
_, pred, scores = evaluate(both_att_pairs)
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.80      0.89       268

    accuracy                           0.80       268
   macro avg       0.50      0.40      0.45       268
weighted avg       1.00      0.80      0.89       268

[[  0   0]
 [ 53 215]]
AVG Score: 4.888059701492537


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
