In [139]:
import nltk
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from spamassassin_client import SpamAssassin
from nltk.tokenize.treebank import TreebankWordDetokenizer
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickhansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [152]:
df = pd.read_csv('data/spam_ham_dataset.csv')
spam = df[df['label'] == 'spam']
ham = df[df['label'] == 'ham']
print(spam.head(2))
print(ham.head(2))

   Unnamed: 0 label                                               text  \
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
7        4185  spam  Subject: looking for medication ? we ` re the ...   

   label_num  
3          1  
7          1  
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   

   label_num  
0          0  
1          0  


In [155]:
spam_text = spam['text'].values
ham_text = ham['text'].values
len(spam_text), len(ham_text)

(1499, 3672)

In [146]:
spam_ds = [(text, 'spam') for text in spam_text]
ham_ds = [(text, 'ham') for text in ham_text]
combined_ds = spam_ds + ham_ds

import random
random.shuffle(combined_ds)

combined_ds[:2]

[('Subject: finding email addresses : advice\n \n there seem to be pretty frequent requests for email addresses on the linguist list . people with unix accounts can use the " finger " command to look up addresses - - if they know the name and affiliation of the person they \' re looking for ( others may have access to some sort of finger utility ) . for example , there was a recent request for the internet address of someone whose bitnet address is escatton @ albnyvms here \'s how i used finger to find his internet address : finger escatton @ albnyvms finger : albnyvms : unknown host finger escatton @ albnyvms . edu finger : albnyvms . edu : unknown host well , albnyvms does n\'t sound very internet-like , so let \'s start guessing : finger scatton @ albany . edu [ albany . edu ] ( there is no account scatton on this node . ) name : ernest scatton title : professor , german and slavic languages + literat address : humanities 246 university at albany 1400 washington av albany ny 12222-0

In [156]:
STOPWORDS = nltk.corpus.stopwords.words('english')

vocab = set()
for text, label in combined_ds:
    for word in nltk.word_tokenize(text):
        w_l = word.lower()
        if w_l.isalpha():
            vocab.add(w_l)

In [123]:
def get_features(text):
    rec_words = [w.lower() for w in nltk.word_tokenize(text)]
    features = {}
    for w in rec_words:
        if w in vocab:
            features[w] = True
    return features

In [124]:
combined = [(get_features(text), label) for text, label in combined_ds]
splitPoint = len(combined) // 3
train, test = combined[splitPoint:], combined[:splitPoint]
len(train), len(test)

(3448, 1723)

In [125]:
# NLTK's built-in implementation of the Naive Bayes classifier is trained
classifier = nltk.NaiveBayesClassifier.train(train)

# now, it is tested on the test set and the accuracy reported
print("Accuracy: ", nltk.classify.accuracy(classifier, test)) #nltk.classify

Accuracy:  0.93905977945444


In [126]:
def show_most_informative_features_in_list(classifier, n=10):
    """
    Return a nested list of the "most informative" features 
    used by the classifier along with it's predominant labels
    """
    cpdist = classifier._feature_probdist       # probability distribution for feature values given labels
    feature_list = []
    for (fname, fval) in classifier.most_informative_features(n):
        def labelprob(l):
            return cpdist[l, fname].prob(fval)
        labels = sorted([l for l in classifier._labels if fval in cpdist[l, fname].samples()], 
                        key=labelprob)
        feature_list.append([fname, labels[-1]])
    return feature_list

In [134]:
spam_features = [x for x in show_most_informative_features_in_list(classifier, n=500) if x[1] == 'spam']
len(spam_features)

417

In [150]:
spam_features

[['prescription', 'spam'],
 ['pain', 'spam'],
 ['stocks', 'spam'],
 ['sex', 'spam'],
 ['spam', 'spam'],
 ['popular', 'spam'],
 ['creative', 'spam'],
 ['advisor', 'spam'],
 ['adobe', 'spam'],
 ['conflict', 'spam'],
 ['ibm', 'spam'],
 ['unique', 'spam'],
 ['congress', 'spam'],
 ['deciding', 'spam'],
 ['pertaining', 'spam'],
 ['epson', 'spam'],
 ['sexual', 'spam'],
 ['sony', 'spam'],
 ['complaints', 'spam'],
 ['foresee', 'spam'],
 ['cheap', 'spam'],
 ['super', 'spam'],
 ['cisco', 'spam'],
 ['draw', 'spam'],
 ['target', 'spam'],
 ['generic', 'spam'],
 ['publisher', 'spam'],
 ['symbol', 'spam'],
 ['health', 'spam'],
 ['advises', 'spam'],
 ['beliefs', 'spam'],
 ['doctors', 'spam'],
 ['risks', 'spam'],
 ['ad', 'spam'],
 ['affordable', 'spam'],
 ['proven', 'spam'],
 ['shareholder', 'spam'],
 ['advanced', 'spam'],
 ['anxiety', 'spam'],
 ['shares', 'spam'],
 ['ali', 'spam'],
 ['solicitation', 'spam'],
 ['women', 'spam'],
 ['charset', 'spam'],
 ['effects', 'spam'],
 ['mix', 'spam'],
 ['penny', 's

In [12]:
spam_freqs = get_word_freqs(spam_text)
# total_spam = sum(spam_freqs.values())
ham_freqs = get_word_freqs(ham_text)
# total_ham = sum(ham_freqs.values())




In [48]:
spam_bigram_freqs = get_bigram_freqs(spam_text)
ham_bigram_freqs = get_bigram_freqs(ham_text)
spam_bigram_freqs.subtract(ham_bigram_freqs)
print(spam_bigram_freqs.most_common(15))

[(('?', '?'), 2386), ((':', '/'), 749), (('/', '/'), 747), (('http', ':'), 725), (('=', '3'), 462), (('.', '00'), 455), (('www', '.'), 411), (('nbsp', ';'), 378), (('=', 'http'), 333), (('com', '/'), 317), (('width', '='), 304), (('/', 'www'), 299), (('height', '='), 296), (('the', 'company'), 285), ((';', 'nbsp'), 285)]


In [14]:
diff = {k: v - (ham_freqs.get(k, 0)) for k, v in spam_freqs.items()}
most_spammy_words = sorted(diff.items(), key=lambda x: x[1], reverse=True)

In [25]:
with open("data/spam_working_with.txt", 'r') as f:
    raw_text = f.read().split("\n\n")
    header, body = raw_text[0], raw_text[1]
    body = nltk.word_tokenize(body)
    poisoned_text = [w if w.lower() not in set(most_spammy_words[:200]) else poison_word(w)
                 for w in body]
    poisoned_text = TreebankWordDetokenizer().detokenize(body)
    full_text = header + "\n\n" + poisoned_text
    
print(full_text)

# spam_text = spam_text.lower()
full_bytes = full_text.encode('utf-8')
assassin = SpamAssassin(full_bytes)
print(assassin.get_fulltext())

Subject: A CRY FOR HELP
Message-ID: <GTUBE1.1010101@example.net>
Date: Wed, 23 Jul 2003 23:30:00 +0200
From: Sender <sender@example.net>
To: Recipient <recipient@example.net>
Precedence: junk
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

DEAR FRIEND, I AM MRS. SESE-SEKO WIDOW OF LATE PRESIDENT MOBUTU SESE-SEKO OF ZAIRE? NOW KNOWN AS DEMOCRATIC REPUBLIC OF CONGO (DRC). I AM MOVED TO WRITE YOU THIS LETTER, THIS WAS IN CONFIDENCE CONSIDERING MY PRESENTCIRCUMSTANCE AND SITUATION . I ESCAPED ALONG WITH MY HUSBAND AND TWO OF OUR SONS GEORGE KONGOLO AND BASHER OUT OF DEMOCRATIC REPUBLIC OF CONGO (DRC) TO ABIDJAN, COTE D'IVOIRE WHERE MY FAMILY AND I SETTLED, WHILE WE LATER MOVED TO SETTLED IN MORROCO WHERE MY HUSBAND LATER DIED OF CANCER DISEASE . HOWEVER DUE TO THIS SITUATION WE DECIDED TO CHANGED MOST OF MY HUSBAND'S BILLIONS OF DOLLARS DEPOSITED IN SWISS BANK AND OTHER COUNTRIES INTO OTHER FORMS OF MONEY CODED FOR SAFE PURPOSE BECAUSE THE NEW 