In [279]:
import pandas as pd
import nltk
import re
import numpy as np

# Preprocessing

In [280]:
def merge_cols(row, columns):
    merged = ''
    is_na = row.isna()
    for col in columns:
        if not is_na[col]:
            merged = merged + row[col] + ' '
    merged = merged[:-1]
    return merged

In [281]:
def tokenize(text):
    # Remove single periods, commas, apostrophes, quotes, and parentheses
    text = re.sub(r'(?<!\.)\.(?!\.)|,|\'|\"|\(|\)|‘|’|“|”', '', text)
    
    # Use nltk to split into tokens
    tokens = nltk.word_tokenize(text)
    
    # I'm leaving other punctuation since it can be an indicator of being spam, such as many exclamation marks, etc.
    # nltk tokenize should mostly split those punctuations into their own tokens
    
    # remove stopwords and numbers
    stopwords = set(nltk.corpus.stopwords.words('english'))
    filtered = [w for w in tokens if not ((w.lower() in stopwords) or (w.isnumeric()))]
    return filtered

In [282]:
def split(df):
    """
    Removes *standard* punctuation and stop words
    Concatenates all columns other than index
    Leaves capitalization, as that information could be used as another feature
    (e.g. number of all-caps words)
    :param df:
    :return: original dataframe, preprocessed dataframe
    """
    text_cols = df.columns[1:]
    out_df = pd.DataFrame(df.iloc[:, 0].values, columns=['class'])
    out_df['body'] = df.apply(lambda row: tokenize(merge_cols(row, text_cols)), axis='columns')
    return out_df

In [283]:
def frequencies(df):
    freqs = pd.DataFrame(index=pd.Index(data=[], name='token'), columns=['ham', 'spam'], dtype=int)
    for i, row in df.iterrows():
        for w in row['body']:
            word = w.lower()
            if word in freqs.index:
                freqs.loc[word, row['class']] += 1
            else:
                freqs.loc[word, row['class']] = 1
    freqs.fillna(0, inplace=True)
    return freqs

# (Multinomial) Naive Bayes Classifier

### For each word, use the probability that any given occurrence of that word is in a spam email or not in a spam email
### To classify an email as spam or not, we check the probabilities of each occurence of each word within its body

In [284]:
def learn_nbc(freqs):
    """
    Multinomial Naive Bayes Classifier
    :param freqs: 
    :return: Probabilities df, priors
    """
    probs = freqs.astype(float)
    # For each cell, add 1 and divide by (number of total occurrences in class + number of distinct tokens)
    for w in probs.index:
        for cl in probs.columns:
            probs.loc[w, cl] = (freqs.loc[w, cl] + 1) / (freqs[cl].sum() + freqs.shape[0])
    total_occ = freqs.values.sum()
    return probs, {cl: freqs[cl].sum() / total_occ for cl in ['ham', 'spam']}

In [285]:
def nbc_predict(row, probs_df, priors):
    words = row['body']
    log_probs = {cl: 0 for cl in priors.keys()}
    probs = {cl: 0 for cl in priors.keys()}
    for cl in probs_df.columns:
        for w in words:
            if w not in probs_df.index:
                # Does not affect probability in either direction
                continue
            log_probs[cl] += np.log(probs_df.loc[w.lower(), cl])
        log_probs[cl] += priors[cl]
        probs[cl] = np.exp(log_probs[cl])
    if probs['ham'] > probs['spam']:
        return 'ham'
    else:
        return 'spam'

In [286]:
def nbc_accuracy(test_df, probs_df, priors):
    prediction = test_df.apply(lambda row: nbc_predict(row, probs_df, priors), axis=1, result_type='expand')
    accuracy = 1 - prediction.compare(test_df['class']).shape[0] / test_df.shape[0]
    return accuracy

In [287]:
def nbc_test():
    df = split(pd.read_csv('data/spam.csv', encoding = 'ISO-8859-1'))
    train_df = df.sample(frac=0.8)
    test_df = df.drop(train_df.index)
    probs_df, priors = learn_nbc(frequencies(train_df))
    
    # Overall accuracy
    print('Overall Accuracy')
    train_accuracy = nbc_accuracy(train_df, probs_df, priors)
    test_accuracy = nbc_accuracy(test_df, probs_df, priors)
    print(f'Percentages of dataset: ham - {priors["ham"]:.2f}, spam - {priors["spam"]:.2f}')
    print(f'Training Accuracy: {train_accuracy:.2f}')
    print(f'Testing Accuracy: {test_accuracy:.2f}')
    print('------------------------------')
    # Ham Accuracy
    print('Ham Accuracy')
    train_ham = train_df[train_df['class'] == 'ham']
    test_ham = test_df[test_df['class'] == 'ham']
    train_ham_accuracy = nbc_accuracy(train_ham, probs_df, priors)
    test_ham_accuracy = nbc_accuracy(test_ham, probs_df, priors)
    print(f'Training Accuracy: {train_ham_accuracy:.2f}')
    print(f'Testing Accuracy: {test_ham_accuracy:.2f}')
    print('------------------------------')
    # Spam Accuracy
    print('Spam Accuracy')
    train_spam = train_df[train_df['class'] == 'spam']
    test_spam = test_df[test_df['class'] == 'spam']
    train_spam_accuracy = nbc_accuracy(train_spam, probs_df, priors)
    test_spam_accuracy = nbc_accuracy(test_spam, probs_df, priors)
    print(f'Training Accuracy: {train_spam_accuracy:.2f}')
    print(f'Testing Accuracy: {test_spam_accuracy:.2f}')
    print('------------------------------')

In [288]:
nbc_test()

Overall Accuracy
Percentages of dataset: ham - 0.84, spam - 0.16
Training Accuracy: 0.91
Testing Accuracy: 0.90
------------------------------
Ham Accuracy
Training Accuracy: 0.92
Testing Accuracy: 0.92
------------------------------
Spam Accuracy
Training Accuracy: 0.86
Testing Accuracy: 0.83
------------------------------


### For the problem of determining spam emails especially, it is good to know how our algorithm performs in false positives vs false negatives, as looking at the overall accuracy does not tell the full picture (especially since non-spam emails are the vast majority of the dataset)