In [1]:
import math
from collections import defaultdict

In [2]:
# Naive Bayes Code

class NaiveBayesClassifier:
    """
    Naive Bayes spam classifier over the presence/absence of tokens.

    Attributes:
        k: pseudocount used to smooth the per-class token probabilities.
        tokens: set of every token seen during training (the vocabulary).
        token_spam_counts: token -> number of spam messages containing it.
        token_ham_counts: token -> number of ham messages containing it.
        spam_messages / ham_messages: number of training messages per class.

    No class prior is applied: predict() weighs spam and ham equally and
    compares only the token likelihoods.
    """

    def __init__(self, k = 0.5):
        # k should be positive; with k = 0 a probability of exactly 0 or 1
        # can reach math.log() in predict() and raise ValueError.
        self.k = k

        self.tokens = set()
        self.token_spam_counts = defaultdict(int)
        self.token_ham_counts = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages):
        """
        Accumulate message and token counts from labelled messages.

        Each message must expose `.text` and `.is_spam`. Calling train()
        again adds to the existing counts rather than resetting them.
        """
        for message in messages:
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            # Increment word counts
            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1

    def _probabilities(self, token):
        """
        Returns P(token|spam) and P(token|ham)
        """
        # .get() instead of indexing: indexing a defaultdict would insert a
        # zero entry for every token that is merely looked up.
        spam = self.token_spam_counts.get(token, 0)
        ham = self.token_ham_counts.get(token, 0)

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text):
        """
        Return P(spam | text) as a float in [0, 1].

        Every vocabulary token contributes: its probability if it occurs in
        `text`, or one minus that probability if it does not. Tokens of
        `text` that were never seen in training are ignored.
        """
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Iterate through each word in vocabulary
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            # If token appears in message, add the log probability of seeing it
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            # Otherwise add the log probability of not seeing it which is log(1-probability of seeing it)
            else:
                log_prob_if_spam += math.log(1-prob_if_spam)
                log_prob_if_ham += math.log(1-prob_if_ham)

        # P(spam|text) = e^s / (e^s + e^h) = 1 / (1 + e^(h - s)).
        # Working with the difference of the log sums avoids exponentiating
        # each sum separately, which underflows to 0.0 (and then divides
        # 0 by 0) once the vocabulary has more than a few hundred tokens.
        log_odds = log_prob_if_spam - log_prob_if_ham
        if log_odds >= 0:
            # exp() argument is <= 0 here, so it cannot overflow.
            return 1.0 / (1.0 + math.exp(-log_odds))
        odds = math.exp(log_odds)
        return odds / (1.0 + odds)

In [3]:
import re

def tokenize(text):
    """
    Split `text` into its distinct lower-cased words.

    A word is a maximal run of ASCII letters, digits and apostrophes;
    everything else acts as a separator. Returns a set, so repeated words
    collapse to a single entry.
    """
    word_pattern = "[a-z0-9']+"
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(word_pattern, lowered)}

In [14]:
# Loading the dataset
from collections import namedtuple
SpamData = namedtuple("SpamData", ['text','is_spam'])

training_data = []

# Each non-blank line is read as "<label> <message words...>"; the label is
# the first whitespace-separated field and 'spam' marks a spam message.
with open('SMSSpamCollection') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line: there is no label to read, so skip it rather than
            # fail with IndexError on fields[0].
            continue
        label, words = fields[0], fields[1:]
        # Re-joining with single spaces normalises runs of whitespace.
        data_point = SpamData(text=' '.join(words), is_spam=(label == 'spam'))
        training_data.append(data_point)

In [18]:
def p_spam_given_token(token, model):
    """
    Return the share of P(token|spam) in P(token|spam) + P(token|ham).

    Both likelihoods come from `model._probabilities(token)`; the result is
    closer to 1 the more the token favours spam.
    """
    spam_likelihood, ham_likelihood = model._probabilities(token)
    combined = spam_likelihood + ham_likelihood
    return spam_likelihood / combined

In [19]:
# Build a classifier with the default smoothing constant (k = 0.5).
model = NaiveBayesClassifier()

In [20]:
# Accumulate message and token counts from every loaded data point.
model.train(training_data)

In [22]:
# Rank the whole vocabulary by how strongly each token points to spam.
# The sort is ascending, so the hammiest tokens come first and the
# spammiest tokens last.
words = list(model.tokens)
words.sort(key=lambda token: p_spam_given_token(token, model))

print("spammiest_words", words[-10:])
print("hammiest_words", words[:10])

spammiest_words ['1000', 'tone', 'cs', '500', 'guaranteed', '18', '150p', 'won', 'prize', 'claim']
hammiest_words ['gt', 'lt', "i'll", 'he', 'lor', 'da', 'later', 'she', 'doing', 'ask']
