# Naive Bayes classifier for spam detection

A [Python refresher exercise](http://misza222.github.io/ml/2018/04/02/how-to-refresh-my-python-skills.html) for me. Data from [Kaggle](https://www.kaggle.com/uciml/sms-spam-collection-dataset#), idea from [Raj's video](https://www.youtube.com/watch?v=PrkiRVcrxOs).

Compare results with [this kernel on Kaggle](https://www.kaggle.com/muzzzdy/sms-spam-detection-with-various-classifiers)

In [1]:
# Pandas is used only for importing the CSV - I forgot how to deal with those beautiful animals - to be refreshed later
import pandas as pd

In [2]:
# Load the SMS corpus, keep only the label and text columns under friendlier
# names, and hold out the last 100 rows for evaluation.
df = (
    pd.read_csv("./spam.csv", encoding="ISO-8859-1")[['v1', 'v2']]
    .rename(columns={'v1': 'class', 'v2': 'sms'})
)
df_train, df_test = df[:-100], df[-100:]

In [3]:
import functools
import math
import operator
from collections import Counter

class SpamModel:
    """Word-count based spam/ham classifier for SMS messages.

    The model scores a message for a class as the product, over the words of
    the message, of

        P(clas|word) = P(clas) * P(word|clas) / P(word)

    with the estimates

        P(clas)      = number of messages of the class / training_size
        P(word|clas) = (occurrences of word in the class + 1) / words in the class
        P(word)      = (occurrences of word overall + 1) / training_size

    Note that the class prior is applied once per word (not once per message);
    this follows the derivation the model was written from.

    Attributes set by ``fit``:
        df_raw_data:        list of (class, sms) tuples.
        training_size:      number of training messages.
        nr_of_spam / nr_of_ham: number of messages labelled "spam" / "ham".
        df_tokenized_data:  list of (class, list of tokens).
        df_counted_data:    list of (class, Counter of tokens).
        spam_counted_data / ham_counted_data: per-class token counts.
        spam_total_words / ham_total_words:   per-class token totals.
        counted_data:       token counts over both classes.
    """

    def fit(self, df):
        """Compute the word statistics needed for prediction.

        Args:
            df: DataFrame with a 'class' column ("spam" or "ham") and an
                'sms' column holding the message text.

        Raises:
            ValueError: if ``df`` contains no rows; every probability
                estimate divides by the training-set size.
        """
        self.df_raw_data = self.df_to_array(df)
        self.training_size = len(self.df_raw_data)
        if self.training_size == 0:
            raise ValueError("cannot fit SpamModel on an empty training set")

        self.nr_of_spam = sum(label == "spam" for label, _ in self.df_raw_data)
        self.nr_of_ham = sum(label == "ham" for label, _ in self.df_raw_data)

        self.df_tokenized_data = [
            (label, self.tokenize(text)) for label, text in self.df_raw_data
        ]
        self.df_counted_data = [
            (label, Counter(tokens)) for label, tokens in self.df_tokenized_data
        ]

        self.spam_counted_data = self._merge_counts("spam")
        self.spam_total_words = sum(self.spam_counted_data.values())

        self.ham_counted_data = self._merge_counts("ham")
        self.ham_total_words = sum(self.ham_counted_data.values())

        self.counted_data = self.spam_counted_data + self.ham_counted_data

    def _merge_counts(self, clas):
        """Return one Counter holding the token counts of every `clas` message."""
        merged = Counter()
        for label, counts in self.df_counted_data:
            if label == clas:
                # In-place update: repeatedly adding Counters with `+` would
                # copy the growing accumulator for every message.
                merged.update(counts)
        return merged

    def predict(self, sms):
        """Classify a message as "spam" or "ham".

        The two class scores are compared in log space: the plain product of
        per-word factors underflows to 0.0 for long messages, which would make
        both scores equal and silently yield "ham".

        Args:
            sms: message text.

        Returns:
            "spam" if the spam score is strictly larger, otherwise "ham".
        """
        words = self.tokenize(sms)

        spam_score = self._log_score_for("spam", words)
        ham_score = self._log_score_for("ham", words)

        return "spam" if spam_score > ham_score else "ham"

    def predict_words_for(self, clas, words):
        """Return the product of P(clas|word) over `words`.

        Args:
            clas: "spam" for the spam class; any other value selects ham.
            words: list of tokens.

        Returns:
            The product of the per-word factors; 1 for an empty list, and 0.0
            for a non-empty list when no message of the class was seen in
            training. The product may underflow to 0.0 for long messages;
            ``predict`` therefore compares log-scores instead.
        """
        return functools.reduce(operator.mul, self._word_factors(clas, words), 1)

    def _log_score_for(self, clas, words):
        """Return the sum of log P(clas|word) over `words` (-inf if any factor is 0)."""
        log_score = 0.0
        for factor in self._word_factors(clas, words):
            if factor <= 0.0:
                return -math.inf
            log_score += math.log(factor)
        return log_score

    def _word_factors(self, clas, words):
        """Return the list of P(clas|word) estimates, one per token in `words`."""
        if clas == "spam":
            counters = self.spam_counted_data
            total = self.spam_total_words
            nr_of_messages = self.nr_of_spam
        else:
            counters = self.ham_counted_data
            total = self.ham_total_words
            nr_of_messages = self.nr_of_ham

        if nr_of_messages == 0:
            # A class never seen in training has prior 0 and no words to
            # divide by; score it as impossible instead of dividing by zero.
            return [0.0 for _ in words]

        class_probability = nr_of_messages / self.training_size

        return [
            # P(clas|word) = P(clas) * P(word|clas) / P(word), with +1 added to
            # both word counts so unseen words do not zero the product.
            class_probability * self.training_size * (counters[word] + 1) / total / (self.counted_data[word] + 1)
            for word in words
        ]

    def tokenize(self, strng):
        """Naive tokenization: split on single spaces.

        Case and punctuation are preserved, and consecutive spaces produce
        empty-string tokens.
        """
        return strng.split(" ")

    def df_to_array(self, df):
        """Convert a DataFrame into a plain list of (class, sms) tuples."""
        # Zipping the two columns avoids building a Series per row, which is
        # what DataFrame.iterrows does.
        return list(zip(df['class'], df['sms']))
    
# Train the classifier on the training split (everything except the held-out rows).
sm = SpamModel()
sm.fit(df_train)

# Debugging aids, kept for reference: class sizes and per-class counts of "to".
# print("Spam:", sm.nr_of_spam)
# print("Ham:", sm.nr_of_ham)
# print("spam:", sm.spam_counted_data["to"])
# print("ham:", sm.ham_counted_data["to"])

# sm.predict("You have won the main price")

In [10]:
# Spot checks on hand-written messages (disabled):
# print(sm.predict("You have won the main price"))
# print(sm.predict("Free entry in 2 a wkly comp to win FA Cup"))
# print(sm.predict("SIX chances to win CASH! From 100 to 20,000"))

# Flatten the held-out frame into (class, sms) tuples.
test_array = sm.df_to_array(df_test)

# Uncomment to list the misclassified messages:
# for clas, sms in test_array:
#     if clas != sm.predict(sms):
#         print(f"'{sms}' predicted as '{sm.predict(sms)}' is really of class '{clas}'")

# Accuracy: share of held-out messages whose predicted class matches the label.
accuracy = sum(1 for clas, sms in test_array if sm.predict(sms) == clas) / len(test_array)
print(f"Accuracy on a test set is {round(accuracy * 100)}%")

Accuracy on a test set is 93%
