In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Training data: every entry pairs a message text with its label ("spam" or "not spam")
dataset = [
    {"message": text, "label": label}
    for text, label in (
        ("win money now", "spam"),
        ("earn money from home", "spam"),
        ("meeting at work", "not spam"),
        ("urgent money transfer", "spam"),
        ("see you at the meeting", "not spam"),
        ("I can help you make a lot of money in very little time", "spam"),
        ("Can you please submit the report asap", "not spam"),
        ("Need your help", "not spam"),
        ("You won a lottery", "spam"),
    )
]

# Step 1: Tokenize messages
def tokenize(message):
    """Lowercase ``message`` and return its whitespace-separated words as a list."""
    lowered = message.lower()
    words = lowered.split()
    return words

# Step 2: Calculate Priors
def calculate_priors(dataset):
    """Return the fraction of messages labelled "spam" and the fraction of all others.

    Every entry whose label is not exactly "spam" is counted towards "not spam".
    The result is a dict with the keys "spam" and "not spam".
    """
    n_total = len(dataset)

    n_spam = 0
    for entry in dataset:
        if entry["label"] == "spam":
            n_spam += 1
    n_other = n_total - n_spam

    priors = {}
    priors["spam"] = n_spam / n_total
    priors["not spam"] = n_other / n_total
    return priors

# Class priors for the training data
priors = calculate_priors(dataset)

# Pair each message's token list with its label
tokenized_messages = [
    (tokenize(entry["message"]), entry["label"])
    for entry in dataset
]

# Show both results as the cell output
tokenized_messages, priors


([(['win', 'money', 'now'], 'spam'),
  (['earn', 'money', 'from', 'home'], 'spam'),
  (['meeting', 'at', 'work'], 'not spam'),
  (['urgent', 'money', 'transfer'], 'spam'),
  (['see', 'you', 'at', 'the', 'meeting'], 'not spam'),
  (['i',
    'can',
    'help',
    'you',
    'make',
    'a',
    'lot',
    'of',
    'money',
    'in',
    'very',
    'little',
    'time'],
   'spam'),
  (['can', 'you', 'please', 'submit', 'the', 'report', 'asap'], 'not spam'),
  (['need', 'your', 'help'], 'not spam'),
  (['you', 'won', 'a', 'lottery'], 'spam')],
 {'spam': 0.5555555555555556, 'not spam': 0.4444444444444444})

In [3]:
# Flatten the tokens of all messages into one word list per class.
# Comprehensions keep the loop variables local, so no stray names are left
# behind in the notebook namespace after this cell runs.
allspam = [
    word
    for tokens, label in tokenized_messages
    if label == "spam"
    for word in tokens
]
allnotspam = [
    word
    for tokens, label in tokenized_messages
    if label == "not spam"
    for word in tokens
]

def createdict(wordarr):
    """Count how often each item occurs in ``wordarr``.

    Returns a dict mapping each distinct item to its number of occurrences,
    with keys in the order the items are first encountered.
    """
    counts = {}
    for word in wordarr:
        counts[word] = counts.get(word, 0) + 1
    return counts

# Word -> occurrence count, one table per class
work_dict_notspam = createdict(allnotspam)
work_dict_spam = createdict(allspam)

# Word -> relative frequency within its class (count / number of tokens in that class)
prob_not_spam = dict(
    (word, count / len(allnotspam)) for word, count in work_dict_notspam.items()
)
prob_spam = dict(
    (word, count / len(allspam)) for word, count in work_dict_spam.items()
)
def posterior(x, prob, prior, unseen_score=0.001):
    """Score a single token for one class.

    Parameters
    ----------
    x : hashable
        Token to look up.
    prob : dict
        Maps token -> probability of that token for the class.
    prior : float
        Factor multiplied into the probability of a known token.
    unseen_score : float, optional
        Value returned when ``x`` is not a key of ``prob``. Defaults to 0.001.
        It is returned as-is, i.e. it is NOT multiplied by ``prior``.

    Returns
    -------
    float
        ``prob[x] * prior`` if ``x`` is in ``prob``, otherwise ``unseen_score``.
    """
    if x in prob:
        return prob[x] * prior
    # Fixed non-zero fallback for tokens missing from the table
    return unseen_score

In [4]:
def check(test, prob_spam, prob_not_spam, prior_spam=0.6, prior_not_spam=0.4):
    """Classify a message as "spam" or "not spam" and print the verdict.

    Parameters
    ----------
    test : str
        Raw message text; it is split into tokens with ``tokenize``.
    prob_spam : dict
        Token -> probability table for the spam class.
    prob_not_spam : dict
        Token -> probability table for the not-spam class.
    prior_spam : float, optional
        Prior passed to ``posterior`` for the spam class. Defaults to 0.6.
    prior_not_spam : float, optional
        Prior passed to ``posterior`` for the not-spam class. Defaults to 0.4.

    Returns
    -------
    str
        "spam" if the spam log-score is strictly greater than the not-spam
        log-score, otherwise "not spam" (ties go to "not spam").

    Notes
    -----
    The prior is handed to ``posterior`` once per token, not once per message.
    Besides returning the label, the function prints "Spam: <message>" or
    "Not spam: <message>".
    """
    tokens = tokenize(test)

    # One score per token and class
    spam_scores = np.array(
        [posterior(token, prob_spam, prior_spam) for token in tokens]
    )
    not_spam_scores = np.array(
        [posterior(token, prob_not_spam, prior_not_spam) for token in tokens]
    )

    # Compare the classes by the sum of the logs of their per-token scores
    spam_log_score = np.sum(np.log(spam_scores))
    not_spam_log_score = np.sum(np.log(not_spam_scores))

    if spam_log_score > not_spam_log_score:
        predicted = "spam"
        print("Spam: " + test)
    else:
        predicted = "not spam"
        print("Not spam: " + test)
    return predicted

In [5]:
# Held-out messages with their expected labels
test_set = [
    {"message": "we have a meeting with John and Stuart", "label": "not spam"},
    {"message": "win a cash prize", "label": "spam"},
    {"message": "earn money fast", "label": "spam"},
    {"message": "see you at lunch", "label": "not spam"},
    {"message": "I have some very important information for you", "label": "not spam"}
]

# `accuracy` first counts the correct predictions; it is turned into a
# percentage only when printed.
accuracy = 0
for test in test_set:
    predicted = check(test['message'], prob_spam, prob_not_spam)
    if predicted == test["label"]:
        accuracy += 1

# Divide by the actual number of test messages instead of a hard-coded 5,
# so the figure stays correct when test_set is edited.
print("My models accuracy is {acc:.2f}".format(acc=accuracy / len(test_set) * 100))

Not spam: we have a meeting with John and Stuart
Spam: win a cash prize
Spam: earn money fast
Not spam: see you at lunch
Spam: I have some very important information for you
My models accuracy is 80.00
