<a href="https://colab.research.google.com/github/matthewshan/CIS-678/blob/master/Project%202%20Spam%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports 


In [0]:
import random
import nltk.corpus
from nltk.stem import PorterStemmer

"""
  NLTK Set up
"""
nltk.download('stopwords')
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Function

In [0]:
"""
  "Trains" our algorithm adding counts of the words of the message into its 
  corresponding dictionary. (spam_counts or ham_counts)

  @param classifcation - The class of the given message
  @param msg - The message content
"""
def train(classification, msg):
  if (classification == "spam"):
    for word in msg:
      if word in spam_counts:
        spam_counts[word] += 1
      else:
        spam_counts[word] = 1
  elif (classification == "ham"):
    for word in msg:
      if word in ham_counts:
        ham_counts[word] += 1
      else:
        ham_counts[word] = 1
  else:
    pass


"""
  Retrieves the word count of the given word if avaliable. If not, returns 0

  @param word - The given word
  @param dic - The dictionary to read from

  @returns - The number of time the word appears of the given dictionary. 
             Returns 0 if the key does not exists
"""
def try_key(word, dic):
  try:
    return dic[word]
  except:
    return 0


"""
  This calcuates the P(word | class)
  [Probability of the word showing up in the class]

  @param - The word
  @param - classification of the word
  
  @returns the probability of P(word | class)
"""
def calc_prob(word, classification):
  class_list = []
  if (classification.lower() == "spam"):
    class_list = spam_counts
  else:
    class_list = ham_counts
  return (try_key(word, class_list) + 1) / (len(class_list) + len(unique_words))


"""
  Calculates whether or not the message is spam or ham 
  based on previous training data.

  @param line - The message given

  @returns Spam or Ham based on the model.
"""
def calc_class(line):
  prob_of_spam = len(spam_training) / (len(spam_training) + len(ham_training))
  msg_prob = 1.0
  for word in line:
    msg_prob *= calc_prob(word, "spam")
  final_spam_prob = msg_prob * prob_of_spam

  prob_of_ham = len(ham_training) / (len(spam_training) + len(ham_training))
  msg_prob = 1.0
  for word in line:
    msg_prob *= calc_prob(word, "ham")
  final_ham_prob = msg_prob * prob_of_ham

  if (final_spam_prob > final_ham_prob):
    return "spam"
  else:
    return "ham"

# Run

In [0]:
"""
Variables used for the model
"""
# Holds the amount of times a word shows up in the spam training set
spam_counts = {} 
# Holds the amount of times a word shows up in the ham training set
ham_counts = {}
# The percent of the data that we want in the training set
TRAIN_PERCENT = .8
# Spam messages for training
spam_training = []
# Ham messages for training
ham_training = []
# Spam messages for testing
spam_test = []
# Ham messages for testing
ham_test = []
# Set of unique words all all the training data
unique_words = set()

"""
Read in the data
"""
spam_messages = []
ham_messages = []
file = open("textMsgs.data")
for line in file:
  temp = line.split('\t')
  classification = temp[0]
  message = temp[1].split(" ")
  msg = []
  for word in message:
    word = word.lower().replace(",", "").replace("'", "").replace(".", "").replace("(", "").replace(")", "").replace("\"", "").replace("!", "").replace("\n", "")
    if len(word) > 2 and word not in stopwords.words('english'):
      msg.append(ps.stem(word))

  if (classification == "spam"):
    spam_messages.append(msg)
  elif (classification == "ham"):
    ham_messages.append(msg)
spam_count = len(spam_messages)
ham_count = len(ham_messages)


"""
Split the data to test and training
"""
random.shuffle(spam_messages)
random.shuffle(ham_messages)

for i in range(int(TRAIN_PERCENT*len(spam_messages))):
  spam_training.append(spam_messages[i])

for i in range(int(TRAIN_PERCENT*len(spam_messages)), len(spam_messages)):
  spam_test.append(spam_messages[i])

for i in range(int(TRAIN_PERCENT*len(ham_messages))):
  ham_training.append(ham_messages[i])

for i in range(int(TRAIN_PERCENT*len(ham_messages)), len(ham_messages)):
  ham_test.append(ham_messages[i])

print(len(spam_training), ":", len(spam_test), "[Spam - Training Data : Test Data]")
print(len(ham_training), ":", len(ham_test), "[Ham - Training Data : Test Data]")


"""
Train the model
"""
for spam in spam_training:
  train("spam", spam)
for ham in ham_training:
  train("ham", ham)
temp = []
temp.append(spam_counts.keys())
temp.append(ham_counts.keys())
for i in spam_counts.keys():
  unique_words.add(i)
for i in ham_counts.keys():
  unique_words.add(i)

"""
Test the model
"""
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
for message in spam_test:
  prediction = calc_class(message)
  if (prediction == "spam"):
    true_neg += 1
  else:
    false_pos += 1

for message in ham_test:
  prediction = calc_class(message)
  if (prediction == "ham"):
    true_pos += 1
  else:
    false_neg += 1

print("true_pos: " + str(true_pos) + "\nfalse_pos: " + str(false_pos) + "\ntrue_neg: " + str(true_neg) + "\nfalse_neg: " + str(false_neg) + "\n")
print("Percent correct: " + str((true_pos + true_neg)/(true_pos+true_neg+false_pos+false_neg)))
  

597 : 150 [Spam - Training Data : Test Data]
3861 : 966 [Ham - Training Data : Test Data]
true_pos: 964
false_pos: 19
true_neg: 131
false_neg: 2

Percent correct: 0.9811827956989247
