In [112]:
import math
import string
from collections import Counter

import pandas as pd


### **Reading and Sampling the Dataset**

In this section, we read the **SMS Spam Collection** dataset into a pandas DataFrame and perform the following tasks:

1. **Reading the Dataset**: The dataset is loaded using `pandas.read_csv()` with the specified separator (`'\t'`) to properly separate the columns.
2. **Adding Column Names**: The dataset doesn't have column headers, so we add custom column names: 'Label' (for the label indicating spam or non-spam) and 'Email_Messages' (for the text of the messages).
3. **Sampling**: We randomly sample 20% of the dataset (`frac=0.2`) to work with a smaller subset. This is done to improve performance and reduce memory usage, especially when dealing with large datasets.




In [None]:
# Read the dataset. The file is tab-separated and has no header row, so the
# separator is given as '\t', header=None is passed, and the column names
# 'Label' and 'Email_Messages' are assigned afterwards.
df = pd.read_csv('SMSSpamCollection', sep = '\t', header = None)
df.columns = ['Label', 'Email_Messages']

# Sample 20 percent of the dataset (frac=0.2); the fixed random_state seeds
# the draw so the same rows are selected on every run.
sample_df = df.sample(frac=0.2, random_state=12)

# Show the first rows of the sample as a quick sanity check
print(sample_df.head())

### **Calculating Prior Probabilities for Spam and Non-Spam Messages**

In this section, we calculate the **prior probabilities** for spam and non-spam messages based on the data. The prior probability refers to the likelihood of an event (in this case, a message being spam or non-spam) before any further information (features or words) is considered. These probabilities are used as the base or initial assumption for classification in a Naive Bayes model.

The **prior probabilities** are calculated as follows:

- **Prior probability of spam messages**: This is the proportion of spam messages out of the total number of messages.
- **Prior probability of non-spam messages**: This is the proportion of non-spam messages out of the total number of messages.



In [None]:
# Size of the working sample
total_msgs = len(sample_df)

# Count the rows labelled 'spam' by summing a boolean mask over 'Label'
number_of_spam_msgs = int((sample_df['Label'] == 'spam').sum())

# Every message that is not spam is counted as non-spam (ham)
number_of_non_spam_msgs = total_msgs - number_of_spam_msgs

# Class priors: the share of each class among all sampled messages
probability_of_spam_msgs = number_of_spam_msgs / total_msgs
probability_of_non_spam_msgs = number_of_non_spam_msgs / total_msgs

# Report the counts and the two priors for verification
print(f'Total number of messages: {total_msgs}')
print(f'Number of spam messages: {number_of_spam_msgs}')
print(f'Number of non-spam messages: {number_of_non_spam_msgs}')
print(f'Probability of spam messages: {probability_of_spam_msgs}')
print(f'Probability of non-spam messages: {probability_of_non_spam_msgs}')


### **Text Cleaning and Preprocessing Function**

The `clean_text` function is used to clean and preprocess the text data in the dataset. This ensures that the text is in a standardized form for analysis. The function performs the following operations:

1. **Lowercase Conversion**: Converts all text to lowercase to avoid any case sensitivity during analysis.
2. **Remove Punctuation**: Removes all punctuation marks to standardize the words and ensure that only the text content is considered.
3. **Tokenization or Splitting**: Splits the text into individual words, allowing for easier processing and analysis.

The function returns a list of cleaned words.



In [115]:
# Translation table that deletes every character in string.punctuation.
# Built once here so clean_text does not rebuild it for every message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a message and split it into word tokens.

    The text is lowercased (so matching is case-insensitive), every character
    listed in ``string.punctuation`` is deleted, and the result is split on
    whitespace. Only ASCII punctuation is removed; other symbols such as
    typographic quotes are left in place.

    Args:
        text: The raw message string.

    Returns:
        A list of cleaned word tokens; an empty list if nothing remains.
    """
    return text.lower().translate(_PUNCTUATION_TABLE).split()

# Tokenize every sampled message and store the token lists in a new column
sample_df['Cleaned_Email_Messages'] = sample_df['Email_Messages'].apply(clean_text)

# Split the sample by label so each class can be counted on its own
spam_messages = sample_df.loc[sample_df['Label'] == 'spam']
non_spam_messages = sample_df.loc[sample_df['Label'] == 'ham']

# Per-class word frequencies, accumulated one message at a time
spam_words = Counter()
for tokens in spam_messages['Cleaned_Email_Messages']:
    spam_words.update(tokens)

non_spam_words = Counter()
for tokens in non_spam_messages['Cleaned_Email_Messages']:
    non_spam_words.update(tokens)

# Total number of word occurrences observed in each class
total_spam_words = sum(spam_words.values())
total_non_spam_words = sum(non_spam_words.values())

# Vocabulary: every distinct word seen in either class, and its size
unique_words = set(spam_words) | set(non_spam_words)
len_unique_words = len(unique_words)

### **Calculate Likelihood Naive Bayes with Laplace Smoothing**

This implementation estimates the likelihood of a word given each class (spam or non-spam) for a Naive Bayes classifier using **Laplace (additive) smoothing**. The smoothed estimate is preferred over the plain, unsmoothed **Maximum Likelihood Estimate (MLE)** for several reasons:

1. **Handles Missing Words**: 
   - Laplace smoothing assigns a small probability to unseen words, unlike MLE, which assigns zero probability to any word not seen in the training data.

2. **Zero Probability Prevention**: 
   - By adding a constant (usually 1), Laplace smoothing ensures that unseen words do not result in a zero probability, which could severely impact the model's performance.

3. **Scalability**: 
   - Naive Bayes with Laplace smoothing is computationally efficient and works well even with large vocabularies and sparse text data (a common scenario in text classification).

4. **Simplified Assumption**: 
   - The method assumes word independence given the class (spam or non-spam). This simplifying assumption often leads to good performance in practice, even though it may not hold true in all cases.


In [None]:
def calculate_likelihood(word, spam_words, non_spam_words, total_spam_words,
                         total_non_spam_words, len_unique_words, alpha=1):
    """Return the smoothed likelihood of ``word`` under each class.

    Uses additive (Laplace) smoothing so that a word never seen in a class
    still receives a non-zero probability, which the unsmoothed maximum
    likelihood estimate ``count / total`` cannot provide:

        P(word | class) = (count + alpha) / (total + alpha * vocabulary_size)

    The denominator scales the vocabulary size by ``alpha`` so that the
    likelihoods over the whole vocabulary sum to 1 for any ``alpha``; with the
    default ``alpha=1`` this is the classic add-one formula.

    Args:
        word: The token to score.
        spam_words: Mapping from word to its count in spam messages; only
            ``.get(word, 0)`` is used.
        non_spam_words: Mapping from word to its count in non-spam messages.
        total_spam_words: Total number of word occurrences in spam messages.
        total_non_spam_words: Total number of word occurrences in non-spam
            messages.
        len_unique_words: Size of the vocabulary (distinct words).
        alpha: Smoothing strength added to every word count (default 1).

    Returns:
        A tuple ``(spam_likelihood, non_spam_likelihood)``.

    Raises:
        ZeroDivisionError: If a class total plus the smoothing mass is zero
            (for example ``alpha=0`` with an empty class).
    """
    # Pseudo-count mass spread over the whole vocabulary
    smoothing_mass = alpha * len_unique_words

    # Missing words count as 0 occurrences before smoothing is applied
    spam_likelihood = (spam_words.get(word, 0) + alpha) / (total_spam_words + smoothing_mass)
    non_spam_likelihood = (non_spam_words.get(word, 0) + alpha) / (total_non_spam_words + smoothing_mass)

    return spam_likelihood, non_spam_likelihood

# Report the Laplace-smoothed likelihood of every vocabulary word under both
# classes
for word in unique_words:
    # Smoothed P(word | spam) and P(word | non-spam)
    spam_likelihood, non_spam_likelihood = calculate_likelihood(
        word,
        spam_words,
        non_spam_words,
        total_spam_words,
        total_non_spam_words,
        len_unique_words,
    )

    # One labelled line per value
    print(f'Word: {word}')
    print(f'Spam Likelihood (with Laplace Smoothing): {spam_likelihood}')
    print(f'Non Spam Likelihood (with Laplace Smoothing): {non_spam_likelihood}')

    # Blank lines separate consecutive words in the output
    print(end='\n\n')

# Why smoothed Naive Bayes is generally preferred over plain MLE for text
# classification tasks such as spam detection:
# 1. Laplace smoothing: additive smoothing handles unseen words, a common issue in real-world data where some words are missing from the training set.
# 2. Independence assumption: words are assumed independent given the class (spam or non-spam). This simplifies the computation and works surprisingly well in practice, even when the assumption is not perfectly true.
# 3. Handling zero probabilities: without smoothing, MLE assigns zero probability to unseen words, which can severely affect model performance. Smoothing avoids this and makes the model more robust.
# 4. Scalability: Naive Bayes is computationally efficient, especially with large vocabularies and sparse data (as is common with text).
# 5. Interpretability: the per-word spam / non-spam probabilities are easy to inspect and understand.
#
# NOTE(review): this loop recomputes and reprints the same per-word
# likelihoods as the loop directly above, only with shorter labels; consider
# keeping just one of the two.
for word in unique_words:
    # Smoothed likelihood of the word under each class
    spam_likelihood, non_spam_likelihood = calculate_likelihood(
        word, spam_words, non_spam_words,
        total_spam_words, total_non_spam_words, len_unique_words,
    )
    print(f'Word: {word}')
    print(f'Spam Likelihood: {spam_likelihood}')
    print(f'Non Spam Likelihood: {non_spam_likelihood}')
    print(end='\n\n')


### **Function: `calculate_posterior_probability`**

This function calculates the posterior probabilities of a message being spam or non-spam using a Naive Bayes approach.

#### **Parameters:**
- `message`: The message to classify.
- `spam_words`: A mapping (a `Counter`) from each word to the number of times it appears in spam messages.
- `non_spam_words`: A mapping (a `Counter`) from each word to the number of times it appears in non-spam messages.
- `total_spam_words`: The total count of words in spam messages.
- `total_non_spam_words`: The total count of words in non-spam messages.
- `len_unique_words`: The number of unique words in the dataset.
- `probability_of_spam_msgs`: The prior probability of a message being spam.
- `probability_of_non_spam_msgs`: The prior probability of a message being non-spam.
- `alpha`: The Laplace smoothing parameter (default is 1).

#### **Process:**
1. **Clean the Message**: The message is passed through `clean_text`, which lowercases it, removes punctuation, and splits it into words.
2. **Initialize Priors**: The prior probabilities for both spam and non-spam are set using `probability_of_spam_msgs` and `probability_of_non_spam_msgs`.
3. **Loop Through Words**: For each word in the cleaned message:
   - Calculate the likelihood of the word being in a spam or non-spam message using `calculate_likelihood`.
   - Multiply the likelihoods for both classes to update the posterior probabilities.
4. **Return Posterior Probabilities**: The function returns the posterior probabilities for both spam and non-spam classes.

#### **Explanation:**
The function computes the likelihood of the message being spam or non-spam by iterating over the words in the message, updating the priors with each word's likelihood. The posterior probability is then used to classify the message based on which class has a higher probability.


In [117]:

def calculate_posterior_probability(message, spam_words, non_spam_words,
                                               total_spam_words, total_non_spam_words,
                                               len_unique_words, probability_of_spam_msgs,
                                               probability_of_non_spam_msgs, alpha=1):
    """Return the posterior probabilities of ``message`` being spam / non-spam.

    The message is tokenized with ``clean_text`` and each token's smoothed
    likelihood is obtained from ``calculate_likelihood``. Scores are
    accumulated as sums of logarithms rather than as a running product,
    because multiplying many small likelihoods underflows to 0.0 for long
    messages, which would make both classes tie at zero. The two log scores
    are then normalized so the returned values sum to 1; dividing both by the
    same evidence term does not change which class is larger.

    Args:
        message: The raw message text to classify.
        spam_words: Mapping from word to its count in spam messages.
        non_spam_words: Mapping from word to its count in non-spam messages.
        total_spam_words: Total number of word occurrences in spam messages.
        total_non_spam_words: Total number of word occurrences in non-spam
            messages.
        len_unique_words: Size of the vocabulary.
        probability_of_spam_msgs: Prior probability of the spam class.
        probability_of_non_spam_msgs: Prior probability of the non-spam class.
        alpha: Smoothing parameter forwarded to ``calculate_likelihood``.

    Returns:
        A tuple ``(spam_posterior, non_spam_posterior)`` of floats. Both are
        0.0 when neither class has any probability mass.
    """
    def _log_or_neg_inf(probability):
        # math.log raises on 0, so a zero probability maps to -infinity
        return math.log(probability) if probability > 0 else float('-inf')

    # Start each class score from its log prior
    log_spam = _log_or_neg_inf(probability_of_spam_msgs)
    log_non_spam = _log_or_neg_inf(probability_of_non_spam_msgs)

    # Add the log likelihood of every token under each class
    for word in clean_text(message):
        spam_likelihood, non_spam_likelihood = calculate_likelihood(
            word, spam_words, non_spam_words, total_spam_words, total_non_spam_words,
            len_unique_words, alpha
        )
        log_spam += _log_or_neg_inf(spam_likelihood)
        log_non_spam += _log_or_neg_inf(non_spam_likelihood)

    # Shift by the larger log score before exponentiating so that at least
    # one of the two weights is exactly 1.0 and nothing underflows to 0/0
    peak = max(log_spam, log_non_spam)
    if peak == float('-inf'):
        return 0.0, 0.0

    spam_weight = math.exp(log_spam - peak)
    non_spam_weight = math.exp(log_non_spam - peak)
    evidence = spam_weight + non_spam_weight

    return spam_weight / evidence, non_spam_weight / evidence



### **Classifying Messages as Spam or Non-Spam**

In this section, we classify a list of sample email messages as either spam or non-spam based on posterior probabilities calculated using a Naive Bayes classifier.

#### **Steps:**
1. **Messages to Classify**: A list of sample messages is provided.
2. **Posterior Probabilities Calculation**: For each message, we compute the posterior probabilities for spam and non-spam using the `calculate_posterior_probability` function.
3. **Classification**: The message is classified as spam if the posterior probability for spam is higher than that for non-spam. Otherwise, it is classified as non-spam.
4. **Output**: For each message, the spam and non-spam posterior probabilities are printed, followed by the classification result.

#### **Example Output**:
- For each message, the spam and non-spam posterior probabilities are shown along with the classification result. The algorithm outputs whether the message is classified as "Spam" or "Non-Spam".


In [None]:
# Hand-written sample messages to run through the classifier
messages = [
    'WINNER This is the secret code to unlock the money: C3421.',
    'Sounds good, Tom, then see u there',
    '',
    "You won't believe it but it's true. It's Incredible Txts! Reply G now to learn truly amazing things that will blow your mind. From O2FWD only 18p/txt",
    'You have a limited time offer to claim your free vacation!',
    'Congratulations! You have been selected for a special prize.',
    'Reminder: Your account has been suspended. Please update your information.',
    'Free gift card for you! Respond now to claim your prize.',
    'Hey, are we still on for dinner tonight? Let me know!',
    'Important: Verify your account to avoid service interruptions.',
    'Limited-time offer: Buy one, get one free on all products!',
    'Alert: Your password was recently changed. If this wasn’t you, click here.',
    'Get an exclusive deal now! Click here to claim your discount.'
]


# Score every sample message and report which class scores higher
for message in messages:
    # Posterior scores for the spam and non-spam classes
    spam_posterior, non_spam_posterior = calculate_posterior_probability(
        message,
        spam_words,
        non_spam_words,
        total_spam_words,
        total_non_spam_words,
        len_unique_words,
        probability_of_spam_msgs,
        probability_of_non_spam_msgs,
    )

    # Echo the message together with both scores
    print(f'Message: {message}')
    print(f'Spam Posterior: {spam_posterior}')
    print(f'Non Spam Posterior: {non_spam_posterior}')

    # The comparison is strict, so a tie is reported as non-spam
    print('Message is Spam' if spam_posterior > non_spam_posterior else 'Message is Non Spam')
    print(end='\n\n')


### **Evaluating the Spam Detection Algorithm**

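This block of code evaluates the performance of the spam detection algorithm on the entire dataset. It calculates the posterior probabilities for each email, comparing the probabilities for spam and non-spam classes to classify the message. Note that the entire dataset includes the 20% sample from which the word counts and priors were computed, so the reported metrics are not a purely held-out estimate.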


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Ground-truth labels (spam/ham) for every message in the full dataset
actual_labels = df['Label']

# Predict a label for each message using the word counts, totals, vocabulary
# size and priors computed earlier from the sample.
# NOTE(review): df also contains the rows of sample_df that those statistics
# were computed from, so this is not a purely held-out evaluation.
predictions = []
for message in df['Email_Messages']:
    spam_posterior, non_spam_posterior = calculate_posterior_probability(
        message,
        spam_words,
        non_spam_words,
        total_spam_words,
        total_non_spam_words,
        len_unique_words,
        probability_of_spam_msgs,
        probability_of_non_spam_msgs,
    )
    # Strict comparison: a tie is labelled 'ham'
    predictions.append('spam' if spam_posterior > non_spam_posterior else 'ham')

# Confusion matrix of actual vs predicted labels, ordered spam then ham
conf_matrix = confusion_matrix(actual_labels, predictions, labels=['spam', 'ham'])
print("Confusion Matrix:\n", conf_matrix)

# Precision, recall and F1 with 'spam' treated as the positive class
precision = precision_score(actual_labels, predictions, pos_label='spam')
recall = recall_score(actual_labels, predictions, pos_label='spam')
f1 = f1_score(actual_labels, predictions, pos_label='spam')

# Accuracy is the share of messages whose predicted label equals the actual one
correct = sum(predicted == actual for predicted, actual in zip(predictions, actual_labels))

# Summary of the evaluation metrics
print("Total messages:", len(df))
print("Accuracy:", correct / len(df))
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
