In [1]:
import numpy as np
import matplotlib.pyplot as plt
from src import util
import collections

In [2]:
train_messages, train_labels = util.load_spam_dataset('data/ds6_train.tsv')
val_messages, val_labels = util.load_spam_dataset('data/ds6_val.tsv')
test_messages, test_labels = util.load_spam_dataset('data/ds6_test.tsv')

In [3]:
print(type(train_messages))
print(len(train_messages))
print(train_messages[0])

print('\n')
print(train_labels)
print(type(train_labels))
print(len(train_labels))
print(train_labels[0])
print(np.unique(train_labels))

<class 'list'>
4457
THANX 4 PUTTIN DA FONE DOWN ON ME!!


[0 0 0 ... 0 0 0]
<class 'numpy.ndarray'>
4457
0
[0 1]


In [4]:
def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message.
    """
    norm_sentence = message.lower()
    norm_words = norm_sentence.split(' ') # Alert-1: this does not get rid of punctuation marks
    
    return norm_words


def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    This function should create a dictionary of word to indices using the provided
    training messages. Use get_words to process each message. 

    Rare words are often not useful for modeling. Please only add words to the dictionary
    if they occur in at least five messages.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """
    words_dict = {}
    all_words = [] # collecting all words for counting including repition
    final_dict = {}
    
    # step-1: gather all words 
    for message in messages:   
        words = get_words(message)
        for word in words:
            all_words.append(word)
            
    # Step-2: initalise dictionary with zero values
    
    for word in all_words:
        words_dict[word] = 0
        
    # Step-3: update word count in dictionary 
    
    for word in all_words:
        words_dict[word]+=1
        
    # step-5: get words whose key value is greater than or equal to 5
    
    mappping_counter = 0
    for key in words_dict.keys():
        if words_dict[key]>=5:
            #final_dict[key] = words_dict[key] # Alert-2: Does not perform laplace smoothing yet
            final_dict[key] = mappping_counter  # Alert-3: not indexed alphabateically
            mappping_counter+=1
    
        
    return final_dict
        
    
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    This function should create a numpy array that contains the number of times each word
    appears in each message. Each row in the resulting array should correspond to each 
    message and each column should correspond to a word.

    Use the provided word dictionary to map words to column indices. Ignore words that 
    are not present in the dictionary. Use get_words to get the words for a message.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
    """
    # m rows x n columns
    m = len(messages)            
    n = len(word_dictionary.keys())
    
    my_array = np.zeros((m,n))
    
    for i in range(len(messages)):
        message = messages[i]
        words = get_words(message)
        for word in words:
            if word in word_dictionary.keys():
                my_array[i][word_dictionary[word]]+=1
    
    return my_array



def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model.

    This function should fit a Naive Bayes model given a training matrix and labels.

    The function should return the state of that model.

    Feel free to use whatever datatype you wish for the state of the model.

    Args:
        matrix: A numpy array containing word counts for the training data
        labels: The binary (0 or 1) labels for that training data

    Returns: The trained model
    
    """
    # Alert:5 - Resolved - Laplace smoothing done

    
    mask1 = np.where(labels == 1)
    mask2 = np.where(labels == 0)
    
    spam_emails_array = matrix[mask1]
    nonspam_emails_array = matrix[mask2]
    
    spam_num = spam_emails_array.shape[0]
    nonspam_num = nonspam_emails_array.shape[0]
    
    phy_spam = spam_num/labels.shape[0]
    phy_nonspam = 1-phy_spam    
    
    num_x_spam = np.sum(spam_emails_array, axis=0) # sum of a word x in all spam emails
    num_x_spam +=1 # laplace smoothing
    all_spam_words = np.sum(np.sum(spam_emails_array, axis=1)) # All words in spam emails
    all_spam_words += matrix.shape[1] # laplace smoothing
    
    p_x_spam = num_x_spam/all_spam_words # p(x|y=1)
    
    num_x_nonspam = np.sum(nonspam_emails_array, axis=0) # sum of a word x in all non-spam emails
    num_x_nonspam +=1 # laplace smoothing
    all_nonspam_words = np.sum(np.sum(nonspam_emails_array, axis=1)) # All words in non-spam emails
    all_nonspam_words += matrix.shape[1] # laplace smoothing
    p_x_nonspam = num_x_nonspam/all_nonspam_words # p(x|y=0)

    
    return phy_spam, phy_nonspam, p_x_spam, p_x_nonspam


def predict_from_naive_bayes_model(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    This function should be able to predict on the models that fit_naive_bayes_model
    outputs.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: A numpy array containing word counts

    Returns: A numpy array containg the predictions from the model
    """
    p_spam = np.zeros((matrix.shape[0]))
    
    # iterate message-wise
    for i in range(matrix.shape[0]):
        message = matrix[i]
        words_mask = np.where(message > 0) # find indexes of words that appear in the email
        all_p_x_spam = p_x_spam[words_mask]  #p(x1|y=1), p(x5|y=1), p(x8|y=1), .....
        pi_x_spam = np.prod(all_p_x_spam[:]) # p(x1|y=1)*p(x5|y=1)*p(x8|y=1)*.....
        all_p_x_nonspam = p_x_nonspam[words_mask]  #p(x1|y=0), p(x5|y=0), p(x8|y=0), .....
        pi_x_nonspam = np.prod(all_p_x_nonspam[:]) # p(x1|y=0)*p(x5|y=0)*p(x8|y=0)*.....
        
        p_spam[i] = pi_x_spam * phy_spam/ (pi_x_spam * phy_spam + pi_x_nonspam * phy_nonspam)
        
    return p_spam
        
    
def NB_accuracy(labels, preds):
    true_spam_ids = np.where(labels==1)
    pred_spam_ids = np.where(preds>=0.5)
    
    true_nonspam_ids = np.where(labels==0)
    pred_nonspam_ids = np.where(preds<0.5)
    
    true_spam_set  = set(true_spam_ids[0])
    pred_spam_set = set(pred_spam_ids[0])
    true_nonspam_set = set(true_nonspam_ids[0])
    pred_nonspam_set = set(pred_nonspam_ids[0])
    
    TP = len(true_spam_set.intersection(pred_spam_set))
    TN = len(true_nonspam_set.intersection(pred_nonspam_set))
    
    accuracy = np.round((TP+TN)*100/(len(true_spam_set)+len(true_nonspam_set)),4)
    
    return accuracy
    

In [5]:
train_dict = create_dictionary(train_messages); print('Total words in dictionary: ', len(train_dict.keys()))
train_array = transform_text(train_messages, train_dict); print(train_array.shape); print('first 5 message: ', train_array[:5,:])

print('Train labels: ',train_labels.shape, ' labels: ', train_labels)

print('\n\n\n\nFitting on Training data')
phy_spam, phy_nonspam, p_x_spam, p_x_nonspam = fit_naive_bayes_model(train_array, train_labels)
p_spam = predict_from_naive_bayes_model(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, train_array)
train_accuracy = NB_accuracy(train_labels, p_spam)
print('Train accuracy: ', train_accuracy,'%')


print('\n\n\n\nPredicting on Validation data')
val_array = transform_text(val_messages, train_dict)
p_spam = predict_from_naive_bayes_model(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, val_array)
val_accuracy = NB_accuracy(val_labels, p_spam)
print('Val accuracy: ', val_accuracy,'%')


print('\n\n\n\nPredicting on Testing data')
test_array = transform_text(test_messages, train_dict)
p_spam = predict_from_naive_bayes_model(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, test_array)
test_accuracy = NB_accuracy(test_labels, p_spam)
print('Test accuracy: ', test_accuracy,'%')

Total words in dictionary:  1758
(4457, 1758)
first 5 message:  [[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Train labels:  (4457,)  labels:  [0 0 0 ... 0 0 0]




Fitting on Training data
Train accuracy:  98.3621 %




Predicting on Validation data
Val accuracy:  98.2047 %




Predicting on Testing data
Test accuracy:  97.8495 %


In [38]:
def get_top_five_naive_bayes_words(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, dictionary):
    
    
    my_probs = np.log(p_x_spam) - np.log(p_x_nonspam) # Alert-6: Here we have to sort after taking difference. 
    # Without difference and filtering just by which word appear most in spam email won't give accurate results at all. 
    # Because, in spam email word like 'you' can appear the most but we want to find the relative proportion of word in 
    # spam email and non-spam emails. 
    
    top_5_words_ids  = np.argsort(my_probs)[-5:]
    
    reverse_dict = {value: key for key, value in dictionary.items()}
    
    counter = 0
    for idx in top_5_words_ids:
        print('Top-',5-counter, ' word is : ', reverse_dict[idx] )
        counter+=1
        


In [39]:
phy_spam, phy_nonspam, p_x_spam, p_x_nonspam = fit_naive_bayes_model(train_array, train_labels)
get_top_five_naive_bayes_words(phy_spam, phy_nonspam, p_x_spam, p_x_nonspam, train_dict)

Top- 5  word is :  urgent!
Top- 4  word is :  tone
Top- 3  word is :  prize
Top- 2  word is :  won
Top- 1  word is :  claim
