# notebook imports

In [122]:
from os import walk
from os.path import join
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from PIL import Image
from sklearn.model_selection import train_test_split

# constants

In [155]:
SPAM_CAT = 1
HAM_CAT = 0
VOCAB_SIZE = 2500

TRAINING_DATA_FILE = 'SpamData/02_Training/train_data.txt'
TEST_DATA_FILE = 'SpamData/02_Training/test_data.txt'


TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'


# Read and load features from .txt Files  into Numpy Array

In [124]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ', dtype=int)

In [125]:
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)

In [126]:
sparse_train_data[:5]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  2],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

In [127]:
print("Nr of rows in training file ", sparse_train_data.shape[0])
print("Nr of rows in training file ", sparse_test_data.shape[0])

Nr of rows in training file  258360
Nr of rows in training file  117776


In [128]:
print("Number of emails in training file ", np.unique(sparse_train_data[:, 0].size))
print("Number of emails in test file ", np.unique(sparse_test_data[:, 0].size))

Number of emails in training file  [258360]
Number of emails in test file  [117776]


### How to create an empty dataFrame

In [129]:
column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0,VOCAB_SIZE))
column_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [130]:
len(column_names)

2502

In [131]:
index_names = np.unique(sparse_test_data[:, 0])
index_names

array([   8,   12,   14, ..., 5788, 5792, 5793])

In [132]:
full_train_data = pd.DataFrame(columns=column_names, index=index_names)
full_train_data.fillna(value=0, inplace=True)

In [133]:
full_train_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Create a full matrix from a Sparse matrix

In [134]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0,word_idx=1,cat_idx=2,freq_idx=3):
    """
    Form a full matrix from a sparse matrix
    KeyWords:
    :param sparse_matrix: numpy array
    :param nr_words: size of the vocabulary, total number of tokens
    :param doc_idx: position if the document id un the sparse matrix, default 1st column
    :param word_idx: position od the word id in the sparse matrix, default 2nd column
    :param cat_idx: position of the label (spam is 1, non-spam is 0) default 3rd column
    :param freq_idx: position of the occurrence of word in sparse matrix, default 4th column
    :return: pandas DF
    """
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0,VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:,0])
    full_matrix = pd.DataFrame(columns=column_names, index=doc_id_names)
    full_matrix.fillna(value=0, inplace=True)

    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]

        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence
        # full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix

In [135]:
%%time
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

CPU times: total: 23.8 s
Wall time: 23.9 s


In [136]:
full_train_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,2,1,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,0,0,2,4,0,3,14,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Training the Naive Bayes Model
## Calculating the probability of spam

In [137]:
prob_spam =  full_train_data.CATEGORY.value_counts()[1] /full_train_data.CATEGORY.size
print("probability of spam is ",prob_spam)

probability of spam is  0.310989284824321


### Total number of words/ tokens

In [138]:
full_train_features = full_train_data.loc[:,full_train_data.columns != 'CATEGORY']
email_lengths = full_train_features.sum(axis = 1)

In [139]:
email_lengths

DOC_ID
0        87
1        53
2        40
3       183
4        43
       ... 
5789     18
5790     72
5791     46
5794     57
5795    264
Length: 4013, dtype: int64

In [140]:
total_wc = email_lengths.sum()
total_wc

429241

### Number of tokens in spam & ham emails

In [141]:
spam_lengths = email_lengths[full_train_data.CATEGORY == 1] ## look at the docid and get only the spam ones
ham_lengths = email_lengths[full_train_data.CATEGORY == 0]

spam_wc = spam_lengths.sum()
ham_wc = ham_lengths.sum()

In [142]:
# to verify that is true:
email_lengths.shape[0] - spam_lengths.shape[0] - ham_lengths.shape[0] # returns 0

0

In [143]:
nonspam_wc = ham_wc

In [144]:
ham_wc

252907

In [145]:
spam_wc

176334

In [146]:
print('avg num of words in ham: {:.0f}'.format(ham_wc / ham_lengths.shape[0]))
print("avg num of words in spam: {:.0f}".format(spam_wc / spam_lengths.shape[0]))

avg num of words in ham: 91
avg num of words in spam: 141


### Summing the tokens occurring in spam

In [147]:
full_train_features.shape

(4013, 2500)

In [148]:
trains_spam_tokens = full_train_features.loc[full_train_data.CATEGORY == 1]
trains_ham_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]

In [149]:
trains_spam_tokens.shape

(1248, 2500)

In [150]:
summed_spam_tokens=  trains_spam_tokens.sum(axis=0) + 1# 0 - column, 1 - row| adding 1 to avoid dividing by zero
summed_ham_tokens =  trains_ham_tokens.sum(axis=0) + 1

# P(Token |  Spam) - Probability that a token occurs given the email is spam

In [151]:
prob_tokens_spam = summed_spam_tokens / (spam_wc + VOCAB_SIZE) # add vocab size because we added 1 2500 times at the summed_spam_tokens sum line
prob_tokens_spam.sum()
# spam_wc = number of spam emails

1.0

# P(Token |  Ham) - Probability that a token occurs given the email is Nonspam

In [152]:
prob_tokens_ham = summed_ham_tokens / (ham_wc + VOCAB_SIZE)
prob_tokens_ham.sum()

1.0

# P(Token) - Probability that token occurs

In [153]:
prob_tokens_all = full_train_features.sum(axis=0) / total_wc
prob_tokens_all.sum()

1.0

# Save the trained model

In [154]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_tokens_ham)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

# Prepare TestData


In [159]:
%%time
full_matrix = make_full_matrix(sparse_test_data, VOCAB_SIZE)

CPU times: total: 10.7 s
Wall time: 10.7 s


In [160]:
X_test = full_matrix.loc[:,full_matrix.columns != 'CATEGORY']
y_test = full_matrix.CATEGORY
np.savetxt(TEST_FEATURE_MATRIX, X_test)
np.savetxt(TEST_TARGET_FILE, y_test)
