# Notebook Imports

In [126]:
import numpy as np
import pandas as pd

# Constants

In [213]:
VOCAB_NO = 2500
TRAIN_DATA_FILE = 'SpamData/02_Training/train-data.txt'
TEST_DATA_FILE = 'SpamData/02_Training/test-data.txt'

TOKEN_PROB_SPAM_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_PROB_NON_SPAM_FILE = 'SpamData/03_Testing/prob-non-spam.txt'
TOKEN_PROB_ALL_FILE = 'SpamData/03_Testing/prob-all.txt'

TEST_FEATURE_MATRIX_FILE = 'SpamData/03_Testing/test-feature.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Read and Load data from txt files

In [128]:
sparse_train_data = np.loadtxt(TRAIN_DATA_FILE, delimiter=' ', dtype=int)

In [129]:
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)

In [130]:
sparse_train_data[:5]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  1],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

In [131]:
sparse_test_data[:5]

array([[8, 2, 1, 1],
       [8, 3, 1, 4],
       [8, 4, 1, 2],
       [8, 5, 1, 1],
       [8, 6, 1, 2]])

## Checking how data are arranged in this array

In [132]:
sparse_train_data[3] # give a particula row of an array

array([0, 7, 1, 3])

In [133]:
# how to access each column in a particula row
print('DOC_ID - ', sparse_train_data[3][0])
print('WORD_ID - ', sparse_train_data[3][1])
print('CATEGORY - ', sparse_train_data[3][2])
print('OCCURENCE - ', sparse_train_data[3][3])

DOC_ID -  0
WORD_ID -  7
CATEGORY -  1
OCCURENCE -  3


# Creating An Empty Dataframe
- Note : This is for making solid understanading of what we are going to deal with

In [134]:
colum_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_NO))

colum_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [135]:
len(colum_names)

2502

In [136]:
index_names = np.unique(sparse_train_data[:, 0])
index_names

array([   0,    1,    2, ..., 5791, 5794, 5795])

In [137]:
full_tarin_data = pd.DataFrame(index=index_names, columns=colum_names)
full_tarin_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [138]:
full_tarin_data.fillna(value=0, inplace=True)
full_tarin_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Creating Full Matrix From Sparse Matrix

In [139]:
def full_matrix(sparse_matrix, no_words, pos_doc_id=0, pos_word_id=1, pos_category=2, pos_occurence=3):
    """
    Returns Full Matrix Dataframe from  Sparse Matrix.
    
    Parameters:
    sparse_matrix: Sparse Matrix -- NumPy Array.
    no_words: Number of Vocabulary -- Total number of Tokens.
    pos_doc_id: Position of DOC_ID in Sparse Matrix -- default is 1st Position.
    pos_word_id: Position of WORD_ID in Sparse Matrix -- default is 2nd Position.
    pos_category: Position of CATEGORY in Sparse Matrix -- default is 3rd Position.
    pos_occurence: Position of OCCURENCE in Sparse Matrix -- default is 4th Position.
    """
    colum_names = ['DOC_ID'] + ['CATEGORY'] + list(range(no_words))
    index_names = np.unique(sparse_matrix[:, 0])
    
    full_matrix= pd.DataFrame(index=index_names, columns=colum_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_id = sparse_matrix[i][pos_doc_id]
        word_id = sparse_matrix[i][pos_word_id]
        category = sparse_matrix[i][pos_category]
        occurence = sparse_matrix[i][pos_occurence]
        
        
        full_matrix.at[doc_id, 'DOC_ID'] = doc_id
        full_matrix.at[doc_id, 'CATEGORY'] = category
        full_matrix.at[doc_id, word_id] = occurence
        
        
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix
    

In [140]:
%%time
full_matrix_train_data = full_matrix(sparse_train_data, VOCAB_NO)

Wall time: 12.4 s


In [141]:
full_matrix_train_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,1,1,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,0,0,2,4,0,3,14,0,...,1,0,0,0,0,0,0,0,0,0
4,1,5,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
full_matrix_train_data.tail()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5789,0,3,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5790,0,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,0,3,1,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5794,0,1,1,1,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5795,0,3,4,2,0,5,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


# Trainign The Niave Bayes Model
## P(Spam) Calculating the Probability that an Email is Spam

In [143]:
number_spam_emails = full_matrix_train_data[full_matrix_train_data.CATEGORY == 1].shape[0]
number_spam_emails

1247

In [144]:
number_non_spam_emails = full_matrix_train_data[full_matrix_train_data.CATEGORY == 0].shape[0]
number_non_spam_emails

2765

In [145]:
total_emails = number_non_spam_emails + number_spam_emails
total_emails

4012

In [146]:
prob_spam = number_spam_emails / total_emails
prob_spam

0.3108175473579262

## Total Number of Tokens/Words

In [147]:
full_matrix_train_data.loc[5795, full_matrix_train_data.columns !='CATEGORY'].sum()

264

In [148]:
# First Approach
total = 0
for i in range(5796):
    try:
        total += full_matrix_train_data.loc[i,full_matrix_train_data.columns !='CATEGORY'].sum()
    except:
        continue
total_words = total

In [149]:
total_words

428749

In [150]:
# Second Approach
email_length = full_matrix_train_data.loc[:, full_matrix_train_data.columns !='CATEGORY']
email_length

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,1,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,3,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5790,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,3,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5794,1,1,1,0,0,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [151]:
sum_of_words_per_email = email_length.sum(axis=1)
sum_of_words_per_email

DOC_ID
0        87
1        53
2        40
3       183
4        43
       ... 
5789     18
5790     72
5791     46
5794     57
5795    264
Length: 4012, dtype: int64

In [152]:
total_words = sum_of_words_per_email.sum()
total_words

428749

## Total number of Spam & non-Spam Tokens(words) 

In [153]:
non_spam_matrix = full_matrix_train_data[full_matrix_train_data.CATEGORY == 0]
non_spam_email_length = non_spam_matrix.loc[:, non_spam_matrix.columns !='CATEGORY']
non_spam_tokens_per_email = non_spam_email_length.sum(axis=1)
total_non_spam_words = non_spam_tokens_per_email.sum()
total_non_spam_words

252747

In [154]:
spam_matrix = full_matrix_train_data[full_matrix_train_data.CATEGORY == 1]
spam_email_length = spam_matrix.loc[:, spam_matrix.columns !='CATEGORY']
spam_tokens_per_email = spam_email_length.sum(axis=1)
total_spam_words = spam_tokens_per_email.sum()
total_spam_words

176002

In [155]:
total_spam_words / spam_email_length.shape[0]

141.14033680834

In [156]:
total_non_spam_words / non_spam_email_length.shape[0]

91.40940325497287

## Summing Tokens occuring in Spam

In [157]:
full_matrix_train_data.shape

(4012, 2501)

In [175]:
train_spam_tokens = email_length.loc[full_matrix_train_data.CATEGORY == 1]
train_spam_tokens

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,1,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1885,1,0,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1887,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1889,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1890,2,0,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
summed_spam_tokens = train_spam_tokens.sum(axis=0) + 1 # Laplace Smoothing Method
summed_spam_tokens.shape

(2500,)

In [188]:
summed_spam_tokens.tail()

2495    19
2496     1
2497     2
2498     1
2499    10
dtype: int64

In [172]:
train_non_spam_tokens = email_length.loc[full_matrix_train_data.CATEGORY == 0]
train_non_spam_tokens

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1896,1,2,1,0,1,1,2,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1898,2,1,0,1,1,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1899,2,1,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1900,3,4,0,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1901,2,2,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,3,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5790,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,3,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5794,1,1,1,0,0,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [185]:
summed_non_spam_tokens = train_non_spam_tokens.sum(axis=0) + 1 # Laplace Smoothing Method
summed_non_spam_tokens.shape

(2500,)

In [186]:
summed_non_spam_tokens.tail()

2495    19
2496    29
2497    32
2498    33
2499    21
dtype: int64

## P(Token|Spam) Probability that a token occurs given that an Email is spam

In [189]:
prob_spam_token = summed_spam_tokens / (total_spam_words + VOCAB_NO) # VOCAB_NO added so that to counterbalance the Laplace Smoothing Method
prob_spam_token[:5]

0    0.012101
1    0.005238
2    0.006734
3    0.011210
4    0.006829
dtype: float64

In [190]:
prob_spam_token.sum()

1.0

## P(Token|Non-Spam) Probability that a token occurs given that an Email is Non-spam

In [194]:
prob_non_spam_token = summed_non_spam_tokens / (total_non_spam_words + VOCAB_NO) # VOCAB_NO added so that to counterbalance the Laplace Smoothing Method
prob_non_spam_token[:5]

0    0.021465
1    0.010135
2    0.008004
3    0.003671
4    0.006315
dtype: float64

In [195]:
prob_non_spam_token.sum()

1.0

## P(Token) Probability that a Token occurs in an Email

In [207]:
prob_token_all = email_length.sum(axis=0) / total_words

In [209]:
prob_token_all.sum()

1.0

## Saving Trained Model

In [212]:
np.savetxt(TOKEN_PROB_SPAM_FILE, prob_spam_token)
np.savetxt(TOKEN_PROB_NON_SPAM_FILE, prob_non_spam_token)
np.savetxt(TOKEN_PROB_ALL_FILE, prob_token_all)

# Preparing Test Data

In [215]:
%%time
full_matrix_test_data = full_matrix(sparse_test_data, VOCAB_NO)

Wall time: 9.9 s


In [216]:
full_matrix_test_data

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1,0,0,1,4,2,1,2,4,1,...,0,0,0,0,0,0,0,0,0,0
12,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,1,0,2,1,1,2,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,0,2,1,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5786,0,5,5,2,2,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5788,0,0,4,0,2,4,3,3,1,4,...,0,0,0,0,0,0,0,0,0,0
5792,0,2,2,0,1,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0


In [219]:
X_test = full_matrix_test_data.loc[:, full_matrix_test_data.columns != 'CATEGORY']
y_test = full_matrix_test_data.CATEGORY

In [220]:
X_test

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,1,4,2,1,2,4,1,2,...,0,0,0,0,0,0,0,0,0,0
12,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,2,1,1,2,0,0,3,0,4,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,2,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5786,5,5,2,2,1,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5788,0,4,0,2,4,3,3,1,4,3,...,0,0,0,0,0,0,0,0,0,0
5792,2,2,0,1,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [222]:
y_test

DOC_ID
8       1
12      1
14      1
15      1
17      1
       ..
5783    0
5786    0
5788    0
5792    0
5793    0
Name: CATEGORY, Length: 1722, dtype: int64

In [223]:
np.savetxt(TEST_FEATURE_MATRIX_FILE, X_test)
np.savetxt(TEST_TARGET_FILE, y_test)

# ######################-END-#########################