In [1]:
import pandas as pd 
import numpy as np 

In [4]:
TRAINING_DATA_FILE = './02_Training/train-data.txt'
TEST_DATA_FILE = './02_Training/test-data.txt'

TOKEN_SPAM_PROB_FILE = './03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = './03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = './03_Testing/prob-all.txt'
TEST_FEATURE_MATRIX = './03_Testing/test-feature.txt'
TEST_TARGET_FILE = './03_Testing/test-target.txt'
VOCAB_SIZE = 2500

# Loading data from .txt

In [5]:
sparce_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ', dtype=int)
sparce_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)

In [6]:
sparce_train_data[-5:]

array([[5795, 1539,    0,    2],
       [5795, 1903,    0,    1],
       [5795, 1915,    0,    2],
       [5795, 2127,    0,    1],
       [5795, 2279,    0,    1]])

In [7]:
print('training: ', sparce_train_data.shape)
print('training: ', sparce_test_data.shape[0])

training:  (265427, 4)
training:  110522


In [8]:
print('training unique: ', np.unique(sparce_train_data[:, 0]).size)
print('testing: unique', np.unique(sparce_test_data[:, 0]).size)

training unique:  4015
testing: unique 1724


# Empty DataFrame

In [9]:
coloumn_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
print(len(coloumn_names))
coloumn_names[:5]

2502


['DOC_ID', 'CATEGORY', 0, 1, 2]

In [10]:
index_names = np.unique(sparce_train_data[:, 0])
index_names 

array([   0,    1,    2, ..., 5791, 5794, 5795])

In [11]:
# full_train_data = pd.DataFrame(index=index_names, columns=coloumn_names)
# full_train_data.fillna(value=0, inplace=True)
# full_train_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Sparce matrix to full matrix

In [11]:
def make_full_matrix(sparce_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    coloumn_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparce_matrix[:, 0])
    full_matrix = pd.DataFrame(index=index_names, columns=coloumn_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparce_matrix.shape[0]):
        doc_nr = sparce_matrix[i][doc_idx];
        word_id = sparce_matrix[i][word_idx];
        label = sparce_matrix[i][cat_idx];
        occurance = sparce_matrix[i][freq_idx];
        
        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurance
    
    return full_matrix

In [12]:
%%time 
full_train_data = make_full_matrix(sparce_train_data, VOCAB_SIZE)

Wall time: 8.87 s


In [13]:
full_train_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,1,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,2,0,1,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,2,0,2,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,3,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,0,0,1,2,4,2,3,...,0,0,0,0,0,0,0,0,0,0


# Training Naive Bayes classifier
## Spam Probability calculation

In [14]:
prob_spam = full_train_data.CATEGORY.sum() / full_train_data.CATEGORY.size
prob_spam 

0.31133250311332505

In [15]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
full_train_features.head()

Unnamed: 0,DOC_ID,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,2,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,0,1,2,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,0,2,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,1,2,4,2,3,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
email_lengths = full_train_features.sum(axis=1)
print(email_lengths.shape)
email_lengths[-5:]

(4015,)


5789    5845
5790    5899
5791    5838
5794    5882
5795    5848
dtype: int64

In [17]:
total_wc = email_lengths.sum()
total_wc

12285838

# Total tokens in spam and ham emails

In [18]:
spam_lengths = email_lengths[full_train_data.CATEGORY == 1]
spam_lengths.shape 

(1250,)

In [19]:
spam_wc = spam_lengths.sum()
spam_wc

1405008

In [20]:
ham_lengths = email_lengths[full_train_data.CATEGORY == 0]
print(ham_lengths.shape)
nonspam_wc = ham_lengths.sum()
nonspam_wc

(2765,)


10880830

In [21]:
print(email_lengths.shape[0] - spam_lengths.shape[0] - ham_lengths.shape[0])
total_wc - spam_wc - nonspam_wc

0


0

In [22]:
print('Avg spam ', spam_wc / spam_lengths.shape[0])
print('Avg ham ', nonspam_wc / ham_lengths.shape[0])

Avg spam  1124.0064
Avg ham  3935.2007233273057


# Summing Tokens

In [23]:
train_spam_tokens = full_train_features.loc[full_train_data.CATEGORY == 1]
print(train_spam_tokens.head())
train_spam_tokens.shape

   DOC_ID  0  1  2  3  4  5  6  7  8  ...  2490  2491  2492  2493  2494  2495  \
0       0  2  0  0  0  0  0  0  1  0  ...     0     0     0     0     0     0   
1       1  2  0  1  2  1  1  0  0  0  ...     0     0     0     0     0     0   
2       2  2  0  2  0  0  3  0  0  0  ...     0     0     0     0     0     0   
3       3  3  0  0  1  0  1  1  0  0  ...     0     0     0     0     0     0   
4       4  0  0  0  1  2  4  2  3  1  ...     0     0     0     0     0     0   

   2496  2497  2498  2499  
0     0     0     0     0  
1     0     0     0     0  
2     0     0     0     0  
3     0     0     0     0  
4     0     0     0     0  

[5 rows x 2501 columns]


(1250, 2501)

In [24]:
summed_spam_tokens = train_spam_tokens.sum(axis=0) + 1 
summed_spam_tokens.shape 

(2501,)

In [25]:
train_ham_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]
print(train_ham_tokens.shape)
summed_ham_tokens = train_ham_tokens.sum(axis=0) + 1 
print(summed_ham_tokens.shape)
# train_ham_tokens.head()
summed_ham_tokens.tail()

(2765, 2501)
(2501,)


2495    13
2496    19
2497     1
2498    16
2499    35
dtype: int64

# Spam Probability P(Token | Spam)

In [26]:
prob_tokens_spam = summed_spam_tokens / (spam_wc + VOCAB_SIZE)
print(prob_tokens_spam[:5])
prob_tokens_spam.sum()

DOC_ID    0.859224
0         0.001274
1         0.000688
2         0.000963
3         0.001491
dtype: float64


1.0000007104755353

# Ham probability P(Token | Ham)

In [27]:
prob_tokens_ham = summed_ham_tokens / (nonspam_wc + VOCAB_SIZE)
print(prob_tokens_ham[:5])
prob_tokens_ham.sum()

DOC_ID    0.976777
0         0.000485
1         0.000230
2         0.000187
3         0.000087
dtype: float64


1.0000000918836423

# P(Token)

In [28]:
prob_tokens_all = full_train_features.sum(axis=0) / total_wc 
prob_tokens_all.sum()

1.0

# Save Trained Model

In [51]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_tokens_ham)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

# Save Testing Model

In [29]:
%%time 
full_test_data = make_full_matrix(sparce_test_data, nr_words=VOCAB_SIZE)

Wall time: 1min 21s


In [32]:
full_test_data.tail()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
5783,5783.0,0.0,2.0,2.0,2.0,,,2.0,,,...,,,,,,,,,,
5786,5786.0,0.0,2.0,1.0,1.0,1.0,,,1.0,,...,,,,,,,,,,
5788,5788.0,0.0,1.0,1.0,,,,,,,...,,,,,,,,,,
5792,5792.0,0.0,2.0,,,,,,,,...,,,,,,,,,,
5793,5793.0,0.0,1.0,,1.0,,,1.0,,,...,,,,,,,,,,


In [33]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY'] 
y_test = full_test_data.CATEGORY 

In [34]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)