# **Imports and downloads** 

In [190]:
import math
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Reading and splitting data**
Here we read the main csv file and split data randomly to train and validation datasets (80% training data)

In [191]:
df_total = pd.read_csv('train_test.csv')
df_train, df_val = train_test_split(df_total, test_size=0.2, shuffle=True)

# **Preprocessing**
Here we apply some preprocessing on our data to get better results in classification.
Preprocessings include:
- lowercasing all characters
- tokenization
- stemming the tokens

In [192]:
def preprocess_string(s):
    s = s.lower()                       # only use lowercase characters
    s = ' '.join(s.split())             # remove extra whitespace
    tokens = word_tokenize(s)           # tokenization
    
    # remove stopwords
    # swords = stopwords.words('english')
    # nonstop_tokens = []
    # for token in tokens:
    #     if token not in swords:
    #         nonstop_tokens.append(token)
    # tokens = nonstop_tokens

    # remove punctuations
    regexp_tokenizer = RegexpTokenizer(r'\w+')
    tokens = regexp_tokenizer.tokenize(' '.join(tokens))

    stemmer = PorterStemmer()
    final_tokens = [stemmer.stem(t) for t in tokens]

    s = ' '.join(final_tokens)
    return s


Here we show a sample text and its normalized form.

In [193]:
old_string = df_train.sample().iloc[0]['text']
new_string = preprocess_string(old_string)
print('Sample text:', old_string)
print('Normalized text:', new_string)

Sample text: No drama Pls.i have had enough from you and family while i am struggling in the hot sun in a strange place.No reason why there should be an ego of not going 'IF NOT INVITED' when actually its necessity to go.wait for very serious reppurcussions.
Normalized text: no drama pl i have had enough from you and famili while i am struggl in the hot sun in a strang place no reason whi there should be an ego of not go if not invit when actual it necess to go wait for veri seriou reppurcuss


In [194]:
def preprocess_data(df):
    new_df = df.copy()
    new_df['text'] = new_df['text'].apply(preprocess_string)
    return new_df

df_train = preprocess_data(df_train)
df_val = preprocess_data(df_val)

# **Training naive bayes parameters**
For each email, the feature vector consists of all bigrams and unigrams present in that mail.
Then we find the parameters of naive bayes classifier based on the training data.
This is how we act:
1. Loop through all emails.
2. Count the number of all bigrams and unigrams in each class (spam and ham)
3. Count the number of spam and ham emails (use it for prior probability)

To determine the start and end of emails we use 2 special tokens: "START" and "END" 

So our feature vector for each email is basically a **vector that determines whether each bigram is present** in this instance or not.

In [195]:
total_bigrams = {'spam': 0, 'ham': 0}
total_unigrams = {'spam': 0, 'ham': 0}
bigram_counts = {'spam': {}, 'ham': {}}
unigram_counts = {'spam': {}, 'ham': {}}
num_spam = 0
num_ham = 0
vocab = {'<START>', '<END>'}

for _, row in df_train.iterrows():
    text = row['text']
    type = row['type']
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        continue
    bigram_counts[type][('<START>', tokens[0])] = bigram_counts[type].get(('<START>', tokens[0]), 0) + 1
    unigram_counts[type]['<START>'] = unigram_counts[type].get('<START>', 0) + 1
    unigram_counts[type]['<END>'] = unigram_counts[type].get('<END>', 0) + 1
    total_bigrams[type] += 1
    total_unigrams[type] += 2
    for i in range(len(tokens)):
        unigram_counts[type][tokens[0]] = unigram_counts[type].get(tokens[0], 0) + 1
        if i == len(tokens) - 1:
            bigram_counts[type][(tokens[i], '<END>')] = bigram_counts[type].get((tokens[i], '<END>'), 0) + 1
        else:
            bigram_counts[type][(tokens[i], tokens[i+1])] = bigram_counts[type].get((tokens[i], tokens[i+1]), 0) + 1
        total_bigrams[type] += 1
        total_unigrams[type] += 1
        vocab.add(tokens[i])
    if type == 'spam':
        num_spam += 1
    else:
        num_ham += 1


Here we visualize how this feature choosing can lead to good classifier. As it can be seen in the next 2 cells frequent bigrams in spam<br /> and non-spam messages could help detecting spam messages. This is the reason that we chose bigrams as our feature representation. 

In [211]:
freq_spam = {k: v for k, v in sorted(bigram_counts['spam'].items(), key=lambda item: item[1])[-10:][::-1]}
print('Top ten most frequenc bigrams in spam messages')
freq_spam


Top ten most frequenc bigrams in spam messages


{('you', 'have'): 47,
 ('a', 'å'): 46,
 ('co', 'uk'): 41,
 ('<START>', 'you'): 40,
 ('to', 'claim'): 38,
 ('have', 'won'): 38,
 ('<START>', 'urgent'): 35,
 ('your', 'mobil'): 30,
 ('thi', 'is'): 29,
 ('pleas', 'call'): 29}

In [212]:
freq_ham = {k: v for k, v in sorted(bigram_counts['ham'].items(), key=lambda item: item[1])[-10:][::-1]}
print('Top ten most frequenc bigrams in ham messages')
freq_ham


Top ten most frequenc bigrams in ham messages


{('<START>', 'i'): 397,
 ('i', 'm'): 271,
 ('n', 't'): 240,
 ('lt', 'gt'): 182,
 ('i', 'll'): 144,
 ('are', 'you'): 129,
 ('<START>', 'ok'): 106,
 ('i', 'am'): 104,
 ('have', 'a'): 93,
 ('do', 'n'): 91}

# **Validation**
During inference we used the parameters calculated before to determine the probability of being spam or ham.
<br />Then we report Accuracy, Precision, Recall on our validation set. 

In [177]:
def calculate_probability(tokens, bigrams, unigrams, prior, alpha):
    prob = 0
    prob += math.log((bigrams.get(('<START>', tokens[0]), 0) + alpha)/(unigrams.get('<START>', 0) + alpha * len(vocab)))
    for i in range(len(tokens)):
        if i == len(tokens) - 1:
            prob += math.log((bigrams.get((tokens[i], '<END>'), 0) + alpha) / (
                unigrams.get(tokens[i], 0) + alpha * len(vocab)))
        else:
            prob += math.log((bigrams.get((tokens[i], tokens[i+1]), 0) + alpha) / (
                unigrams.get(tokens[i], 0) + alpha * len(vocab)))
    prob += math.log(prior)
    return prob


In [188]:
predicted_labels = np.zeros(len(df_val))
true_labels = ((df_val['type'] == 'spam') * 1).values

i = 0
for _, row in df_val.iterrows():
    tokens = word_tokenize(row['text'])
    if len(tokens) == 0:
        continue
    prob_spam = calculate_probability(tokens, bigram_counts['spam'], unigram_counts['spam'], num_spam/(num_spam + num_ham), 1)
    prob_ham = calculate_probability(tokens, bigram_counts['ham'], unigram_counts['ham'], num_ham/(num_spam + num_ham), 1)
    if prob_spam > prob_ham:
        predicted_labels[i] = 1
    i += 1

In [189]:
TP = np.sum(predicted_labels * true_labels)
FP = np.sum(predicted_labels * (1-true_labels))
FN = np.sum((1-predicted_labels) * true_labels)
TN = np.sum((1-predicted_labels) * (1-true_labels))

precision = TP/(TP + FP)
recall = TP/(TP + FN)
accuracy = (TP + TN)/(TP + FP + TN + FN)

print('Precision:', precision)
print('Recall:', recall)
print('Accuracy:', accuracy)

Precision: 1.0
Recall: 0.8435374149659864
Accuracy: 0.9773399014778326


**Handling overfitting** <br />
3 cells back we defined a function called calculate_probability. This function takes a parameter called "alpha" as its input.<br />
This parameter is used for smoothing. Without it if a given message has a bigram not seen before in training data it will assign 0 <br /> probability to that message which is unrealistic and indicates that our model has overfitted the training data. <br />
So with this parameter we are doing some kind of regularization to increase our model's capability to generalize to unseen data.

# **Creating output on test data**
A file consisting of test data is given. <br />
We compute our predictions on this test set and save the results in a csv file.

In [143]:
df_test = pd.read_csv('evaluate.csv')
results = []
for _, row in df_test.iterrows():
    idx = row['id']
    tokens = word_tokenize(preprocess_string(row['text']))
    if len(tokens) == 0:
        results.append((idx, 0))
        continue
    prob_spam = calculate_probability(
        tokens, bigram_counts['spam'], unigram_counts['spam'], num_spam/(num_spam + num_ham))
    prob_ham = calculate_probability(
        tokens, bigram_counts['ham'], unigram_counts['ham'], num_ham/(num_spam + num_ham))
    if prob_spam > prob_ham:
        results.append((idx, 'spam'))
    else:
        results.append((idx, 'ham'))
    
df_results = pd.DataFrame(results, columns=['id', 'type'])
df_results.to_csv('output.csv', index=False)