# NEGATIVE SELECTION ALGORITHM(NSA) FOR SPAM DETECTION

## Imports

In [8]:
import string
import nltk
import random
import abc

nltk.download('stopwords')

stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/fbb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Start with the data extraction

In [2]:
def data_extraction(file_path="Data/SMSSpamCollection"):
    """
    Given a filepath, extract the data within the file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [line.strip() for line in f]
            return data
    except FileNotFoundError:
        print(f"ERROR: File not found at {filepath}")
        return []

In [3]:
def data_processing(data, stop_words):
    """
    Given the data, return tuple of data and label
    """
    finished_data =[]
    for sms in data:
        label, text = sms.split("\t")
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.replace('\x92', "'")
        words = text.split()
        words = [word for word in words if word not in stop_words]
        text = " ".join(words)
        finished_data.append((label, text))
    return finished_data

In [4]:
file_path = "Data/SMSSpamCollection"

data = data_extraction(file_path)
print(data[0])
print(data[1])

data = data_processing(data, stop_words)
print(data[0])
print(data[1])
#print(data[0:20])

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
('ham', 'go jurong point crazy available bugis n great world la e buffet cine got amore wat')
('ham', 'ok lar joking wif u oni')
[('ham', 'go jurong point crazy available bugis n great world la e buffet cine got amore wat'), ('ham', 'ok lar joking wif u oni'), ('spam', 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'), ('ham', 'u dun say early hor u c already say'), ('ham', 'nah dont think goes usf lives around though'), ('spam', 'freemsg hey darling 3 weeks word back id like fun still tb ok xxx std chgs send £150 rcv'), ('ham', 'even brother like speak treat like aids patent'), ('ham', 'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press 9 copy friends callertune'), ('spam', 'winner valued network customer selected

## Get some statistics

In [5]:
count_of_spam = 0
for label, _ in data:
    if label == 'spam':
        count_of_spam += 1
count_of_ham = len(data) - count_of_spam

print(f"The data contains {count_of_spam} spam messages")
print(f"The data contains {count_of_ham} harmless messages")

The data contains 747 spam messages
The data contains 4827 harmless messages


## The NSA Class

The basic idea is to create a detector based only on the self, so the strings which are not harmful. So the detector is trained on random strings, which are matched against the 'ham' labeled data. 

### The abstract class

In [12]:
class NSA(abc.ABC):

    @abc.abstractmethod
    def encode(self):
        pass

    @abc.abstractmethod
    def generate_detectors(self):
        pass

    @abc.abstractmethod
    def run(self):
        pass

    @abc.abstractmethod
    def detect_with_detectors(self):
        pass

### K-Grams 

In [18]:
class k_gram_NSA(NSA):

    def __init__(self, r):
        self.r = r

    def encode(self, data):
        k_grams = set()
        for label, sms in data:
            if label == 'spam':
                continue
            if len(sms) < self.r:
                continue
            for i in range(len(sms) - self.r + 1):
                k_grams.add(sms[i:i+self.r])
        return k_grams

    def random_string(self, alphabet):
        return ''.join(random.choice(alphabet) for _ in range(self.r))

    def generate_detectors(self, k_grams, num_detectors, alphabet=None, max_iters=1000):
        if alphabet == None:
            alphabet = string.ascii_lowercase + string.digits
        detectors = set()
        iters = 0
        while len(detectors) < num_detectors and iters < max_iters:
            iters += 1
            cand = self.random_string(alphabet)
            if cand not in k_grams:
                detectors.add(cand)
        if len(detectors) < num_detectors:
            print("Warning: could only generate", len(detectors), "detectors after", iters, "iters.")
        return list(detectors)

    def detect_with_detectors(self, message, detectors):
        for d in detectors:
            if d in message:
                return True, d
        return (False, None)    

    def run(self, data):
        k_grams = self.encode(data)
        detectors = self.generate_detectors(k_grams,5000, max_iters=10000)
        anomalies = []
        for label, sms in data:
            anomalies.append((label, self.detect_with_detectors(sms, detectors)))
        return anomalies

### K-Grams with binary encoding

In [None]:
#### TO BE IMPLEMENTED

### Model Statistics

In [16]:
def model_stats(results):
    pred_count = 0
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for result in results:
        label, pred = result
        if pred[0]:
            pred_count += 1
            if label == 'spam':
                TP += 1
            else:
                FP += 1
        if not pred[0]:
            if label == 'spam':
                FN += 1
            else:
                TN += 1
    print(f"Count of predicted Anomalies: {pred_count}")
    print(f"Count of True Positives: {TP}")
    print(f"Count of False Positives: {FP}")
    print(f"Count of True Negatives: {TN}")
    print(f"Count of False Negatives: {FN}")

    print(f"Accuracy: {(TP + TN) / len(results)}")
    print(f"Precision: {TP / (TP + FP)}")
    print(f"Recall: {TP / (TP + FN) }")
    print(f"F1-Score: {2 * ((TP / (TP + FP)) * (TP / (TP + FN)) / (TP / (TP + FP)) + (TP / (TP + FN)))}")

In [23]:
r_range = range(2,5)

for i in r_range:
    print(f"Running K-Grams NSA with r = {i}")
    k_grams_nsa = k_gram_NSA(i)
    model_stats(k_grams_nsa.run(data))

Running K-Grams NSA with r = 2
Count of predicted Anomalies: 631
Count of True Positives: 631
Count of False Positives: 0
Count of True Negatives: 4827
Count of False Negatives: 116
Accuracy: 0.97918909221385
Precision: 1.0
Recall: 0.8447121820615796
F1-Score: 3.3788487282463184
Running K-Grams NSA with r = 3
Count of predicted Anomalies: 520
Count of True Positives: 520
Count of False Positives: 0
Count of True Negatives: 4827
Count of False Negatives: 227
Accuracy: 0.959275206315034
Precision: 1.0
Recall: 0.6961178045515395
F1-Score: 2.784471218206158
Running K-Grams NSA with r = 4
Count of predicted Anomalies: 21
Count of True Positives: 21
Count of False Positives: 0
Count of True Negatives: 4827
Count of False Negatives: 726
Accuracy: 0.8697524219590959
Precision: 1.0
Recall: 0.028112449799196786
F1-Score: 0.11244979919678715
