# NEGATIVE SELECTION ALGORITHM (NSA) FOR SPAM DETECTION

## Imports

In [1]:
import string
import nltk
import random
import abc

# Download the NLTK 'stopwords' package so the corpus below can be read.
nltk.download('stopwords')

# English stop words as a set; data_processing() uses it for membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/fbb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Start with the data extraction

In [2]:
def data_extraction(file_path="Data/SMSSpamCollection"):
    """
    Given a filepath, extract the data within the file.

    Args:
        file_path: Path of a UTF-8 text file to read.

    Returns:
        A list with one entry per line of the file, each stripped of
        leading/trailing whitespace. An empty list is returned (after
        printing an error message) when the file does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]
    except FileNotFoundError:
        # The message must use the parameter name `file_path`; the previous
        # spelling `filepath` was undefined and raised NameError here.
        print(f"ERROR: File not found at {file_path}")
        return []

In [3]:
def data_processing(data, stop_words):
    """
    Given the data, return a list of (label, text) tuples with cleaned text.

    Each input entry is expected to look like "<label>\\t<message>". The
    message is lower-cased, the character '\\x92' is normalised to an ASCII
    apostrophe, all ASCII punctuation is removed, stop words are dropped and
    the remaining words are joined with single spaces.

    Args:
        data: Iterable of raw "<label>\\t<message>" strings.
        stop_words: Container of words to remove (membership is tested
            with `in`).

    Returns:
        A list of (label, cleaned_text) tuples in input order. Blank
        entries are skipped.

    Raises:
        ValueError: If a non-blank entry contains no tab separator.
    """
    # Build the punctuation-removal table once instead of once per message.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    finished_data = []
    for sms in data:
        # Blank lines (e.g. a trailing empty line in the file) carry no record.
        if not sms.strip():
            continue

        # Split on the FIRST tab only so a tab inside the message body does
        # not break the (label, text) unpacking.
        label, separator, text = sms.partition("\t")
        if not separator:
            raise ValueError(f"Expected '<label>\\t<text>' but got: {sms!r}")

        text = text.lower()
        # Normalise the apostrophe BEFORE stripping punctuation; doing it
        # afterwards left these apostrophes in the text while ordinary ones
        # were removed.
        text = text.replace('\x92', "'")
        text = text.translate(strip_punctuation)

        words = [word for word in text.split() if word not in stop_words]
        finished_data.append((label, " ".join(words)))
    return finished_data

In [4]:
file_path = "Data/SMSSpamCollection"

# Keep the raw lines under their own name so `data` always refers to the
# processed (label, text) tuples that the later cells consume.
raw_lines = data_extraction(file_path)
print(raw_lines[0])
print(raw_lines[1])

data = data_processing(raw_lines, stop_words)
print(data[0])
print(data[1])

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
('ham', 'go jurong point crazy available bugis n great world la e buffet cine got amore wat')
('ham', 'ok lar joking wif u oni')


## Get some statistics

In [5]:
# Count the 'spam' labels; every other record is treated as harmless.
count_of_spam = sum(1 for label, _ in data if label == 'spam')
count_of_ham = len(data) - count_of_spam

print(f"The data contains {count_of_spam} spam messages")
print(f"The data contains {count_of_ham} harmless messages")

The data contains 747 spam messages
The data contains 4827 harmless messages


## The NSA Class

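The basic idea is to build detectors using only the *self* set, i.e. the messages that are not harmful (labelled 'ham'). Candidate detectors are random strings; each candidate is matched against the 'ham' data and kept only if it does not occur there, so the surviving detectors can only match non-self content.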

### The abstract class

In [6]:
class NSA(abc.ABC):
    """Abstract interface every negative-selection variant must implement.

    All four methods are abstract, so the class cannot be instantiated
    directly; a subclass has to override each of them.
    """

    @abc.abstractmethod
    def encode(self):
        """Hook for building the representation of the data."""

    @abc.abstractmethod
    def generate_detectors(self):
        """Hook for producing the detectors."""

    @abc.abstractmethod
    def run(self):
        """Hook for executing the algorithm."""

    @abc.abstractmethod
    def detect_with_detectors(self):
        """Hook for checking an input against the detectors."""

### K-Grams 

In [7]:
class k_gram_NSA(NSA):
    """
    Negative-selection detector working on character substrings of length r.

    The "self" set is every length-r substring found in messages that are not
    labelled 'spam'. Detectors are random length-r strings that do not occur
    in that set; a message is flagged when it contains any detector.
    """

    def __init__(self, r, seed=None):
        """
        Args:
            r: Length of the substrings and of the generated detectors.
                Must be at least 1.
            seed: Optional seed for a private random generator. When None
                (the default) the module-level `random` generator is used,
                so a global `random.seed(...)` still applies.

        Raises:
            ValueError: If r is smaller than 1.
        """
        if r < 1:
            raise ValueError(f"r must be a positive integer, got {r!r}")
        self.r = r
        # Number of detectors produced by the last generate_detectors() call;
        # initialised here so the attribute exists before the first call.
        self.len_detector = 0
        self._rng = random if seed is None else random.Random(seed)

    def encode(self, data):
        """
        Collect every length-r substring of the non-spam messages.

        Args:
            data: Iterable of (label, text) pairs.

        Returns:
            A set of strings, each of length r.
        """
        k_grams = set()
        for label, sms in data:
            if label == 'spam':
                continue
            # For messages shorter than r the range below is empty, so they
            # contribute nothing.
            for i in range(len(sms) - self.r + 1):
                k_grams.add(sms[i:i + self.r])
        return k_grams

    def random_string(self, alphabet):
        """Return a random string of length r drawn from `alphabet`."""
        return ''.join(self._rng.choice(alphabet) for _ in range(self.r))

    def generate_detectors(self, k_grams, num_detectors, alphabet=None, max_iters=1000):
        """
        Draw random candidates and keep those that are not in `k_grams`.

        Args:
            k_grams: Collection of "self" substrings to avoid.
            num_detectors: Number of distinct detectors wanted.
            alphabet: Characters to draw from; defaults to lowercase ASCII
                letters plus digits.
            max_iters: Maximum number of candidates to draw.

        Returns:
            A sorted list of distinct detectors. It may hold fewer than
            `num_detectors` entries, in which case a warning is printed.
        """
        if alphabet is None:
            alphabet = string.ascii_lowercase + string.digits
        detectors = set()
        iters = 0
        while len(detectors) < num_detectors and iters < max_iters:
            iters += 1
            cand = self.random_string(alphabet)
            if cand not in k_grams:
                detectors.add(cand)
        if len(detectors) < num_detectors:
            print("Warning: could only generate", len(detectors), "detectors after", iters, "iters.")
        self.len_detector = len(detectors)
        # Sorting gives a stable order; iterating a set of strings directly
        # depends on per-process hash randomisation.
        return sorted(detectors)

    def detect_with_detectors(self, message, detectors):
        """
        Check whether `message` contains any of the detectors.

        Returns:
            (True, detector) for the first detector found in the message,
            otherwise (False, None).
        """
        for d in detectors:
            if d in message:
                return True, d
        return (False, None)

    def run(self, data, num_detectors=5000, max_iters=10000):
        """
        Build detectors from `data` and apply them to every message in it.

        Note that the detectors are derived from the non-spam messages of the
        same `data` they are then applied to, so no non-spam message can
        contain a detector: every length-r substring of such a message is in
        the "self" set that detectors are filtered against.

        Args:
            data: Iterable of (label, text) pairs; it is traversed twice.
            num_detectors: Number of detectors to aim for.
            max_iters: Maximum number of random candidates to draw.

        Returns:
            A list of (label, (flagged, detector_or_None)) entries, one per
            message, in input order.
        """
        k_grams = self.encode(data)
        detectors = self.generate_detectors(k_grams, num_detectors, max_iters=max_iters)
        anomalies = []
        for label, sms in data:
            anomalies.append((label, self.detect_with_detectors(sms, detectors)))
        return anomalies

### K-Grams with binary encoding

In [None]:
#### TO BE IMPLEMENTED

### Model Statistics

In [15]:
def print_model_stats(accuracy, precision, recall, f1_score):
    """Print the four metrics, one per line, in a fixed order."""
    lines = [
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1-Score: {f1_score}",
    ]
    print("\n".join(lines))

def print_mean_model_stats(stats):
    """Print averaged metrics from a 4-item sequence.

    `stats` is unpacked as (accuracy, precision, recall, f1_score).
    """
    mean_accuracy, mean_precision, mean_recall, mean_f1 = stats
    lines = [
        f"Mean Accuracy: {mean_accuracy}",
        f"Mean Precision: {mean_precision}",
        f"Mean Recall: {mean_recall}",
        f"Mean F1-Score: {mean_f1}",
    ]
    print("\n".join(lines))


def model_stats(results):
    """
    Compute accuracy, precision, recall and F1-score with 'spam' as positive.

    Args:
        results: Sequence of (label, pred) entries where `pred[0]` is truthy
            when the message was flagged.

    Returns:
        A tuple (accuracy, precision, recall, f1_score). Any metric whose
        denominator is zero (no flagged messages, no spam messages, both
        precision and recall zero, or an empty `results`) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for label, pred in results:
        is_spam = label == 'spam'
        if pred[0]:
            if is_spam:
                TP += 1
            else:
                FP += 1
        else:
            if is_spam:
                FN += 1
            else:
                TN += 1

    total = len(results)
    flagged = TP + FP
    actual_spam = TP + FN

    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / flagged if flagged else 0.0
    recall = TP / actual_spam if actual_spam else 0.0
    if precision + recall:
        f1_score = 2 * ((precision * recall) / (precision + recall))
    else:
        f1_score = 0.0

    return accuracy, precision, recall, f1_score

def mean_stats_calc(stats):
    """Average each metric over a sequence of 4-tuples.

    Every entry of `stats` is unpacked as (accuracy, precision, recall, f1).
    Returns the per-metric means as a tuple in that same order.
    """
    accuracies, precisions, recalls, f1_scores = [], [], [], []
    for acc, pre, rec, f1 in stats:
        accuracies.append(acc)
        precisions.append(pre)
        recalls.append(rec)
        f1_scores.append(f1)

    num_runs = len(stats)
    # sum() is kept for the totals so floating-point results are unchanged.
    return (
        sum(accuracies) / num_runs,
        sum(precisions) / num_runs,
        sum(recalls) / num_runs,
        sum(f1_scores) / num_runs,
    )

In [16]:
# Seed the module-level generator used for detector creation so the metrics
# printed below are the same on every "Restart & Run All".
SEED = 42
random.seed(SEED)

r_range = range(2,5)
test_runs = 5
mean_stats = []

for r in r_range:
    print(f"Running K-Grams NSA with r = {r}")
    stats_in_r = []
    # Each run draws a fresh set of random detectors; the metrics are averaged.
    for _ in range(test_runs):
        k_grams_nsa = k_gram_NSA(r)
        stats_in_r.append(model_stats(k_grams_nsa.run(data)))
    means = mean_stats_calc(stats_in_r)
    mean_stats.append(means)
    print_mean_model_stats(means)


Running K-Grams NSA with r = 2
Mean Accuracy: 0.97918909221385
Mean Precision: 1.0
Mean Recall: 0.8447121820615797
Mean F1-Score: 0.9158200290275762
Running K-Grams NSA with r = 3
Mean Accuracy: 0.957481162540366
Mean Precision: 1.0
Mean Recall: 0.6827309236947791
Mean F1-Score: 0.8102445118512727
Running K-Grams NSA with r = 4
Mean Accuracy: 0.8712235378543237
Mean Precision: 1.0
Mean Recall: 0.03908969210174029
Mean F1-Score: 0.07480918393062388
