# Preprocessing Data for NLP 

In [91]:
import pandas as pd 
import re

In [92]:
# Load the labelled e-mail corpus, keep only the message text and the spam
# label, and drop every row in which either of the two is missing.
# The path is a raw string: in a normal literal "\e" is an invalid escape
# sequence, which current Python versions report as a SyntaxWarning.
data = pd.read_csv(r"data\emails.csv")
data = data[['text', 'spam']]
data = data.dropna()


In [None]:
# Plain-text dump of the cleaned frame (pandas abbreviates the middle rows).
print(data)

                                                   text spam
0     Subject: naturally irresistible your corporate...    1
1     Subject: the stock trading gunslinger  fanny i...    1
2     Subject: unbelievable new homes made easy  im ...    1
3     Subject: 4 color printing special  request add...    1
4     Subject: do not have money , get software cds ...    1
...                                                 ...  ...
5725  Subject: re : research and development charges...    0
5726  Subject: re : receipts from visit  jim ,  than...    0
5727  Subject: re : enron case study update  wow ! a...    0
5728  Subject: re : interest  david ,  please , call...    0
5729  Subject: news : aurora 5 . 2 update  aurora ve...    0

[5728 rows x 2 columns]


### Tokenisation of words 

In [None]:
def tokenisation(data):
    """Split every e-mail in ``data['text']`` into a list of word tokens.

    Each character that is not an ASCII letter, a digit or whitespace is
    deleted, then the remaining text is split on whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``text`` column holds lists of tokens.
        The frame passed in is left untouched, so the function can be
        re-run on the same input.
    """
    data = data.copy()
    # Normalise missing / non-string cells *before* the regex runs;
    # re.sub() raises TypeError when handed NaN or None.
    text = data['text'].fillna('').astype(str)
    cleaned = text.apply(lambda words: re.sub(r'[^A-Za-z0-9\s]', '', words))
    data['text'] = cleaned.str.split()
    return data

                                                   text spam
0     [Subject, naturally, irresistible, your, corpo...    1
1     [Subject, the, stock, trading, gunslinger, fan...    1
2     [Subject, unbelievable, new, homes, made, easy...    1
3     [Subject, 4, color, printing, special, request...    1
4     [Subject, do, not, have, money, get, software,...    1
...                                                 ...  ...
5725  [Subject, re, research, and, development, char...    0
5726  [Subject, re, receipts, from, visit, jim, than...    0
5727  [Subject, re, enron, case, study, update, wow,...    0
5728  [Subject, re, interest, david, please, call, s...    0
5729  [Subject, news, aurora, 5, 2, update, aurora, ...    0

[5728 rows x 2 columns]


### Testing of Tokenisation

### Remove Stop Words

In [95]:
# Stop-word vocabulary (all lower case): common English function words plus
# e-mail boilerplate such as "subject", "re", "fw", "http" and "www".
# remove_stopwords() lower-cases each token before looking it up here.
stop_words = {"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", 
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", 
    "between", "both", "but", "by", "can", "did", "do", "does", "doing", "down", 
    "during", "each", "few", "for", "from", "further", "had", "has", "have", 
    "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", 
    "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me", 
    "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", 
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", 
    "over", "own", "same", "she", "should", "so", "some", "such", "than", 
    "that", "the", "their", "theirs", "them", "themselves", "then", "there", 
    "these", "they", "this", "those", "through", "to", "too", "under", "until", 
    "up", "very", "was", "we", "were", "what", "when", "where", "which", 
    "while", "who", "whom", "why", "will", "with", "you", "your", "yours", 
    "yourself", "yourselves", "subject", "re", "fw", "fwd", "http", "https", "www", "com", "email", 
    "click", "please", "thank", "thanks"}

def remove_stopwords(data,stop_words):
    """Drop stop words from every token list in ``data['text']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``text`` column holds lists of string tokens.
    stop_words : collection of str
        Lower-case words to discard. Tokens are lower-cased for the lookup
        only; the tokens that survive keep their original casing.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the filtered token lists. The input frame is
        not modified, which also avoids assigning into a column slice.
    """
    data = data.copy()
    data['text'] = data['text'].apply(
        lambda words: [w for w in words if w.lower() not in stop_words]
    )
    return data

# Demonstrate the stop-word filter on a tokenised *copy* of the corpus.
# remove_stopwords() expects token lists; applied to the raw strings it would
# iterate each e-mail character by character. `data` itself is left as raw
# text because the later cells hand it to preprocessing(), which tokenises it.
data_no_stopwords = remove_stopwords(tokenisation(data.copy()), stop_words)

### Testing of Stop Words

### Stemming of words 

In [96]:
def _longest_first(rules):
    """Return the (suffix, replacement) pairs of ``rules``, longest suffix first."""
    return sorted(rules.items(), key=lambda rule: len(rule[0]), reverse=True)


# Porter suffix tables. They are stored longest suffix first so that the first
# suffix that matches a word is also the longest one that matches it.
_STEP2_RULES = _longest_first({
    "ational": "ate", "tional": "tion", "enci": "ence", "anci": "ance",
    "izer": "ize", "bli": "ble", "alli": "al", "entli": "ent", "eli": "e",
    "ousli": "ous", "ization": "ize", "ation": "ate", "ator": "ate",
    "alism": "al", "iveness": "ive", "fulness": "ful", "ousness": "ous",
    "aliti": "al", "iviti": "ive", "biliti": "ble", "logi": "log",
})

_STEP3_RULES = _longest_first({
    "icate": "ic", "ative": "", "alize": "al", "iciti": "ic",
    "ical": "ic", "ful": "", "ness": "",
})

# "ion" also belongs to step 4 but carries an extra condition, so it is
# handled separately inside PoterStem().
_STEP4_RULES = _longest_first({
    "al": "", "ance": "", "ence": "", "er": "", "ic": "",
    "able": "", "ible": "", "ant": "", "ement": "",
    "ment": "", "ent": "", "ou": "", "ism": "",
    "ate": "", "iti": "", "ous": "", "ive": "", "ize": "",
})


def _replace_longest_suffix(word, rules, min_measure):
    """Apply the longest matching suffix rule of ``rules`` to ``word``.

    The suffix is replaced only when the measure of the remaining stem is
    greater than ``min_measure``. Once a suffix has matched, shorter suffixes
    are not tried, even if the measure test fails.
    """
    for suffix, replacement in rules:
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            if measure(stem) > min_measure:
                return stem + replacement
            return word
    return word


def _tidy_after_ed_ing(stem):
    """Repair a stem from which "ed" or "ing" was just removed (step 1b)."""
    if stem.endswith(("at", "bl", "iz")):
        return stem + "e"          # conflat(ed) -> conflate
    ends_in_double = len(stem) >= 2 and stem[-1] == stem[-2]
    # A doubled final consonant is reduced to one letter, except l, s and z
    # (hopp -> hop, but fall / miss / fizz stay as they are).
    if ends_in_double and stem[-1] not in "aeioulsz":
        return stem[:-1]
    if measure(stem) == 1 and cvc(stem):
        return stem + "e"          # fil(ing) -> file
    return stem


def PoterStem(word):
    """Reduce ``word`` to its stem with the Porter suffix-stripping steps.

    The word is lower-cased first because every suffix rule below is written
    in lower case. Words of one or two letters are returned as they are so a
    short token can never be stripped down to an empty string.

    The helper predicates ``measure``, ``has_vowel`` and ``cvc`` treat only
    a, e, i, o and u as vowels.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The lower-case stem.
    """
    word = word.lower()
    if len(word) <= 2:
        return word

    # Step 1a: plurals
    if word.endswith("sses"):
        word = word[:-4] + "ss"
    elif word.endswith("ies"):
        word = word[:-3] + "i"
    elif word.endswith("s") and not word.endswith("ss"):
        word = word[:-1]

    # Step 1b: past tense and gerund
    if word.endswith("eed"):
        if measure(word[:-3]) > 0:
            word = word[:-3] + "ee"
    elif word.endswith("ed"):
        if has_vowel(word[:-2]):
            word = _tidy_after_ed_ing(word[:-2])
    elif word.endswith("ing"):
        if has_vowel(word[:-3]):
            word = _tidy_after_ed_ing(word[:-3])

    # Step 1c: terminal y -> i when the stem contains a vowel
    if word.endswith("y") and has_vowel(word[:-1]):
        word = word[:-1] + "i"

    # Steps 2 and 3: map double suffixes to single ones (stem measure > 0)
    word = _replace_longest_suffix(word, _STEP2_RULES, 0)
    word = _replace_longest_suffix(word, _STEP3_RULES, 0)

    # Step 4: remove one residual suffix (stem measure > 1). "ion" is only
    # removed when the stem ends in s or t, and it is an alternative to the
    # table lookup, never applied in addition to it.
    if word.endswith("ion"):
        stem = word[:-3]
        if stem.endswith(("s", "t")) and measure(stem) > 1:
            word = stem
    else:
        word = _replace_longest_suffix(word, _STEP4_RULES, 1)

    # Step 5a: remove a final 'e'
    if word.endswith("e"):
        stem = word[:-1]
        if measure(stem) > 1:
            word = stem
        elif measure(stem) == 1 and not cvc(stem):
            word = stem

    # Step 5b: reduce a final double 'l'
    if word.endswith("ll") and measure(word) > 1:
        word = word[:-1]

    return word


def measure(word):
    """Return the Porter measure of ``word``.

    The measure is the number of places where a vowel (a, e, i, o, u) is
    immediately followed by a non-vowel, i.e. the number of vowel runs that
    are closed by a consonant.
    """
    vowels = "aeiou"
    return sum(
        1
        for previous, current in zip(word, word[1:])
        if previous in vowels and current not in vowels
    )


def has_vowel(word):
    """Tell whether at least one character of ``word`` is a, e, i, o or u."""
    return any(letter in ("a", "e", "i", "o", "u") for letter in word)


def cvc(word):
    """
    Check the last three characters of ``word`` for the pattern
    consonant - vowel - consonant, with the closing consonant not being
    w, x or y. Words shorter than three characters never match.
    """
    if len(word) < 3:
        return False

    vowels = "aeiou"
    opener, middle, closer = word[-3:]

    return (
        opener not in vowels
        and middle in vowels
        and closer not in vowels
        and closer not in "wxy"
    )

def run_porter(data):
    """Stem every token in ``data['text']`` with ``PoterStem``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``text`` column holds lists of string tokens.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the stemmed token lists; the frame passed in
        is not modified.
    """
    data = data.copy()
    data['text'] = data['text'].apply(lambda words: [PoterStem(w) for w in words])
    return data

In [97]:
### Testing of Stemming

### Lemmatisation

In [98]:
# Lookup tables for irregular forms. They live at module level so they are
# built once rather than on every lemmatise() call.
_IRREGULAR_VERBS = {
    'was': 'be', 'were': 'be', 'been': 'be', 'being': 'be', 'am': 'be', 'are': 'be', 'is': 'be',
    'had': 'have', 'has': 'have', 'having': 'have',
    'did': 'do', 'does': 'do', 'doing': 'do', 'done': 'do',
    'went': 'go', 'gone': 'go', 'going': 'go', 'goes': 'go',
    'said': 'say', 'says': 'say', 'saying': 'say',
    'made': 'make', 'makes': 'make', 'making': 'make',
    'took': 'take', 'takes': 'take', 'taken': 'take', 'taking': 'take',
    'came': 'come', 'comes': 'come', 'coming': 'come',
    'saw': 'see', 'seen': 'see', 'sees': 'see', 'seeing': 'see',
    'got': 'get', 'gets': 'get', 'gotten': 'get', 'getting': 'get',
    'gave': 'give', 'given': 'give', 'gives': 'give', 'giving': 'give',
    'found': 'find', 'finds': 'find', 'finding': 'find',
    'told': 'tell', 'tells': 'tell', 'telling': 'tell',
    'became': 'become', 'becomes': 'become', 'becoming': 'become',
    'left': 'leave', 'leaves': 'leave', 'leaving': 'leave',
    'felt': 'feel', 'feels': 'feel', 'feeling': 'feel',
    'brought': 'bring', 'brings': 'bring', 'bringing': 'bring',
    'began': 'begin', 'begun': 'begin', 'begins': 'begin', 'beginning': 'begin',
    'kept': 'keep', 'keeps': 'keep', 'keeping': 'keep',
    'held': 'hold', 'holds': 'hold', 'holding': 'hold',
    'wrote': 'write', 'written': 'write', 'writes': 'write', 'writing': 'write',
    'stood': 'stand', 'stands': 'stand', 'standing': 'stand',
    'ran': 'run', 'runs': 'run', 'running': 'run',
    'bought': 'buy', 'buys': 'buy', 'buying': 'buy',
    'spoke': 'speak', 'spoken': 'speak', 'speaks': 'speak', 'speaking': 'speak',
}

# 'leaves' is deliberately absent: the verb table is consulted first and
# already maps it to 'leave', so a plural entry for it could never be reached.
_IRREGULAR_PLURALS = {
    'children': 'child', 'men': 'man', 'women': 'woman', 'people': 'person',
    'feet': 'foot', 'teeth': 'tooth', 'geese': 'goose', 'mice': 'mouse',
    'oxen': 'ox', 'sheep': 'sheep', 'deer': 'deer', 'fish': 'fish',
    'lives': 'life', 'wives': 'wife', 'knives': 'knife',
}


def _strip_inflection(word, suffix_len):
    """Remove the last ``suffix_len`` characters of ``word`` if a usable stem remains.

    ``word`` is returned unchanged when the stem would be shorter than three
    characters or would contain no vowel ("king", "red", "ing"). A doubled
    final consonant is reduced to one letter ("stopp" -> "stop") unless it is
    l, s or z ("call", "miss") or the stem is only three letters long ("add").
    """
    stem = word[:-suffix_len]
    if len(stem) < 3 or not any(ch in "aeiou" for ch in stem):
        return word
    if len(stem) > 3 and stem[-1] == stem[-2] and stem[-1] not in "aeioulsz":
        return stem[:-1]
    return stem


def lemmatise(word):
    """Return a rule-based lemma for ``word``.

    The word is lower-cased, looked up in the irregular verb and plural
    tables, and otherwise run through a fixed sequence of suffix rules
    (``ing``, ``ed``, ``ies``, ``es``, ``s``, ``ly``, ``er``, ``est``); the
    first rule that applies decides the result.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The lower-case lemma, or the lower-cased word when no rule applies.
    """
    word = word.lower()

    if word in _IRREGULAR_VERBS:
        return _IRREGULAR_VERBS[word]

    if word in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[word]

    if word.endswith('ing'):
        return _strip_inflection(word, 3)

    if word.endswith('ed'):
        return _strip_inflection(word, 2)

    if word.endswith('ies') and len(word) > 4:
        return word[:-3] + 'y'

    if word.endswith('es') and len(word) > 3:
        if word[-3] in 'sxz' or word[-4:-2] in ['ch', 'sh']:
            return word[:-2]

    # Plain plural 's'. Endings 'ss', 'us' and 'is' are kept because there the
    # 's' is usually part of the word itself (business, status, this).
    if word.endswith('s') and len(word) > 2 and not word.endswith(('ss', 'us', 'is')):
        return word[:-1]

    if word.endswith('ly') and len(word) > 4:
        return word[:-2]

    if word.endswith('er') and len(word) > 4:
        return _strip_inflection(word, 2)

    if word.endswith('est') and len(word) > 5:
        return _strip_inflection(word, 3)

    return word

def run_lemmantisation(data):
    """Lemmatise every token in ``data['text']`` with ``lemmatise``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``text`` column holds lists of string tokens.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the lemmatised token lists; the frame passed
        in is not modified.
    """
    data = data.copy()
    data['text'] = data['text'].apply(lambda words: [lemmatise(w) for w in words])
    return data


### Testing of Lemmatisation

In [99]:
# Spot-check the lemmatiser on a few irregular and regular forms,
# printing one lemma per line.
testing_data = ["seen", "saw","came","come","coolest", "saw"]
print(*(lemmatise(sample) for sample in testing_data), sep="\n")


see
see
come
come
cool
see


# Writing of NLP

In [100]:
def preprocessing(data,if_lemma=True):
    """Run the full text-cleaning pipeline on an e-mail frame.

    Steps: tokenise ``text``, remove stop words (module-level ``stop_words``),
    then normalise every token with either the lemmatiser or the stemmer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a raw-string ``text`` column and a ``spam`` column.
    if_lemma : bool, default True
        True  -> normalise tokens with ``run_lemmantisation``;
        False -> normalise tokens with ``run_porter``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``text`` (list of tokens) and ``spam``.
    """
    # Work on a private copy of the two needed columns: the pipeline steps
    # assign into the frame they receive, and the caller's raw text must stay
    # intact so the same frame can be preprocessed again.
    data = data[['text', 'spam']].copy()
    data = tokenisation(data)
    data = remove_stopwords(data,stop_words)
    if if_lemma:
        data = run_lemmantisation(data)
    else:
        data = run_porter(data)

    return data
    
    
    

In [101]:
def ask_user():
    """Prompt until the user types 1 or 2 and report the choice as a bool.

    Returns
    -------
    bool
        True when the answer was "1" (lemmatisation), False when it was "2"
        (stemming). Surrounding whitespace in the answer is ignored.
    """
    prompt = "Select 1 for lemmantisation or 2 for stemming: "
    answer = input(prompt).strip()
    while answer not in ("1", "2"):
        print("please answer 1 or 2")
        answer = input(prompt).strip()
    return answer == "1"

In [None]:
from collections import Counter
import json
def _is_spam_label(label):
    """Return True when ``label`` marks a spam message.

    Accepts the label as a number (1, 1.0) or as text ("1"); anything that
    cannot be read as a number counts as not spam.
    """
    try:
        return float(label) == 1
    except (TypeError, ValueError):
        return False


def nlp(data,alpha):
    """Build a per-word spam dictionary and write it to ``spam_dictionary.json``.

    The user is asked (via ``ask_user``) whether to lemmatise or stem, the
    corpus is preprocessed, and token counts are collected separately for spam
    and ham messages. For every word seen at least ``min_count`` times the
    function stores its spam count, its ham count and P(spam | word), computed
    with Bayes' rule from add-``alpha`` smoothed word likelihoods.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a raw-string ``text`` column and a ``spam`` label column.
    alpha : float
        Additive smoothing constant; should be positive so that no word
        likelihood is zero.

    Raises
    ------
    ValueError
        If the preprocessed corpus contains no messages.
    """

    # setting params
    min_count = 5   # words seen fewer times than this are left out of the dictionary
    ngram = 1       # 1 = single tokens; larger values count joined n-grams instead

    # preprocess data
    choice = ask_user()
    dat = preprocessing(data,choice)

    count_spam = Counter()
    count_ham = Counter()
    msg_count_spam = 0
    msg_count_ham = 0

    # processing the data
    for tokens, label in zip(dat['text'], dat['spam']):
        if ngram > 1:
            tokens = make_ngrams(tokens,ngram)

        # The label may be stored as an integer or as text depending on how
        # the CSV was parsed, so it is not compared against the string "1" only.
        if _is_spam_label(label):
            msg_count_spam += 1
            count_spam.update(tokens)
        else:
            msg_count_ham += 1
            count_ham.update(tokens)

    total_messages = msg_count_spam + msg_count_ham
    if total_messages == 0:
        raise ValueError("nlp() needs at least one message to estimate the spam prior")

    # calculate totals and prior probabilities
    total_tokens_spam = sum(count_spam.values())
    total_tokens_ham = sum(count_ham.values())

    # The vocabulary is every word seen in either class.
    vocab = set(count_spam) | set(count_ham)
    V = len(vocab)

    P_spam_prior = msg_count_spam / total_messages

    # building the dictionary
    dictionary = {}
    for w in vocab:
        Cs = count_spam[w]
        Ch = count_ham[w]

        if Cs + Ch < min_count:
            continue

        p_w_given_spam = (Cs + alpha) / (total_tokens_spam + alpha * V)
        p_w_given_ham = (Ch + alpha) / (total_tokens_ham + alpha * V)

        p_spam_given_w = (p_w_given_spam * P_spam_prior) / \
                     (p_w_given_spam * P_spam_prior + p_w_given_ham * (1 - P_spam_prior))

        dictionary[w] = {
            "count_spam": Cs,
            "count_ham": Ch,
            "p_spam": round(p_spam_given_w, 6)
        }

    # Export
    with open("spam_dictionary.json", "w", encoding="utf-8") as f:
        json.dump(dictionary, f, ensure_ascii=False, indent=2)
    

def make_ngrams(tokens, n=1):
    """Join every run of ``n`` consecutive tokens with an underscore.

    For ``n == 1`` the token list itself is returned unchanged; otherwise a
    new list with one ``"a_b_..."`` string per window is built.
    """
    if n == 1:
        return tokens
    window_starts = range(len(tokens) - n + 1)
    return ['_'.join(tokens[start:start + n]) for start in window_starts]

# Build the spam dictionary from the loaded corpus with a smoothing constant of 2.
if __name__ == "__main__":
    nlp(data, alpha=2)

### Run Tests against Spam Words Dictionary

In [None]:
import pandas as pd

# Reload the corpus from disk, keep the text and label columns and drop rows
# with a missing value in either of them.
data = pd.read_csv(r"data\emails.csv")
data = data[['text', 'spam']]
data = data.dropna()

# Take one e-mail as a plain dict. NOTE: .loc is label-based, so this is the
# row whose index label is 1, not the row at position 0.
text_val_data = data.loc[1].to_dict()

# Wrap that single e-mail in a one-row DataFrame with the same two columns
text_val = pd.DataFrame({
    "text": [text_val_data['text']],
    "spam": [text_val_data['spam']]
})



In [110]:
import json

def model(fit):
    """Classify the e-mail(s) in ``fit`` using ``spam_dictionary.json``.

    The text is preprocessed with the default settings of ``preprocessing``,
    the stored ``p_spam`` of every token found in the dictionary is collected
    across all rows, and the mean of those values is compared with 0.5.

    Parameters
    ----------
    fit : pandas.DataFrame
        Frame with a raw-string ``text`` column and a ``spam`` column.

    Returns
    -------
    str
        "SPAM" when the mean spam probability is above 0.5, otherwise
        "NOT SPAM". When none of the tokens is in the dictionary there is no
        evidence to average, and "NOT SPAM" is returned.
    """
    with open("spam_dictionary.json", "r", encoding="utf-8") as file:
        dictionary = json.load(file)

    # Hand preprocessing a copy so that `fit` keeps its raw text and the same
    # frame can be classified again.
    test_val = preprocessing(fit.copy())

    list_of_probs = []
    for tokens in test_val['text']:
        for w in tokens:
            try:
                list_of_probs.append(dictionary[w]['p_spam'])
            except KeyError:
                # Token is not in the dictionary: it carries no evidence.
                continue

    # Without this guard the mean below would divide by zero.
    if not list_of_probs:
        return "NOT SPAM"

    prop_val_mean = sum(list_of_probs)/len(list_of_probs)
    if prop_val_mean > 0.5:
        return "SPAM"
    else:
        return "NOT SPAM"

# Classify the single held-out e-mail and show the verdict.
if __name__ == "__main__":
    print(model(fit=text_val))
                

NOT SPAM
