# Feature Extraction: this notebook parses the raw email files and writes the extracted features (file name, volume, subject, body, ham/spam label) to `spam_x.csv`

In [27]:
pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
# Imported here so that this cell also runs on a fresh kernel, where the main
# import cell further down has not been executed yet.
import nltk

# Fetch the NLTK resources used later: the stop-word lists and the 'punkt'
# tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/NSD/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/NSD/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
%matplotlib inline
import os
import sys
import numpy
from pandas import DataFrame
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

In [8]:
# Create the output file and write the column names.
# Run this only once: mode 'w' truncates an existing spam_x.csv.
with open('spam_x.csv', 'w', newline='') as csv_file:
    header_row = ["file_name", "volume", "subject", "body", "ham/spam"]
    csv.writer(csv_file).writerow(header_row)

In [9]:
# Class labels; one of them is written as the last column of every CSV row.
HAM, SPAM = 'ham', 'spam'
NEWLINE = '\n'
# Pattern for one HTML tag: '<', at least one non-'>' character, then '>'.
CLEAN = re.compile(r'<[^>]+>')
# NLTK's English stop words, held in a set for membership tests.
SW_NLTK = set(stopwords.words('english'))

# (directory, label) pairs, processed in this order by load_data.
SOURCES = [
    ('data/spam', SPAM),
    ('data/easy_ham', HAM),
    ('data/hard_ham', HAM),
    ('data/beck-s', HAM),
    ('data/farmer-d', HAM),
    ('data/kaminski-v', HAM),
    ('data/kitchen-l', HAM),
    ('data/lokay-m', HAM),
    ('data/williams-w3', HAM),
    ('data/BG', SPAM),
    ('data/GP', SPAM),
    ('data/SH', SPAM),
]

# File names that read_files skips while walking the directories.
SKIP_FILES = {'cmds'}

# extracts and preprocesses the subject from the header
def extract_subject(header):
    """Return the preprocessed subject found in an email header block.

    Parameters
    ----------
    header : str
        Raw header text, one header field per line.

    Returns
    -------
    str or None
        The value of the first ``Subject:`` field after ``preprocess_body``
        has been applied to it, or ``None`` when no such field exists.
    """
    prefix = "subject:"
    for line in header.splitlines():
        lowered = line.lower()
        # Anchor the match at the start of the line: a substring test would
        # also accept fields such as "X-Subject:" or any header value that
        # merely mentions "subject:".
        if lowered.startswith(prefix):
            # Slice the field name off instead of replacing 'subject: '
            # everywhere, so "Subject:foo" (no space) is handled and a
            # subject that itself contains "subject: " is left intact.
            return preprocess_body(lowered[len(prefix):])
    return None
    
def preprocess_body(body):
    """Normalise raw email text into a space-separated string of stemmed tokens.

    Steps, in order: collapse whitespace, drop digits, strip URLs (in any
    letter case), strip HTML tags, lowercase, tokenise, drop English stop
    words, Porter-stem, and remove punctuation.

    Parameters
    ----------
    body : str
        Raw text (an email body or a subject line).

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``""`` when nothing
        survives.
    """
    # collapse every run of whitespace (including newlines) into one space
    body = " ".join(body.split())

    # remove numbers
    body = ''.join(ch for ch in body if not ch.isdigit())

    # remove urls; case-insensitive so "HTTP://..." is caught as well, since
    # lowercasing only happens further down
    body = re.sub(r'http\S+', '', body, flags=re.IGNORECASE)

    # remove all HTML tags
    body = CLEAN.sub('', body)

    # to lowercase
    body = body.lower()

    # apply stop words removal, stemming and punctuation removal
    porter = PorterStemmer()
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stems = []
    for word in word_tokenize(body):
        if word in SW_NLTK:
            continue
        # Strip punctuation per token and skip tokens that end up empty
        # (e.g. "." or "--"); stripping the joined string instead would
        # leave doubled and trailing spaces behind.
        stem = porter.stem(word).translate(strip_punctuation)
        if stem:
            stems.append(stem)
    return " ".join(stems)


def read_files(path):
    """Yield ``(file_path, body, subject)`` for every email file under ``path``.

    Each file is split at its first empty line: the lines before it form the
    header, the lines after it form the body.

    Parameters
    ----------
    path : str
        Root directory; all of its sub-directories are visited as well.

    Yields
    ------
    tuple
        ``(file_path, body, subject)`` where ``body`` is the raw body text
        and ``subject`` is the result of ``extract_subject`` on the header
        (``None`` when the header has no subject).
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue

            header, body = [], []
            past_header = False
            # latin-1 maps every byte to a character, so decoding cannot fail;
            # the with-block closes the file even if reading raises.
            with open(file_path, encoding="latin-1") as f:
                for line in f:
                    if past_header:
                        body.append(line)
                    elif line == NEWLINE:
                        past_header = True
                    else:
                        header.append(line)

            # The lines still carry their trailing newline, so concatenate
            # them directly; joining with NEWLINE would double every line
            # break and inflate the length of the body.
            header = "".join(header)
            body = "".join(body)
            subject = extract_subject(header)  # get the subject from header
            yield file_path, body, subject


def load_file(path, classification):
    """Append one CSV row per email found under ``path`` to ``spam_x.csv``.

    Each row holds: file path, raw body length, preprocessed subject,
    preprocessed body, and ``classification``.

    Parameters
    ----------
    path : str
        Directory to scan (passed to ``read_files``).
    classification : str
        Label written in the last column of every row.
    """
    # Open the output once for the whole directory rather than once per
    # email. newline='' lets the csv module control the line terminators and
    # matches how the header row of this file is written.
    with open('spam_x.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        for file_name, body, subject in read_files(path):
            # the volume is measured on the raw body, before preprocessing
            volume = len(body)
            writer.writerow([file_name, volume, subject, preprocess_body(body), classification])
            

def load_data():
    """Run ``load_file`` for every ``(path, classification)`` pair in ``SOURCES``."""
    for path, classification in SOURCES:
        load_file(path, classification)

In [10]:
# Run the full extraction. load_file appends to spam_x.csv, so running this
# cell again without first re-creating the file adds every row a second time.
load_data()