# Document Classifier (Spam or Not Spam)

## Setup

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [2]:
# Fetch the NLTK data packages this notebook relies on. "punkt" provides the
# tokenizer models used by word_tokenize.
nltk.download("punkt")
# NOTE(review): no tagger call appears in the cells shown here — presumably
# this package is needed for POS tagging later on; confirm or drop it.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Load the dataset

In [3]:
# Read the e-mail corpus, supplying the two column names explicitly:
# "class" and "content".
df = pd.read_csv(
    "datasets/datasets/email/csv/spam-apache.csv",
    names=["class", "content"],
)
# Tokenize every message with NLTK and store the token lists in a new column.
df["tokens"] = df["content"].apply(word_tokenize)

In [4]:
# Inspect the token list produced for the first message.
df["tokens"].values[0]

['<',
 '!',
 'DOCTYPE',
 'HTML',
 'PUBLIC',
 '``',
 '-//W3C//DTD',
 'HTML',
 '4.0',
 'Transitional//EN',
 "''",
 '>',
 '<',
 'HTML',
 '>',
 '<',
 'HEAD',
 '>',
 '<',
 'META',
 'http-equiv=Content-Type',
 'content=',
 "''",
 'text/html',
 ';',
 'charset=iso-8859-1',
 "''",
 '>',
 '<',
 'META',
 'content=',
 "''",
 'MSHTML',
 '6.00.2600.0',
 "''",
 'name=GENERATOR',
 '>',
 '<',
 'STYLE',
 '>',
 '<',
 '/STYLE',
 '>',
 '<',
 '/HEAD',
 '>',
 '<',
 'BODY',
 'bgColor=',
 '#',
 'ffffff',
 '>',
 '<',
 'DIV',
 '>',
 '<',
 'FONT',
 'face=Arial',
 'size=2',
 '>',
 '<',
 'FONT',
 'face=',
 "''",
 'Times',
 'New',
 'Roman',
 "''",
 'size=3',
 '>',
 'Dear',
 'Friend',
 ',',
 '<',
 'BR',
 '>',
 '<',
 'BR',
 '>',
 'A',
 'recent',
 'survey',
 'by',
 'Nielsen/Netratings',
 'says',
 'that',
 '``',
 'The',
 'Internet',
 '<',
 'BR',
 '>',
 'population',
 'is',
 'rapidly',
 'approaching',
 'a',
 "'Half",
 'a',
 'Billion',
 "'",
 'people',
 '!',
 '``',
 '<',
 'BR',
 '>',
 '<',
 'BR',
 '>',
 'SO',
 'WHAT',
 'D

In [5]:
# Flatten the per-message token lists into one stream and count how often
# each token occurs across the whole corpus.
corpus_tokens = (token for message in df["tokens"].values for token in message)
all_words = nltk.FreqDist(corpus_tokens)
# Keep the 100 most frequent (token, count) pairs as the feature vocabulary.
top_words = all_words.most_common(100)
top_words

[(',', 2173),
 ('.', 2171),
 ('the', 1967),
 ('>', 1787),
 ('--', 1611),
 ('to', 1435),
 (':', 1220),
 ('*', 1149),
 ('and', 1064),
 ('of', 958),
 ('a', 879),
 ('you', 744),
 ('in', 742),
 ('I', 741),
 ('<', 718),
 ('!', 698),
 ('%', 677),
 ('for', 609),
 ('is', 578),
 ('#', 521),
 ('BR', 494),
 ('that', 479),
 (')', 463),
 ('it', 458),
 ("''", 434),
 ('$', 413),
 ('this', 384),
 ('(', 380),
 ('on', 378),
 ('http', 362),
 ('?', 360),
 ('your', 359),
 ('have', 351),
 ('with', 334),
 ('``', 307),
 ('be', 299),
 ('-', 289),
 ('from', 271),
 ("'s", 263),
 ('are', 257),
 ('31', 255),
 ('or', 252),
 ('as', 251),
 ('will', 243),
 ('not', 226),
 ('30', 220),
 ('my', 206),
 ('at', 199),
 ('The', 198),
 ('has', 195),
 ('can', 194),
 ('&', 181),
 ('all', 176),
 ("n't", 175),
 ('do', 167),
 ('out', 166),
 ('but', 164),
 ('...', 160),
 ('our', 160),
 ('by', 156),
 ('if', 152),
 ('was', 149),
 ('one', 129),
 ('an', 129),
 ('just', 128),
 ('@', 128),
 ('This', 125),
 ('1', 123),
 ('If', 118),
 ('more

In [6]:
def document_features(document, vocabulary=None):
    """Build boolean "contains" features for one tokenized document.

    Args:
        document: Iterable of tokens belonging to a single document.
        vocabulary: Optional iterable of ``(word, count)`` pairs naming the
            words to test for. Only the word is used; the count is ignored.
            When omitted, the module-level ``top_words`` list is used, which
            matches the previous behaviour.

    Returns:
        A dict mapping ``"contains [<word>]"`` to ``True`` when ``<word>``
        occurs in ``document`` and ``False`` otherwise, with one entry per
        vocabulary word in vocabulary order.
    """
    if vocabulary is None:
        # Fall back to the corpus-wide vocabulary computed earlier in the file.
        vocabulary = top_words

    # A set makes each membership test O(1) instead of scanning the token list.
    document_words = set(document)
    return {
        f"contains [{word}]": word in document_words
        for word, _ in vocabulary
    }

In [7]:
# Preview the feature dictionary generated for the first message.
document_features(df["tokens"].values[0])

{'contains [,]': True,
 'contains [.]': True,
 'contains [the]': True,
 'contains [>]': True,
 'contains [--]': True,
 'contains [to]': True,
 'contains [:]': True,
 'contains [*]': True,
 'contains [and]': True,
 'contains [of]': True,
 'contains [a]': True,
 'contains [you]': True,
 'contains [in]': True,
 'contains [I]': True,
 'contains [<]': True,
 'contains [!]': True,
 'contains [%]': True,
 'contains [for]': True,
 'contains [is]': True,
 'contains [#]': True,
 'contains [BR]': True,
 'contains [that]': True,
 'contains [)]': True,
 'contains [it]': True,
 "contains ['']": True,
 'contains [$]': True,
 'contains [this]': True,
 'contains [(]': True,
 'contains [on]': True,
 'contains [http]': False,
 'contains [?]': True,
 'contains [your]': True,
 'contains [have]': True,
 'contains [with]': True,
 'contains [``]': True,
 'contains [be]': True,
 'contains [-]': True,
 'contains [from]': True,
 "contains ['s]": True,
 'contains [are]': True,
 'contains [31]': False,
 'contains 

In [8]:
# Pair each message's feature dictionary with its class label, giving a list
# of (features, label) tuples.
fset = [
    (document_features(tokens), label)
    for tokens, label in zip(df["tokens"].values, df["class"].values)
]
fset