In [1]:
import pandas as pd
import numpy as np

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import os
import tarfile
from six.moves import urllib
import re

In [2]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()
        

In [None]:
hams = ["20021010_easy_ham.tar.bz2", "20021010_hard_ham.tar.bz2", 
        "20030228_easy_ham.tar.bz2", "20030228_easy_ham_2.tar.bz2",
        "20030228_hard_ham.tar.bz2"]
spams = ["20021010_spam.tar.bz2","20030228_spam.tar.bz2",
         "20030228_spam_2.tar.bz2","20050311_spam_2.tar.bz2"]

def fetch_ham_data(ham_url=SPAM_URL, ham_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [3]:
# fetch_spam_data() 

In [4]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")

In [5]:
ham_filenames = [name for name in os.listdir(HAM_DIR) if len(name) > 20]
spam_filenames = [name for name in os.listdir(SPAM_DIR) if len(name) > 20]
print(len(ham_filenames))
print(len(spam_filenames))
ham_filenames = ham_filenames
spam_filenames = spam_filenames

2500
500


In [6]:
import email
import email.policy as policy
from email.parser import BytesParser

Montando duas listas de e-mails, uma para cada fonte

In [7]:
def load_emails(filenames, directory):
    emails = []
    for fname in filenames:
        with open(directory + "/" + fname, mode="rb") as file:
            emails.append(BytesParser(policy=policy.default).parse(file))
    return emails

In [8]:
ham_emails = load_emails(ham_filenames, HAM_DIR)
spam_emails = load_emails(spam_filenames, SPAM_DIR)

Posso encapsular a estração das características em uma classe

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def content(self, email):
        words = []
        if type(email).__name__ == "EmailMessage":
            if email.is_multipart():
                words = words + self.content([e for e in email.get_payload()])
            else:
                try:
                    if type(email.get_content()).__name__ == "str":
                        new_words_raw = self.splitter.split(email.get_content())
                        # print(new_words_raw)
                        new_words = []
                        for word in new_words_raw:
                            if self.convert_to_lowercase:
                                word = word.lower()
                            if self.remove_punctuation:
                                word = punctuation_re.sub("", word)
                            if self.replace_numbers:
                                word = numbers_re.sub("NUMBER", word)
                            if self.replace_urls:
                                word = urls_re.sub("URL", word)

                            new_words.append(word)    

                        words += new_words
                except:
                    print('')

        return words
    
    def fetch_words(self, emails=[]):
        content_ = []
        for e in emails:
            content_ += self.content(e)
        return content_
    
    def feature_vector(self, email, word_dict):
        from collections import OrderedDict
        occurrences = OrderedDict()
        email_content = self.content(email)
        for word in word_dict:
            occurrences[word] = 1 if word in email_content else 0
        return list(occurrences.values())    
    
    def build_features(self, emails=[]):
        data = []
        for email in emails:
            data.append(self.feature_vector(email, self.vocabulary))
        return data
    
    def get_vocabulary(self):
        return self.vocabulary
    
    def __init__(self, 
                 vocabulary=[],
                 remove_punctuation=False, 
                 replace_numbers=False, 
                 replace_urls=False, 
                 convert_to_lowercase=False):
        
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.replace_urls = replace_urls
        self.convert_to_lowercase = convert_to_lowercase
        self.vocabulary = vocabulary
        
        patterns = ["\"", "\'", "\.", "!", "\?", ":", ";", ",", "\(", "\)", 
                    "\*", "\#", "[-]{2,30}", "\|", "\[", "\]", "/", ">", "<", "[_]{2,100}"]
        self.punctuation_re = re.compile("|".join(patterns))
        self.numbers_re = re.compile("\d+")
        self.urls_re = re.compile("(www|http|https)+[^\s]+[\w]")
        self.splitter = re.compile("\s|\n|,")
        
    def fit(self, X, y=None):
        from collections import Counter
        if len(self.vocabulary) == 0:
            words = self.fetch_words(X)
            word_counter = Counter(words)
            minimum_accepted = int(len(X) * 0.1)
            selected_items = [(word, count) for (word, count) in word_counter.items() if count > minimum_accepted]
            self.vocabulary = [word for (word, _) in selected_items]
        return self
    
    def transform(self, X, y=None):
        return self.build_features(X)

In [38]:
extractor = FeatureExtractor(remove_punctuation=True, 
                             replace_numbers=True, 
                             replace_urls=True, 
                             convert_to_lowercase=True)

from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ("extractor", extractor)
])

ham_features = pipeline.fit_transform(ham_emails)
spam_features = pipeline.fit_transform(spam_emails)








In [60]:
ham_df = pd.DataFrame(ham_features, columns=extractor.get_vocabulary())
spam_df = pd.DataFrame(spam_features, columns=extractor.get_vocabulary())

In [61]:
ham_df['label'] = 0
spam_df['label'] = 1
df = pd.concat([ham_df, spam_df])

Separando os conjuntos de treino e teste

In [62]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=32)
train_set, valid_set = train_test_split(df, test_size=0.2, random_state=32)

In [63]:
y_train = train_set["label"]
X_train = train_set.drop("label", axis=1)

y_valid = valid_set["label"]
X_valid = valid_set.drop("label", axis=1)

y_test = test_set["label"]
X_test = test_set.drop("label", axis=1)

Testando alguns classificadores

In [77]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [78]:
lin_clf = SVC(gamma="scale")
lin_scores = cross_val_score(lin_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=3)
lin_y_pred = cross_val_predict(lin_clf, X_train, y_train, cv=3)
lin_rmse_scores = np.sqrt(-lin_scores)

In [79]:
tree_clf = DecisionTreeClassifier()
tree_scores = cross_val_score(tree_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=3)
tree_y_pred = cross_val_predict(tree_clf, X_train, y_train, cv=3)
tree_rmse_scores = np.sqrt(-tree_scores)

In [80]:
sgd_clf = SGDClassifier()
sgd_scores = cross_val_score(sgd_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=3)
sgd_y_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
sgd_rmse_scores = np.sqrt(-sgd_scores)

In [81]:
forest_clf = RandomForestClassifier(n_estimators=10)
forest_scores = cross_val_score(forest_clf, X_train, y_train, scoring="neg_mean_squared_error", cv=3)
forest_y_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
forest_rmse_scores = np.sqrt(-forest_scores)

In [82]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation ", scores.std())
    print("")

In [83]:
display_scores(lin_rmse_scores)
display_scores(tree_rmse_scores)
display_scores(sgd_rmse_scores)
display_scores(forest_rmse_scores)

Scores:  [0.2236068  0.18371173 0.21505813]
Mean:  0.20745888671159432
Standard deviation  0.017150616832032655

Scores:  [0.31622777 0.29154759 0.30618622]
Mean:  0.30465385953566676
Standard deviation  0.010133732613845032

Scores:  [0.25248762 0.25980762 0.22912878]
Mean:  0.24714134311405853
Standard deviation  0.013082680297981849

Scores:  [0.27386128 0.25248762 0.25248762]
Mean:  0.25961217522356234
Standard deviation  0.010075637731199715



In [90]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
def display_m_scores(clf, y, y_pred):
    acc = accuracy_score(y, y_pred)
    rec = recall_score(y, y_pred)
    pre = precision_score(y, y_pred)
    f1  = f1_score(y, y_pred)
    print("%s [accuracy=%.2f, recall=%.2f, precision=%.2f, f1=%.2f]" % (clf, acc, rec, pre, f1))

In [92]:
display_m_scores("svc", y_train, lin_y_pred)
display_m_scores("tree", y_train, tree_y_pred)
display_m_scores("sgd", y_train, sgd_y_pred)
display_m_scores("forest", y_train, forest_y_pred)

svc [accuracy=0.96, recall=0.93, precision=0.84, f1=0.88]
tree [accuracy=0.91, recall=0.86, precision=0.69, f1=0.77]
sgd [accuracy=0.94, recall=0.94, precision=0.77, f1=0.85]
forest [accuracy=0.93, recall=0.80, precision=0.80, f1=0.80]


Procurando hyperparametros melhores

In [106]:
def evaluate_search(search_obj, attributes):
    #cvres = search_obj.cv_results_
    print(search_obj.best_estimator_)
#     feature_importances = search_obj.best_estimator_.feature_importances_
#     for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#         print(np.sqrt(-mean_score), params)
#     feat_with_relevance = sorted(zip(feature_importances, attributes), reverse=True)
#     print(feat_with_relevance)

In [107]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
lin_param_dist = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "class_weight": ["balanced", None]
}
lin_rnd_search = RandomizedSearchCV(lin_clf, param_distributions=lin_param_dist, n_iter=5, cv=3)
lin_rnd_search.fit(X_train, y_train)  
evaluate_search(lin_rnd_search, extractor.get_vocabulary())

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [110]:
tree_param_dist = {
    "criterion": ["gini", "entropy"],
    "class_weight": ["balanced", None]
}
tree_rnd_search = RandomizedSearchCV(tree_clf, param_distributions=tree_param_dist, n_iter=4, cv=3)
tree_rnd_search.fit(X_train, y_train)
evaluate_search(tree_rnd_search, extractor.get_vocabulary())

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [114]:
sgd_param_dist = {
    "learning_rate": ["optimal", "adaptive", "invscaling"],
    "class_weight": ["balanced", None],
    "eta0": [0.1, 0.01, 0.001]
}
sgd_rnd_search = RandomizedSearchCV(sgd_clf, param_distributions=sgd_param_dist, n_iter=5, cv=3)
sgd_rnd_search.fit(X_train, y_train)
evaluate_search(sgd_rnd_search, extractor.get_vocabulary())

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
              l1_ratio=0.15, learning_rate='adaptive', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [116]:
forest_param_dist = {
    "n_estimators": [10, 50, 100, 200],
    "criterion": ["gini", "entropy"],
    "class_weight": ["balanced", None] 
}
forest_rnd_search = RandomizedSearchCV(forest_clf, param_distributions=forest_param_dist, n_iter=5, cv=3)
forest_rnd_search.fit(X_train, y_train)
evaluate_search(forest_rnd_search, extractor.get_vocabulary())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
